summaryrefslogtreecommitdiff
path: root/llvm/lib/Target
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp24
-rw-r--r--llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp2
-rw-r--r--llvm/lib/Target/AArch64/AArch64Features.td7
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.cpp41
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp171
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h9
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.cpp324
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.h4
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.td14
-rw-r--r--llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp4
-rw-r--r--llvm/lib/Target/AArch64/AArch64Processors.td18
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp5
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterInfo.h2
-rw-r--r--llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td18
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td21
-rw-r--r--llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp2
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.cpp1
-rw-r--r--llvm/lib/Target/AArch64/AArch64SystemOperands.td5
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp113
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h8
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp8
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp6
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp11
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td33
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp294
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGISel.td12
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp119
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp90
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp24
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp93
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h9
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp60
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp108
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h23
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp41
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp234
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp19
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp42
-rw-r--r--llvm/lib/Target/AMDGPU/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/AMDGPU/FLATInstructions.td504
-rw-r--r--llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp46
-rw-r--r--llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h1
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.h10
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h2
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp18
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h1
-rw-r--r--llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp20
-rw-r--r--llvm/lib/Target/AMDGPU/SIFoldOperands.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp266
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp508
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp258
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.h29
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.td14
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstructions.td73
-rw-r--r--llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.h6
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.td1
-rw-r--r--llvm/lib/Target/AMDGPU/SISchedule.td21
-rw-r--r--llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp18
-rw-r--r--llvm/lib/Target/AMDGPU/SOPInstructions.td10
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h9
-rw-r--r--llvm/lib/Target/AMDGPU/VOP1Instructions.td42
-rw-r--r--llvm/lib/Target/AMDGPU/VOP2Instructions.td19
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3Instructions.td29
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3PInstructions.td392
-rw-r--r--llvm/lib/Target/AMDGPU/VOPInstructions.td26
-rw-r--r--llvm/lib/Target/ARM/ARMAsmPrinter.cpp6
-rw-r--r--llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp4
-rw-r--r--llvm/lib/Target/ARM/ARMFastISel.cpp11
-rw-r--r--llvm/lib/Target/ARM/ARMFeatures.td6
-rw-r--r--llvm/lib/Target/ARM/ARMFrameLowering.cpp4
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp17
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.h2
-rw-r--r--llvm/lib/Target/ARM/ARMInstrInfo.td25
-rw-r--r--llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp2
-rw-r--r--llvm/lib/Target/ARM/ARMPredicates.td4
-rw-r--r--llvm/lib/Target/ARM/ARMSubtarget.cpp9
-rw-r--r--llvm/lib/Target/ARM/ARMSubtarget.h1
-rw-r--r--llvm/lib/Target/ARM/ARMTargetMachine.cpp5
-rw-r--r--llvm/lib/Target/ARM/ARMTargetObjectFile.cpp1
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp7
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.h6
-rw-r--r--llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp39
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp11
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h3
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp19
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp6
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp58
-rw-r--r--llvm/lib/Target/AVR/README.md2
-rw-r--r--llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp2
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp11
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h3
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp2
-rw-r--r--llvm/lib/Target/DirectX/DXILDataScalarization.cpp9
-rw-r--r--llvm/lib/Target/DirectX/DXILFlattenArrays.cpp308
-rw-r--r--llvm/lib/Target/DirectX/DXILLegalizePass.cpp74
-rw-r--r--llvm/lib/Target/DirectX/DirectXTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp109
-rw-r--r--llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp9
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelLowering.cpp14
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelLowering.h3
-rw-r--r--llvm/lib/Target/Hexagon/HexagonPatterns.td5
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp19
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp34
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp19
-rw-r--r--llvm/lib/Target/Lanai/LanaiFrameLowering.cpp12
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td2
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp7
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp10
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td93
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td58
-rw-r--r--llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp43
-rw-r--r--llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h8
-rw-r--r--llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp2
-rw-r--r--llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp2
-rw-r--r--llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp2
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp2
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h31
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp9
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp274
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp12
-rw-r--r--llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp2
-rw-r--r--llvm/lib/Target/Mips/MipsAsmPrinter.cpp33
-rw-r--r--llvm/lib/Target/Mips/MipsAsmPrinter.h2
-rw-r--r--llvm/lib/Target/Mips/MipsBranchExpansion.cpp27
-rw-r--r--llvm/lib/Target/Mips/MipsCallingConv.td11
-rw-r--r--llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp13
-rw-r--r--llvm/lib/Target/Mips/MipsInstrFPU.td18
-rw-r--r--llvm/lib/Target/Mips/MipsInstrInfo.td1
-rw-r--r--llvm/lib/Target/Mips/MipsRegisterInfo.cpp7
-rw-r--r--llvm/lib/Target/Mips/MipsSEFrameLowering.cpp6
-rw-r--r--llvm/lib/Target/Mips/MipsSubtarget.h1
-rw-r--r--llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp24
-rw-r--r--llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h4
-rw-r--r--llvm/lib/Target/NVPTX/NVPTX.h4
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp51
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h5
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp331
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.h6
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.td140
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXIntrinsics.td196
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXUtilities.h2
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp7
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp9
-rw-r--r--llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp3
-rw-r--r--llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp12
-rw-r--r--llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp3
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp103
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h11
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h12
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp5
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h4
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp2
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp42
-rw-r--r--llvm/lib/Target/RISCV/RISCVFeatures.td15
-rw-r--r--llvm/lib/Target/RISCV/RISCVFrameLowering.cpp152
-rw-r--r--llvm/lib/Target/RISCV/RISCVFrameLowering.h3
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp175
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp104
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.h15
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrFormats.td4
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.cpp7
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.td54
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoP.td12
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td35
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td168
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td1
-rw-r--r--llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp436
-rw-r--r--llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp32
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedAndes45.td9
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td532
-rw-r--r--llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp26
-rw-r--r--llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp27
-rw-r--r--llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp1
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVBuiltins.td27
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp2
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp1
-rw-r--r--llvm/lib/Target/Sparc/Sparc.td7
-rw-r--r--llvm/lib/Target/Sparc/SparcISelLowering.cpp20
-rw-r--r--llvm/lib/Target/Sparc/SparcInstrCrypto.td98
-rw-r--r--llvm/lib/Target/Sparc/SparcInstrInfo.td5
-rw-r--r--llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp1
-rw-r--r--llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp2
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp6
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.h5
-rw-r--r--llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp6
-rw-r--r--llvm/lib/Target/TargetLoweringObjectFile.cpp29
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp2
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td4
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp1
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp1
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp20
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h1
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp67
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp6
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h2
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/X86/X86CallFrameOptimization.cpp6
-rw-r--r--llvm/lib/Target/X86/X86CallingConv.cpp31
-rw-r--r--llvm/lib/Target/X86/X86CallingConv.td5
-rw-r--r--llvm/lib/Target/X86/X86ExpandPseudo.cpp3
-rw-r--r--llvm/lib/Target/X86/X86FrameLowering.cpp8
-rw-r--r--llvm/lib/Target/X86/X86ISelDAGToDAG.cpp4
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp9
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.h2
-rw-r--r--llvm/lib/Target/X86/X86ISelLoweringCall.cpp15
-rw-r--r--llvm/lib/Target/X86/X86InstrPredicates.td2
-rw-r--r--llvm/lib/Target/X86/X86InterleavedAccess.cpp7
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.cpp5
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.h11
-rw-r--r--llvm/lib/Target/X86/X86TargetMachine.cpp6
-rw-r--r--llvm/lib/Target/X86/X86WinEHState.cpp4
-rw-r--r--llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp2
242 files changed, 5952 insertions, 3282 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 36f3a670808d..12fc976a70ea 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -598,6 +598,9 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
llvm_unreachable("Unsupported ElementSize");
}
+ // Preserve undef state until DOP's reg is defined.
+ unsigned DOPRegState = MI.getOperand(DOPIdx).isUndef() ? RegState::Undef : 0;
+
//
// Create the destructive operation (if required)
//
@@ -616,10 +619,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfxZero))
.addReg(DstReg, RegState::Define)
.addReg(MI.getOperand(PredIdx).getReg())
- .addReg(MI.getOperand(DOPIdx).getReg());
+ .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState);
// After the movprfx, the destructive operand is same as Dst
DOPIdx = 0;
+ DOPRegState = 0;
// Create the additional LSL to zero the lanes when the DstReg is not
// unique. Zeros the lanes in z0 that aren't active in p0 with sequence
@@ -638,8 +642,9 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
assert(DOPRegIsUnique && "The destructive operand should be unique");
PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfx))
.addReg(DstReg, RegState::Define)
- .addReg(MI.getOperand(DOPIdx).getReg());
+ .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState);
DOPIdx = 0;
+ DOPRegState = 0;
}
//
@@ -647,10 +652,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
//
DOP = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode))
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead));
+ DOPRegState = DOPRegState | RegState::Kill;
switch (DType) {
case AArch64::DestructiveUnaryPassthru:
- DOP.addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill)
+ DOP.addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState)
.add(MI.getOperand(PredIdx))
.add(MI.getOperand(SrcIdx));
break;
@@ -659,20 +665,20 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
case AArch64::DestructiveBinaryComm:
case AArch64::DestructiveBinaryCommWithRev:
DOP.add(MI.getOperand(PredIdx))
- .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill)
- .add(MI.getOperand(SrcIdx));
+ .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState)
+ .add(MI.getOperand(SrcIdx));
break;
case AArch64::DestructiveTernaryCommWithRev:
DOP.add(MI.getOperand(PredIdx))
- .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill)
+ .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState)
.add(MI.getOperand(SrcIdx))
.add(MI.getOperand(Src2Idx));
break;
}
if (PRFX) {
- finalizeBundle(MBB, PRFX->getIterator(), MBBI->getIterator());
transferImpOps(MI, PRFX, DOP);
+ finalizeBundle(MBB, PRFX->getIterator(), MBBI->getIterator());
} else
transferImpOps(MI, DOP, DOP);
@@ -1591,18 +1597,22 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
"Non-writeback variants of STGloop / STZGloop should not "
"survive past PrologEpilogInserter.");
case AArch64::STR_ZZZZXI:
+ case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 4);
case AArch64::STR_ZZZXI:
return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 3);
case AArch64::STR_ZZXI:
+ case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 2);
case AArch64::STR_PPXI:
return expandSVESpillFill(MBB, MBBI, AArch64::STR_PXI, 2);
case AArch64::LDR_ZZZZXI:
+ case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 4);
case AArch64::LDR_ZZZXI:
return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 3);
case AArch64::LDR_ZZXI:
+ case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2);
case AArch64::LDR_PPXI:
return expandSVESpillFill(MBB, MBBI, AArch64::LDR_PXI, 2);
diff --git a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
index 83804b4b09bc..21756177fc74 100644
--- a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
@@ -812,7 +812,7 @@ bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) {
if (skipFunction(Fn.getFunction()))
return false;
- TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
+ TII = ST.getInstrInfo();
TRI = ST.getRegisterInfo();
MachineLoopInfo &LI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 9973df865ea1..c1c1f0a1024d 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -840,6 +840,13 @@ def FeatureDisableFastIncVL : SubtargetFeature<"disable-fast-inc-vl",
"HasDisableFastIncVL", "true",
"Do not prefer INC/DEC, ALL, { 1, 2, 4 } over ADDVL">;
+// On most processors we want to avoid moving from WZR to vector registers
+// (relying on materializing 0 to a FPR and moving from there instead),
+// but on some (in-order) cores it's preferable to avoid the extra instruction instead.
+def FeatureUseWzrToVecMove : SubtargetFeature<"use-wzr-to-vec-move",
+ "UseWzrToVecMove", "true",
+ "Move from WZR to insert 0 into vector registers">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 666ff8bbab42..885f2a94f85f 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -731,8 +731,7 @@ void AArch64FrameLowering::resetCFIToInitialState(
MachineFunction &MF = *MBB.getParent();
const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
- const auto &TRI =
- static_cast<const AArch64RegisterInfo &>(*Subtarget.getRegisterInfo());
+ const auto &TRI = *Subtarget.getRegisterInfo();
const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
CFIInstBuilder CFIBuilder(MBB, MBB.begin(), MachineInstr::NoFlags);
@@ -1746,7 +1745,7 @@ static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII,
MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL) {
+ const DebugLoc &DL, bool NeedsWinCFI) {
// Shadow call stack epilog: ldr x30, [x18, #-8]!
BuildMI(MBB, MBBI, DL, TII.get(AArch64::LDRXpre))
.addReg(AArch64::X18, RegState::Define)
@@ -1755,6 +1754,10 @@ static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII,
.addImm(-8)
.setMIFlag(MachineInstr::FrameDestroy);
+ if (NeedsWinCFI)
+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::SEH_Nop))
+ .setMIFlag(MachineInstr::FrameDestroy);
+
if (MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo(MF))
CFIInstBuilder(MBB, MBBI, MachineInstr::FrameDestroy)
.buildRestore(AArch64::X18);
@@ -1899,13 +1902,15 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, TII->get(AArch64::PAUTH_PROLOGUE))
.setMIFlag(MachineInstr::FrameSetup);
}
- if (NeedsWinCFI)
- HasWinCFI = true; // AArch64PointerAuth pass will insert SEH_PACSignLR
+ // AArch64PointerAuth pass will insert SEH_PACSignLR
+ HasWinCFI |= NeedsWinCFI;
}
- if (MFnI.needsShadowCallStackPrologueEpilogue(MF))
+ if (MFnI.needsShadowCallStackPrologueEpilogue(MF)) {
emitShadowCallStackPrologue(*TII, MF, MBB, MBBI, DL, NeedsWinCFI,
MFnI.needsDwarfUnwindInfo(MF));
+ HasWinCFI |= NeedsWinCFI;
+ }
if (EmitCFI && MFnI.isMTETagged()) {
BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITMTETAGGED))
@@ -1990,8 +1995,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
"unexpected function without stack frame but with SVE objects");
// All of the stack allocation is for locals.
AFI->setLocalStackSize(NumBytes);
- if (!NumBytes)
+ if (!NumBytes) {
+ if (NeedsWinCFI && HasWinCFI) {
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
return;
+ }
// REDZONE: If the stack size is less than 128 bytes, we don't need
// to actually allocate.
if (canUseRedZone(MF)) {
@@ -2460,8 +2470,11 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock::iterator EpilogStartI = MBB.end();
auto FinishingTouches = make_scope_exit([&]() {
- if (AFI->needsShadowCallStackPrologueEpilogue(MF))
- emitShadowCallStackEpilogue(*TII, MF, MBB, MBB.getFirstTerminator(), DL);
+ if (AFI->needsShadowCallStackPrologueEpilogue(MF)) {
+ emitShadowCallStackEpilogue(*TII, MF, MBB, MBB.getFirstTerminator(), DL,
+ NeedsWinCFI);
+ HasWinCFI |= NeedsWinCFI;
+ }
if (EmitCFI)
emitCalleeSavedGPRRestores(MBB, MBB.getFirstTerminator());
if (AFI->shouldSignReturnAddress(MF)) {
@@ -2472,8 +2485,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
TII->get(AArch64::PAUTH_EPILOGUE))
.setMIFlag(MachineInstr::FrameDestroy);
}
- if (NeedsWinCFI)
- HasWinCFI = true; // AArch64PointerAuth pass will insert SEH_PACSignLR
+ // AArch64PointerAuth pass will insert SEH_PACSignLR
+ HasWinCFI |= NeedsWinCFI;
}
if (HasWinCFI) {
BuildMI(MBB, MBB.getFirstTerminator(), DL,
@@ -3030,9 +3043,11 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
ObjectOffset);
if (FPAfterSVECalleeSaves) {
- assert(-ObjectOffset > (int64_t)AFI->getSVECalleeSavedStackSize() &&
- "Math isn't correct for CSRs with FPAfterSVECalleeSaves");
FPOffset += StackOffset::getScalable(AFI->getSVECalleeSavedStackSize());
+ if (-ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize()) {
+ FPOffset += StackOffset::getFixed(AFI->getCalleeSavedStackSize());
+ SPOffset += StackOffset::getFixed(AFI->getCalleeSavedStackSize());
+ }
}
// Always use the FP for SVE spills if available and beneficial.
if (hasFP(MF) && (SPOffset.getFixed() ||
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f7de61f044a7..f026726c3f48 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1143,6 +1143,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
ISD::SIGN_EXTEND_INREG, ISD::CONCAT_VECTORS,
ISD::EXTRACT_SUBVECTOR, ISD::INSERT_SUBVECTOR,
ISD::STORE, ISD::BUILD_VECTOR});
+ setTargetDAGCombine(ISD::SMIN);
setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::LOAD);
@@ -2392,6 +2393,15 @@ static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
return false;
}
+bool isVectorizedBinOp(unsigned Opcode) {
+ switch (Opcode) {
+ case AArch64ISD::SQDMULH:
+ return true;
+ default:
+ return false;
+ }
+}
+
// isOpcWithIntImmediate - This method tests to see if the node is a specific
// opcode and that it has a immediate integer right operand.
// If so Imm will receive the value.
@@ -2600,6 +2610,12 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
break;
}
+ case AArch64ISD::MOVIshift: {
+ Known = KnownBits::makeConstant(
+ APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)
+ << Op->getConstantOperandVal(1)));
+ break;
+ }
case AArch64ISD::LOADgot:
case AArch64ISD::ADDlow: {
if (!Subtarget->isTargetILP32())
@@ -5512,7 +5528,8 @@ static SDValue optimizeIncrementingWhile(SDNode *N, SelectionDAG &DAG,
unsigned Op0 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 1 : 0;
unsigned Op1 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 2 : 1;
- if (!isa<ConstantSDNode>(N->getOperand(Op1)))
+ if (!N->getValueType(0).isScalableVector() ||
+ !isa<ConstantSDNode>(N->getOperand(Op1)))
return SDValue();
SDLoc DL(N);
@@ -6422,7 +6439,9 @@ bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
}
}
- return true;
+ EVT PreExtScalarVT = ExtVal->getOperand(0).getValueType().getScalarType();
+ return PreExtScalarVT == MVT::i8 || PreExtScalarVT == MVT::i16 ||
+ PreExtScalarVT == MVT::i32 || PreExtScalarVT == MVT::i64;
}
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
@@ -17138,7 +17157,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool AArch64TargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
@@ -17146,6 +17165,11 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
+ auto *LI = dyn_cast<LoadInst>(Load);
+ if (!LI)
+ return false;
+ assert(!Mask && "Unexpected mask on a load");
+
const DataLayout &DL = LI->getDataLayout();
VectorType *VTy = Shuffles[0]->getType();
@@ -17469,16 +17493,18 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- LoadInst *LI, ArrayRef<Value *> DeinterleavedValues) const {
- unsigned Factor = DeinterleavedValues.size();
+ Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
+ const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
if (Factor != 2 && Factor != 4) {
LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
return false;
}
+ auto *LI = dyn_cast<LoadInst>(Load);
+ if (!LI)
+ return false;
+ assert(!Mask && "Unexpected mask on a load\n");
- Value *FirstActive = *llvm::find_if(DeinterleavedValues,
- [](Value *V) { return V != nullptr; });
- VectorType *VTy = cast<VectorType>(FirstActive->getType());
+ VectorType *VTy = getDeinterleavedVectorType(DI);
const DataLayout &DL = LI->getModule()->getDataLayout();
bool UseScalable;
@@ -17506,6 +17532,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
Value *BaseAddr = LI->getPointerOperand();
+ Value *Result = nullptr;
if (NumLoads > 1) {
// Create multiple legal small ldN.
SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
@@ -17526,35 +17553,35 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
}
LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
}
- // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
- for (unsigned J = 0; J < Factor; ++J) {
- if (DeinterleavedValues[J])
- DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]);
- }
+
+ // Merge the values from different factors.
+ Result = PoisonValue::get(DI->getType());
+ for (unsigned J = 0; J < Factor; ++J)
+ Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
} else {
- Value *Result;
if (UseScalable)
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
- // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
- for (unsigned I = 0; I < Factor; I++) {
- if (DeinterleavedValues[I]) {
- Value *NewExtract = Builder.CreateExtractValue(Result, I);
- DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
- }
- }
}
+
+ // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
+ DI->replaceAllUsesWith(Result);
return true;
}
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
- StoreInst *SI, ArrayRef<Value *> InterleavedValues) const {
+ Instruction *Store, Value *Mask,
+ ArrayRef<Value *> InterleavedValues) const {
unsigned Factor = InterleavedValues.size();
if (Factor != 2 && Factor != 4) {
LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
return false;
}
+ StoreInst *SI = dyn_cast<StoreInst>(Store);
+ if (!SI)
+ return false;
+ assert(!Mask && "Unexpected mask on plain store");
VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
const DataLayout &DL = SI->getModule()->getDataLayout();
@@ -20119,8 +20146,9 @@ static SDValue performConcatVectorsCombine(SDNode *N,
// size, combine into an binop of two contacts of the source vectors. eg:
// concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
- DAG.getTargetLoweringInfo().isBinOp(N0Opc) && N0->hasOneUse() &&
- N1->hasOneUse()) {
+ (DAG.getTargetLoweringInfo().isBinOp(N0Opc) ||
+ isVectorizedBinOp(N0Opc)) &&
+ N0->hasOneUse() && N1->hasOneUse()) {
SDValue N00 = N0->getOperand(0);
SDValue N01 = N0->getOperand(1);
SDValue N10 = N1->getOperand(0);
@@ -20979,6 +21007,98 @@ static SDValue performBuildVectorCombine(SDNode *N,
return SDValue();
}
+// A special combine for the sqdmulh family of instructions.
+//
+// Matches
+//   smin(sra(mul(sext(v0), sext(v1)), EltBits - 1), SignedMax(EltBits))
+// with EltBits == 16 or 32 (the element width of v0/v1) and rewrites it to
+//   sext(sqdmulh(v0, v1))
+// Returns an empty SDValue when the node does not match.
+//
+// Only inputs that are already 64 or 128 bits wide are combined. A narrower
+// input (in practice v2i16) must not be widened by sign-extending its lanes:
+// SQDMULH on the wider lanes returns the high half of the doubled product at
+// the *wider* element size, which is not (a * b) >> (EltBits - 1).
+static SDValue trySQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
+
+  if (N->getOpcode() != ISD::SMIN)
+    return SDValue();
+
+  EVT DestVT = N->getValueType(0);
+
+  // Fixed-width vectors only; the scalar-size bound also keeps the
+  // getSExtValue() calls below within 64 bits.
+  if (!DestVT.isVector() || DestVT.getScalarSizeInBits() > 64 ||
+      DestVT.isScalableVector())
+    return SDValue();
+
+  ConstantSDNode *Clamp = isConstOrConstSplat(N->getOperand(1));
+
+  if (!Clamp)
+    return SDValue();
+
+  // The clamp constant selects the source element type: the signed maximum
+  // of i16 or of i32.
+  MVT ScalarType;
+  unsigned ShiftAmt = 0;
+  switch (Clamp->getSExtValue()) {
+  case (1ULL << 15) - 1:
+    ScalarType = MVT::i16;
+    ShiftAmt = 16;
+    break;
+  case (1ULL << 31) - 1:
+    ScalarType = MVT::i32;
+    ShiftAmt = 32;
+    break;
+  default:
+    return SDValue();
+  }
+
+  SDValue Sra = N->getOperand(0);
+  if (Sra.getOpcode() != ISD::SRA || !Sra.hasOneUse())
+    return SDValue();
+
+  ConstantSDNode *RightShiftVec = isConstOrConstSplat(Sra.getOperand(1));
+  if (!RightShiftVec)
+    return SDValue();
+  unsigned SExtValue = RightShiftVec->getSExtValue();
+
+  // The shift must be exactly one less than the source element width.
+  if (SExtValue != (ShiftAmt - 1))
+    return SDValue();
+
+  SDValue Mul = Sra.getOperand(0);
+  if (Mul.getOpcode() != ISD::MUL)
+    return SDValue();
+
+  SDValue SExt0 = Mul.getOperand(0);
+  SDValue SExt1 = Mul.getOperand(1);
+
+  if (SExt0.getOpcode() != ISD::SIGN_EXTEND ||
+      SExt1.getOpcode() != ISD::SIGN_EXTEND)
+    return SDValue();
+
+  EVT SExt0Type = SExt0.getOperand(0).getValueType();
+  EVT SExt1Type = SExt1.getOperand(0).getValueType();
+
+  // Both multiplicands must have the same 64- or 128-bit vector type whose
+  // element type matches the clamp. Sub-64-bit inputs are rejected rather
+  // than widened (see the function comment).
+  if (SExt0Type != SExt1Type || SExt0Type.getScalarType() != ScalarType ||
+      SExt0Type.getFixedSizeInBits() < 64 ||
+      SExt0Type.getFixedSizeInBits() > 128 || !SExt0Type.isPow2VectorType() ||
+      SExt0Type.getVectorNumElements() == 1)
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue V0 = SExt0.getOperand(0);
+  SDValue V1 = SExt1.getOperand(0);
+
+  SDValue SQDMULH =
+      DAG.getNode(AArch64ISD::SQDMULH, DL, V0.getValueType(), V0, V1);
+
+  // Widen the narrow result back to the type the SMIN produced.
+  return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, SQDMULH);
+}
+
+// Target combine entry point for ISD::SMIN. The only fold attempted is the
+// SQDMULH rewrite; an empty SDValue means nothing was changed.
+static SDValue performSMINCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue Combined = trySQDMULHCombine(N, DAG);
+  return Combined ? Combined : SDValue();
+}
+
static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDLoc DL(N);
@@ -26730,6 +26850,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performAddSubCombine(N, DCI);
case ISD::BUILD_VECTOR:
return performBuildVectorCombine(N, DCI, DAG);
+ case ISD::SMIN:
+ return performSMINCombine(N, DAG);
case ISD::TRUNCATE:
return performTruncateCombine(N, DAG, DCI);
case AArch64ISD::ANDS:
@@ -30286,6 +30408,7 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
return Op.getOpcode() == AArch64ISD::DUP ||
Op.getOpcode() == AArch64ISD::MOVI ||
+ Op.getOpcode() == AArch64ISD::MOVIshift ||
(Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
TargetLowering::isTargetCanonicalConstantNode(Op);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 65fe08e92c23..713793ec77da 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -211,18 +211,19 @@ public:
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(
- LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const override;
+ bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
+ IntrinsicInst *DI) const override;
bool lowerInterleaveIntrinsicToStore(
- StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override;
+ Instruction *Store, Value *Mask,
+ ArrayRef<Value *> InterleaveValues) const override;
bool isLegalAddImmediate(int64_t) const override;
bool isLegalAddScalableImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index c1474773faa7..bc57537ad5df 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -20,6 +20,7 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CFIInstBuilder.h"
#include "llvm/CodeGen/LivePhysRegs.h"
@@ -35,6 +36,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -2482,8 +2484,10 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
case AArch64::LDR_PXI:
case AArch64::LDR_ZXI:
case AArch64::LDR_ZZXI:
+ case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
case AArch64::LDR_ZZZXI:
case AArch64::LDR_ZZZZXI:
+ case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
case AArch64::LDRBBui:
case AArch64::LDRBui:
case AArch64::LDRDui:
@@ -2525,8 +2529,10 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
case AArch64::STR_PXI:
case AArch64::STR_ZXI:
case AArch64::STR_ZZXI:
+ case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
case AArch64::STR_ZZZXI:
case AArch64::STR_ZZZZXI:
+ case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
case AArch64::STRBBui:
case AArch64::STRBui:
case AArch64::STRDui:
@@ -4318,7 +4324,9 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
break;
// SVE
case AArch64::STR_ZZZZXI:
+ case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
case AArch64::LDR_ZZZZXI:
+ case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
Scale = TypeSize::getScalable(16);
Width = TypeSize::getScalable(16 * 4);
MinOffset = -256;
@@ -4332,7 +4340,9 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
MaxOffset = 253;
break;
case AArch64::STR_ZZXI:
+ case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
case AArch64::LDR_ZZXI:
+ case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
Scale = TypeSize::getScalable(16);
Width = TypeSize::getScalable(16 * 2);
MinOffset = -256;
@@ -5559,8 +5569,12 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Twov2d;
Offset = false;
- } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
- AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+ } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+ "Unexpected register store without SVE store instructions");
+ Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
+ StackID = TargetStackID::ScalableVector;
+ } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register store without SVE store instructions");
Opc = AArch64::STR_ZZXI;
@@ -5584,8 +5598,12 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Fourv2d;
Offset = false;
- } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
- AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+ } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+ "Unexpected register store without SVE store instructions");
+ Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
+ StackID = TargetStackID::ScalableVector;
+ } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register store without SVE store instructions");
Opc = AArch64::STR_ZZZZXI;
@@ -5736,8 +5754,12 @@ void AArch64InstrInfo::loadRegFromStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Twov2d;
Offset = false;
- } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
- AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+ } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+ "Unexpected register load without SVE load instructions");
+ Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
+ StackID = TargetStackID::ScalableVector;
+ } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register load without SVE load instructions");
Opc = AArch64::LDR_ZZXI;
@@ -5761,8 +5783,12 @@ void AArch64InstrInfo::loadRegFromStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Fourv2d;
Offset = false;
- } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
- AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+ } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+ "Unexpected register load without SVE load instructions");
+ Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
+ StackID = TargetStackID::ScalableVector;
+ } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register load without SVE load instructions");
Opc = AArch64::LDR_ZZZZXI;
@@ -6264,13 +6290,13 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
// LDRWui %0:sub_32<def,read-undef>, %stack.0
//
if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
- const TargetRegisterClass *FillRC;
+ const TargetRegisterClass *FillRC = nullptr;
switch (DstMO.getSubReg()) {
default:
- FillRC = nullptr;
break;
case AArch64::sub_32:
- FillRC = &AArch64::GPR32RegClass;
+ if (AArch64::GPR64RegClass.hasSubClassEq(getRegClass(DstReg)))
+ FillRC = &AArch64::GPR32RegClass;
break;
case AArch64::ssub:
FillRC = &AArch64::FPR32RegClass;
@@ -7327,6 +7353,9 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i32:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i16:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i8:
return true;
} // end switch (Pattern)
return false;
@@ -7367,11 +7396,252 @@ static bool getMiscPatterns(MachineInstr &Root,
return false;
}
+/// Match a chain of single-lane loads (\p LoadLaneOpCode) that fills lanes
+/// 1 .. NumLanes-1 of a 128-bit vector whose lane 0 comes from a
+/// SUBREG_TO_REG, with \p Root being the load into the last lane.
+///
+/// On success the matching GATHER_LANE_* pattern is appended to \p Patterns
+/// and true is returned; otherwise \p Patterns is left untouched.
+///
+/// The matcher only accepts chains of exactly the shape that
+/// generateGatherPattern walks: NumLanes - 1 loads, each lane written once,
+/// ending in a SUBREG_TO_REG with a virtual source register.
+///
+/// NOTE(review): nothing here looks at instructions between the matched
+/// loads and Root. Confirm that re-issuing the earlier loads at a different
+/// point cannot move them across an aliasing store.
+static bool getGatherPattern(MachineInstr &Root,
+                             SmallVectorImpl<unsigned> &Patterns,
+                             unsigned LoadLaneOpCode, unsigned NumLanes) {
+  const MachineFunction *MF = Root.getMF();
+
+  // Early exit if optimizing for size.
+  if (MF->getFunction().hasMinSize())
+    return false;
+
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+
+  // The root of the pattern must load into the last lane of the vector.
+  if (Root.getOperand(2).getImm() != NumLanes - 1)
+    return false;
+
+  // Check that we have load into all lanes except lane 0.
+  // For each load we also want to check that:
+  // 1. It has a single non-debug use (since we will be replacing the virtual
+  // register)
+  // 2. That the addressing mode only uses a single offset register.
+  auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+  auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
+  SmallSet<unsigned, 4> RemainingLanes(Range.begin(), Range.end());
+  while (!RemainingLanes.empty() && CurrInstr &&
+         CurrInstr->getOpcode() == LoadLaneOpCode &&
+         MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
+         CurrInstr->getNumOperands() == 4) {
+    // Every lane must be written exactly once. A repeated lane, or a lane
+    // outside 1 .. NumLanes-2, would make the chain longer than the
+    // NumLanes - 1 loads the generator expects.
+    if (!RemainingLanes.erase(CurrInstr->getOperand(2).getImm()))
+      return false;
+    CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+  }
+
+  if (!RemainingLanes.empty())
+    return false;
+
+  // Match the SUBREG_TO_REG sequence. The last getUniqueVRegDef above may
+  // have found no unique definition.
+  if (!CurrInstr || CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
+    return false;
+
+  // Verify that the subreg to reg loads an integer into the first lane.
+  // The source must be a virtual register because the generator queries its
+  // register class.
+  auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
+  if (!Lane0LoadReg.isVirtual())
+    return false;
+  unsigned SingleLaneSizeInBits = 128 / NumLanes;
+  if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
+    return false;
+
+  // Verify that it also has a single non debug use.
+  if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
+    return false;
+
+  switch (NumLanes) {
+  case 4:
+    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32);
+    break;
+  case 8:
+    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16);
+    break;
+  case 16:
+    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8);
+    break;
+  default:
+    llvm_unreachable("Got bad number of lanes for gather pattern.");
+  }
+
+  return true;
+}
+
+/// Look for chains of LD1 instructions that each load into a separate lane
+/// of a 128-bit Neon register. Splitting such a chain across two Neon
+/// registers increases Memory Level Parallelism.
+static bool getLoadPatterns(MachineInstr &Root,
+                            SmallVectorImpl<unsigned> &Patterns) {
+  // Only single-lane loads can root the pattern; the opcode fixes the number
+  // of lanes in the 128-bit vector.
+  const unsigned RootOpcode = Root.getOpcode();
+  unsigned LaneCount = 0;
+  if (RootOpcode == AArch64::LD1i32)
+    LaneCount = 4;
+  else if (RootOpcode == AArch64::LD1i16)
+    LaneCount = 8;
+  else if (RootOpcode == AArch64::LD1i8)
+    LaneCount = 16;
+  else
+    return false;
+
+  return getGatherPattern(Root, Patterns, RootOpcode, LaneCount);
+}
+
+/// Build the replacement sequence for a matched GATHER_LANE_* pattern.
+///
+/// The single chain of lane loads ending at \p Root is split into two
+/// chains: the lower half of the lanes keeps building on the original
+/// SUBREG_TO_REG result, the upper half is loaded into a fresh register
+/// started by a scalar load, and a ZIP1v2i64 joins the two halves into
+/// Root's destination register.
+///
+/// New instructions are appended to \p InsInstrs and their defined virtual
+/// registers are recorded in \p InstrIdxForVirtReg. Replaced loads other
+/// than \p Root are appended to \p DelInstrs. \p Pattern is currently
+/// unused; \p NumLanes is 4, 8 or 16.
+static void
+generateGatherPattern(MachineInstr &Root,
+                      SmallVectorImpl<MachineInstr *> &InsInstrs,
+                      SmallVectorImpl<MachineInstr *> &DelInstrs,
+                      DenseMap<Register, unsigned> &InstrIdxForVirtReg,
+                      unsigned Pattern, unsigned NumLanes) {
+
+  MachineFunction &MF = *Root.getParent()->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+
+  // Gather the initial load instructions to build the pattern
+  SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
+  MachineInstr *CurrInstr = &Root;
+  for (unsigned i = 0; i < NumLanes - 1; ++i) {
+    LoadToLaneInstrs.push_back(CurrInstr);
+    CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+  }
+
+  // Sort the load instructions according to the lane.
+  llvm::sort(LoadToLaneInstrs,
+             [](const MachineInstr *A, const MachineInstr *B) {
+               return A->getOperand(2).getImm() > B->getOperand(2).getImm();
+             });
+
+  // The instruction feeding the first lane load is the SUBREG_TO_REG that
+  // provides lane 0. Its source's defining instruction is appended only so
+  // that, once reversed, index K of the list corresponds to lane K; that
+  // entry is never dereferenced below.
+  MachineInstr *SubregToReg = CurrInstr;
+  Register Lane0Reg = SubregToReg->getOperand(2).getReg();
+  LoadToLaneInstrs.push_back(MRI.getUniqueVRegDef(Lane0Reg));
+  auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
+
+  const TargetRegisterClass *FPR128RegClass =
+      MRI.getRegClass(Root.getOperand(0).getReg());
+
+  // Re-emit OriginalInstr as a load into lane `Lane` of SrcRegister and
+  // return the new result register. No kill flag is put on the address
+  // register: it may have other uses, the rebuilt loads are emitted in a
+  // different order than the originals, and a missing kill flag is always
+  // conservative.
+  auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
+                                Register SrcRegister, unsigned Lane,
+                                Register OffsetRegister) {
+    auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
+    MachineInstrBuilder LoadIndexIntoRegister =
+        BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
+                NewRegister)
+            .addReg(SrcRegister)
+            .addImm(Lane)
+            .addReg(OffsetRegister);
+    InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
+    InsInstrs.push_back(LoadIndexIntoRegister);
+    return NewRegister;
+  };
+
+  // Helper to create load instruction based on opcode
+  auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg,
+                                   Register OffsetReg) -> MachineInstrBuilder {
+    unsigned Opcode;
+    switch (NumLanes) {
+    case 4:
+      Opcode = AArch64::LDRSui;
+      break;
+    case 8:
+      Opcode = AArch64::LDRHui;
+      break;
+    case 16:
+      Opcode = AArch64::LDRBui;
+      break;
+    default:
+      llvm_unreachable(
+          "Got unsupported number of lanes in machine-combiner gather pattern");
+    }
+    // Immediate offset load
+    return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
+        .addReg(OffsetReg)
+        .addImm(0); // immediate offset
+  };
+
+  // Load the remaining lanes into register 0.
+  auto LanesToLoadToReg0 =
+      llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
+                       LoadToLaneInstrsAscending.begin() + NumLanes / 2);
+  auto PrevReg = SubregToReg->getOperand(0).getReg();
+  for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
+    PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
+                                 LoadInstr->getOperand(3).getReg());
+    DelInstrs.push_back(LoadInstr);
+  }
+  auto LastLoadReg0 = PrevReg;
+
+  // First load into register 1. Perform a scalar load (LDRSui, LDRHui or
+  // LDRBui depending on the lane size) to zero out the upper lanes in a
+  // single instruction.
+  auto OriginalSplitLoad =
+      *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
+  // Use the class of the lane-0 source register itself. Operand 0 of its
+  // defining instruction is not guaranteed to be that register.
+  auto DestRegForMiddleIndex =
+      MRI.createVirtualRegister(MRI.getRegClass(Lane0Reg));
+
+  MachineInstrBuilder MiddleIndexLoadInstr =
+      CreateLoadInstruction(NumLanes, DestRegForMiddleIndex,
+                            OriginalSplitLoad->getOperand(3).getReg());
+
+  InstrIdxForVirtReg.insert(
+      std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
+  InsInstrs.push_back(MiddleIndexLoadInstr);
+  DelInstrs.push_back(OriginalSplitLoad);
+
+  // Subreg To Reg instruction for register 1.
+  auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
+  unsigned SubregType;
+  switch (NumLanes) {
+  case 4:
+    SubregType = AArch64::ssub;
+    break;
+  case 8:
+    SubregType = AArch64::hsub;
+    break;
+  case 16:
+    SubregType = AArch64::bsub;
+    break;
+  default:
+    llvm_unreachable(
+        "Got invalid NumLanes for machine-combiner gather pattern");
+  }
+
+  // DestRegForMiddleIndex was created above and this is its only use, so
+  // the kill flag is accurate here.
+  auto SubRegToRegInstr =
+      BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
+              DestRegForSubregToReg)
+          .addImm(0)
+          .addReg(DestRegForMiddleIndex, getKillRegState(true))
+          .addImm(SubregType);
+  InstrIdxForVirtReg.insert(
+      std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
+  InsInstrs.push_back(SubRegToRegInstr);
+
+  // Load remaining lanes into register 1.
+  auto LanesToLoadToReg1 =
+      llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
+                       LoadToLaneInstrsAscending.end());
+  PrevReg = SubRegToRegInstr->getOperand(0).getReg();
+  for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
+    PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
+                                 LoadInstr->getOperand(3).getReg());
+    // The final entry is the load into the last lane, i.e. Root, which is
+    // deliberately not added to DelInstrs here.
+    // NOTE(review): presumably the caller records Root for deletion; confirm.
+    if (Index == NumLanes / 2 - 2) {
+      break;
+    }
+    DelInstrs.push_back(LoadInstr);
+  }
+  auto LastLoadReg1 = PrevReg;
+
+  // Create the final zip instruction to combine the results.
+  MachineInstrBuilder ZipInstr =
+      BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
+              Root.getOperand(0).getReg())
+          .addReg(LastLoadReg0)
+          .addReg(LastLoadReg1);
+  InsInstrs.push_back(ZipInstr);
+}
+
CombinerObjective
AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
switch (Pattern) {
case AArch64MachineCombinerPattern::SUBADD_OP1:
case AArch64MachineCombinerPattern::SUBADD_OP2:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i32:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i16:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i8:
return CombinerObjective::MustReduceDepth;
default:
return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -7401,6 +7671,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
if (getMiscPatterns(Root, Patterns))
return true;
+ // Load patterns
+ if (getLoadPatterns(Root, Patterns))
+ return true;
+
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
DoRegPressureReduce);
}
@@ -8656,6 +8930,21 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
break;
}
+ case AArch64MachineCombinerPattern::GATHER_LANE_i32: {
+ generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+ Pattern, 4);
+ break;
+ }
+ case AArch64MachineCombinerPattern::GATHER_LANE_i16: {
+ generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+ Pattern, 8);
+ break;
+ }
+ case AArch64MachineCombinerPattern::GATHER_LANE_i8: {
+ generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+ Pattern, 16);
+ break;
+ }
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
@@ -9561,10 +9850,15 @@ AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
};
auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
// At least one unsafe register is not dead. We do not want to outline at
- // this point. If it is long enough to outline from, save the range
- // [RangeBegin, RangeEnd).
- if (RangeLen > 1)
- Ranges.push_back(std::make_pair(RangeBegin, RangeEnd));
+ // this point. If it is long enough to outline from and does not cross a
+ // bundle boundary, save the range [RangeBegin, RangeEnd).
+ if (RangeLen <= 1)
+ return;
+ if (!RangeBegin.isEnd() && RangeBegin->isBundledWithPred())
+ return;
+ if (!RangeEnd.isEnd() && RangeEnd->isBundledWithPred())
+ return;
+ Ranges.emplace_back(RangeBegin, RangeEnd);
};
// Find the first point where all unsafe registers are dead.
// FIND: <safe instr> <-- end of first potential range
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 7c255da333e4..02734866e712 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -172,6 +172,10 @@ enum AArch64MachineCombinerPattern : unsigned {
FMULv8i16_indexed_OP2,
FNMADD,
+
+ GATHER_LANE_i32,
+ GATHER_LANE_i16,
+ GATHER_LANE_i8
};
class AArch64InstrInfo final : public AArch64GenInstrInfo {
const AArch64RegisterInfo RI;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index ddc685fae5e9..6c46b18d506c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -419,6 +419,8 @@ def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
def AllowMisalignedMemAccesses : Predicate<"!Subtarget->requiresStrictAlign()">;
+def UseWzrToVecMove : Predicate<"Subtarget->useWzrToVecMove()">;
+
//===----------------------------------------------------------------------===//
// AArch64-specific DAG Nodes.
@@ -1022,6 +1024,7 @@ def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull,
[SDNPCommutative]>;
def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull,
[SDNPCommutative]>;
+def AArch64sqdmulh : SDNode<"AArch64ISD::SQDMULH", SDT_AArch64mull>;
// Reciprocal estimates and steps.
def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>;
@@ -7376,6 +7379,7 @@ def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn),
(i64 0)),
dsub)>;
+let Predicates = [UseWzrToVecMove] in {
def : Pat<(vector_insert (v8f16 V128:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)),
(INSvi16gpr V128:$Rn, VectorIndexH:$imm, WZR)>;
def : Pat<(vector_insert (v4f16 V64:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)),
@@ -7386,6 +7390,7 @@ def : Pat<(vector_insert (v2f32 V64:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm))
(EXTRACT_SUBREG (INSvi32gpr (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexS:$imm, WZR), dsub)>;
def : Pat<(vector_insert v2f64:$Rn, (f64 fpimm0), (i64 VectorIndexD:$imm)),
(INSvi64gpr V128:$Rn, VectorIndexS:$imm, XZR)>;
+}
def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn),
(f16 FPR16:$Rm), (i64 VectorIndexH:$imm))),
@@ -9439,6 +9444,15 @@ def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)),
(EXTRACT_SUBREG V128:$Rm, dsub)),
(UMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>;
+def : Pat<(v4i16 (AArch64sqdmulh (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
+ (SQDMULHv4i16 V64:$Rn, V64:$Rm)>;
+def : Pat<(v2i32 (AArch64sqdmulh (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
+ (SQDMULHv2i32 V64:$Rn, V64:$Rm)>;
+def : Pat<(v8i16 (AArch64sqdmulh (v8i16 V128:$Rn), (v8i16 V128:$Rm))),
+ (SQDMULHv8i16 V128:$Rn, V128:$Rm)>;
+def : Pat<(v4i32 (AArch64sqdmulh (v4i32 V128:$Rn), (v4i32 V128:$Rm))),
+ (SQDMULHv4i32 V128:$Rn, V128:$Rm)>;
+
// Conversions within AdvSIMD types in the same register size are free.
// But because we need a consistent lane ordering, in big endian many
// conversions require one or more REV instructions.
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index e6b22695761e..782d62a7e5e1 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -1666,7 +1666,7 @@ static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
"Given Opc should be a Load or Store with an immediate");
// OpcA will be the first instruction in the pair.
if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
- Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
+ Flags.setSExtIdx(NonSExtOpc == OpcA ? 1 : 0);
return true;
}
@@ -3078,7 +3078,7 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
return false;
Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
- TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
+ TII = Subtarget->getInstrInfo();
TRI = Subtarget->getRegisterInfo();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 5379305bc7a7..adc984ad795a 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -21,40 +21,46 @@ def TuneA320 : SubtargetFeature<"a320", "ARMProcFamily", "CortexA320",
"Cortex-A320 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeaturePostRAScheduler]>;
+ FeaturePostRAScheduler,
+ FeatureUseWzrToVecMove]>;
def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
"Cortex-A53 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeatureBalanceFPOps,
- FeaturePostRAScheduler]>;
+ FeaturePostRAScheduler,
+ FeatureUseWzrToVecMove]>;
def TuneA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55",
"Cortex-A55 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeaturePostRAScheduler,
- FeatureFuseAddress]>;
+ FeatureFuseAddress,
+ FeatureUseWzrToVecMove]>;
def TuneA510 : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510",
"Cortex-A510 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeaturePostRAScheduler
+ FeaturePostRAScheduler,
+ FeatureUseWzrToVecMove
]>;
def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520",
"Cortex-A520 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeaturePostRAScheduler]>;
+ FeaturePostRAScheduler,
+ FeatureUseWzrToVecMove]>;
def TuneA520AE : SubtargetFeature<"a520ae", "ARMProcFamily", "CortexA520",
"Cortex-A520AE ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeaturePostRAScheduler]>;
+ FeaturePostRAScheduler,
+ FeatureUseWzrToVecMove]>;
def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
"Cortex-A57 ARM processors", [
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index dd23bf51a98c..77dfab83a834 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1370,3 +1370,8 @@ bool AArch64RegisterInfo::shouldAnalyzePhysregInMachineLoopInfo(
MCRegister R) const {
return R == AArch64::VG;
}
+
+// Returns true when LLVMReg lies in the Z0-Z31 or the P0-P15 register range.
+bool AArch64RegisterInfo::isIgnoredCVReg(MCRegister LLVMReg) const {
+  if (LLVMReg >= AArch64::Z0 && LLVMReg <= AArch64::Z31)
+    return true;
+  return LLVMReg >= AArch64::P0 && LLVMReg <= AArch64::P15;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index cc94be611a2e..1ed8e959fdd2 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -154,6 +154,8 @@ public:
SmallVectorImpl<uint64_t> &Ops) const override;
bool shouldAnalyzePhysregInMachineLoopInfo(MCRegister R) const override;
+
+  /// Whether \p LLVMReg belongs to the set of registers this target reports
+  /// as ignored (the set is defined in AArch64RegisterInfo.cpp).
+  bool isIgnoredCVReg(MCRegister LLVMReg) const override;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index eddb96979f7b..0c4b4f4c3ed8 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2625,16 +2625,22 @@ let Predicates = [HasSVE_or_SME] in {
// These get expanded to individual LDR_ZXI/STR_ZXI instructions in
// AArch64ExpandPseudoInsts.
let mayLoad = 1, hasSideEffects = 0 in {
- def LDR_ZZXI : Pseudo<(outs ZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def LDR_ZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs ZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def LDR_ZZZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs ZZZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+
+ def LDR_ZZXI : Pseudo<(outs ZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def LDR_ZZZXI : Pseudo<(outs ZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
- def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
- def LDR_PPXI : Pseudo<(outs PPR2:$pp), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def LDR_PPXI : Pseudo<(outs PPR2:$pp), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
}
let mayStore = 1, hasSideEffects = 0 in {
- def STR_ZZXI : Pseudo<(outs), (ins ZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def STR_ZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs), (ins ZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def STR_ZZZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs), (ins ZZZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+
+ def STR_ZZXI : Pseudo<(outs), (ins ZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def STR_ZZZXI : Pseudo<(outs), (ins ZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
- def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
- def STR_PPXI : Pseudo<(outs), (ins PPR2:$pp, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def STR_PPXI : Pseudo<(outs), (ins PPR2:$pp, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
}
let AddedComplexity = 1 in {
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
index 8d3a4553d4b7..b2c3da03b4b8 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
@@ -157,6 +157,7 @@ def V2Write_20c_1V0 : SchedWriteRes<[V2UnitV0]> { let Latency = 20;
def V2Write_2c_1V1 : SchedWriteRes<[V2UnitV1]> { let Latency = 2; }
def V2Write_2c_1V13 : SchedWriteRes<[V2UnitV13]> { let Latency = 2; }
def V2Write_3c_1V1 : SchedWriteRes<[V2UnitV1]> { let Latency = 3; }
+def V2Write_3c_1V13 : SchedWriteRes<[V2UnitV13]> { let Latency = 3; }
def V2Write_4c_1V1 : SchedWriteRes<[V2UnitV1]> { let Latency = 4; }
def V2Write_4c_1V13 : SchedWriteRes<[V2UnitV13]> { let Latency = 4; }
def V2Write_6c_1V1 : SchedWriteRes<[V2UnitV1]> { let Latency = 6; }
@@ -256,8 +257,8 @@ def V2Write_4c_1L01_1V01 : SchedWriteRes<[V2UnitL01, V2UnitV01]> {
let NumMicroOps = 2;
}
-def V2Write_4c_1V13_1V : SchedWriteRes<[V2UnitV13, V2UnitV]> {
- let Latency = 4;
+def V2Write_5c_1V13_1V : SchedWriteRes<[V2UnitV13, V2UnitV]> {
+ let Latency = 5;
let NumMicroOps = 2;
}
@@ -376,8 +377,8 @@ def V2Write_6c_1L_1S : SchedWriteRes<[V2UnitL, V2UnitS]> {
let NumMicroOps = 2;
}
-def V2Write_4c_2V13 : SchedWriteRes<[V2UnitV13, V2UnitV13]> {
- let Latency = 4;
+def V2Write_6c_2V13 : SchedWriteRes<[V2UnitV13, V2UnitV13]> {
+ let Latency = 6;
let NumMicroOps = 2;
}
@@ -1468,14 +1469,14 @@ def : SchedAlias<WriteVq, V2Write_2c_1V>;
def : InstRW<[V2Wr_VA, V2Rd_VA], (instregex "^[SU]ABAL?v")>;
// ASIMD arith, reduce, 4H/4S
-def : InstRW<[V2Write_2c_1V13], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>;
+def : InstRW<[V2Write_3c_1V13], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>;
// ASIMD arith, reduce, 8B/8H
-def : InstRW<[V2Write_4c_1V13_1V],
+def : InstRW<[V2Write_5c_1V13_1V],
(instregex "^(ADDV|[SU]ADDLV)v8(i8|i16)v$")>;
// ASIMD arith, reduce, 16B
-def : InstRW<[V2Write_4c_2V13], (instregex "^(ADDV|[SU]ADDLV)v16i8v$")>;
+def : InstRW<[V2Write_6c_2V13], (instregex "^(ADDV|[SU]ADDLV)v16i8v$")>;
// ASIMD dot product
// ASIMD dot product using signed and unsigned integers
@@ -1486,15 +1487,15 @@ def : InstRW<[V2Wr_VDOT, V2Rd_VDOT],
def : InstRW<[V2Wr_VMMA, V2Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>;
// ASIMD max/min, reduce, 4H/4S
-def : InstRW<[V2Write_2c_1V13], (instregex "^[SU](MAX|MIN)Vv4i16v$",
+def : InstRW<[V2Write_3c_1V13], (instregex "^[SU](MAX|MIN)Vv4i16v$",
"^[SU](MAX|MIN)Vv4i32v$")>;
// ASIMD max/min, reduce, 8B/8H
-def : InstRW<[V2Write_4c_1V13_1V], (instregex "^[SU](MAX|MIN)Vv8i8v$",
+def : InstRW<[V2Write_5c_1V13_1V], (instregex "^[SU](MAX|MIN)Vv8i8v$",
"^[SU](MAX|MIN)Vv8i16v$")>;
// ASIMD max/min, reduce, 16B
-def : InstRW<[V2Write_4c_2V13], (instregex "[SU](MAX|MIN)Vv16i8v$")>;
+def : InstRW<[V2Write_6c_2V13], (instregex "[SU](MAX|MIN)Vv16i8v$")>;
// ASIMD multiply
def : InstRW<[V2Write_4c_1V02], (instregex "^MULv", "^SQ(R)?DMULHv")>;
diff --git a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index c9e729025c70..dd775da97112 100644
--- a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -133,7 +133,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
if (!ST.enableStorePairSuppress())
return false;
- TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
+ TII = ST.getInstrInfo();
TRI = ST.getRegisterInfo();
MRI = &MF.getRegInfo();
SchedModel.init(&ST);
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 095682334679..2409cc862f21 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -270,6 +270,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
break;
case NeoverseV2:
case NeoverseV3:
+ CacheLineSize = 64;
EpilogueVectorizationMinVF = 8;
MaxInterleaveFactor = 4;
ScatterOverhead = 13;
diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
index 1f3d619f6dd8..1b0e90b0e0dc 100644
--- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -2387,6 +2387,9 @@ def : RWSysReg<"TRBSR_EL3", 0b11, 0b110, 0b1001, 0b1011, 0b011>;
// v9.6 FEAT_PoPS
//
let Requires = [{ {AArch64::FeaturePoPS} }] in {
-def : DC<"CIGDVAPS", 0b000, 0b0111, 0b1111, 0b101>;
def : DC<"CIVAPS", 0b000, 0b0111, 0b1111, 0b001>;
}
+
+let Requires = [{ {AArch64::FeaturePoPS, AArch64::FeatureMTE} }] in {
+def : DC<"CIGDVAPS", 0b000, 0b0111, 0b1111, 0b101>;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 20e772655811..90d3d92d6bbf 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2674,14 +2674,14 @@ static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
IntrinsicInst &II) {
// If this barrier is post-dominated by identical one we can remove it
- auto *NI = II.getNextNonDebugInstruction();
+ auto *NI = II.getNextNode();
unsigned LookaheadThreshold = DMBLookaheadThreshold;
auto CanSkipOver = [](Instruction *I) {
return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
};
while (LookaheadThreshold-- && CanSkipOver(NI)) {
auto *NIBB = NI->getParent();
- NI = NI->getNextNonDebugInstruction();
+ NI = NI->getNextNode();
if (!NI) {
if (auto *SuccBB = NIBB->getUniqueSuccessor())
NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
@@ -2723,6 +2723,16 @@ static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
return std::nullopt;
}
+static std::optional<Instruction *>
+instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II) {
+ SMEAttrs FnSMEAttrs(*II.getFunction());
+ bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
+ if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
+ return IC.replaceInstUsesWith(
+ II, ConstantInt::getBool(II.getType(), IsStreaming));
+ return std::nullopt;
+}
+
std::optional<Instruction *>
AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
IntrinsicInst &II) const {
@@ -2828,6 +2838,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
return instCombineSVEUxt(IC, II, 16);
case Intrinsic::aarch64_sve_uxtw:
return instCombineSVEUxt(IC, II, 32);
+ case Intrinsic::aarch64_sme_in_streaming_mode:
+ return instCombineInStreamingMode(IC, II);
}
return std::nullopt;
@@ -3712,7 +3724,7 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
- bool HasRealUse, const Instruction *I, Value *Scalar,
+ const Instruction *I, Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
assert(Val->isVectorTy() && "This must be a vector type");
@@ -3732,12 +3744,10 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
}
// The element at index zero is already inside the vector.
- // - For a physical (HasRealUse==true) insert-element or extract-element
+ // - For an insert-element or extract-element
// instruction that extracts integers, an explicit FPR -> GPR move is
// needed. So it has non-zero cost.
- // - For the rest of cases (virtual instruction or element type is float),
- // consider the instruction free.
- if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
+ if (Index == 0 && !Val->getScalarType()->isIntegerTy())
return 0;
// This is recognising a LD1 single-element structure to one lane of one
@@ -3887,25 +3897,28 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
unsigned Index,
const Value *Op0,
const Value *Op1) const {
- bool HasRealUse =
- Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
- return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse);
+ // Treat insert at lane 0 into a poison vector as having zero cost. This
+ // ensures vector broadcasts via an insert + shuffle (which will be lowered to
+ // a single dup) are treated as cheap.
+ if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
+ isa<PoisonValue>(Op0))
+ return 0;
+ return getVectorInstrCostHelper(Opcode, Val, CostKind, Index);
}
InstructionCost AArch64TTIImpl::getVectorInstrCost(
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
- return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
- Scalar, ScalarUserAndIdx);
+ return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
+ ScalarUserAndIdx);
}
InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index) const {
- return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index,
- true /* HasRealUse */, &I);
+ return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
}
InstructionCost AArch64TTIImpl::getScalarizationOverhead(
@@ -4114,10 +4127,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
// SDIV/UDIV operations are lowered using SVE, then we can have less
// costs.
- if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
- ->getPrimitiveSizeInBits()
- .getFixedValue() < 128) {
- EVT VT = TLI->getValueType(DL, Ty);
+ if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
+ Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
static const CostTblEntry DivTbl[]{
{ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
{ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
@@ -4894,15 +4905,14 @@ void AArch64TTIImpl::getUnrollingPreferences(
// Disable partial & runtime unrolling on -Os.
UP.PartialOptSizeThreshold = 0;
+ // No need to unroll auto-vectorized loops.
+ if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
+ return;
+
// Scan the loop: don't unroll loops with calls as this could prevent
- // inlining. Don't unroll vector loops either, as they don't benefit much from
- // unrolling.
+ // inlining.
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
- // Don't unroll vectorised loop.
- if (I.getType()->isVectorTy())
- return;
-
if (isa<CallBase>(I)) {
if (isa<CallInst>(I) || isa<InvokeInst>(I))
if (const Function *F = cast<CallBase>(I).getCalledFunction())
@@ -5201,33 +5211,34 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
// XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
// AND: llvm/test/CodeGen/AArch64/reduce-and.ll
static const CostTblEntry CostTblNoPairwise[]{
- {ISD::ADD, MVT::v8i8, 2},
- {ISD::ADD, MVT::v16i8, 2},
- {ISD::ADD, MVT::v4i16, 2},
- {ISD::ADD, MVT::v8i16, 2},
- {ISD::ADD, MVT::v4i32, 2},
- {ISD::ADD, MVT::v2i64, 2},
- {ISD::OR, MVT::v8i8, 15},
- {ISD::OR, MVT::v16i8, 17},
- {ISD::OR, MVT::v4i16, 7},
- {ISD::OR, MVT::v8i16, 9},
- {ISD::OR, MVT::v2i32, 3},
- {ISD::OR, MVT::v4i32, 5},
- {ISD::OR, MVT::v2i64, 3},
- {ISD::XOR, MVT::v8i8, 15},
- {ISD::XOR, MVT::v16i8, 17},
- {ISD::XOR, MVT::v4i16, 7},
- {ISD::XOR, MVT::v8i16, 9},
- {ISD::XOR, MVT::v2i32, 3},
- {ISD::XOR, MVT::v4i32, 5},
- {ISD::XOR, MVT::v2i64, 3},
- {ISD::AND, MVT::v8i8, 15},
- {ISD::AND, MVT::v16i8, 17},
- {ISD::AND, MVT::v4i16, 7},
- {ISD::AND, MVT::v8i16, 9},
- {ISD::AND, MVT::v2i32, 3},
- {ISD::AND, MVT::v4i32, 5},
- {ISD::AND, MVT::v2i64, 3},
+ {ISD::ADD, MVT::v8i8, 2},
+ {ISD::ADD, MVT::v16i8, 2},
+ {ISD::ADD, MVT::v4i16, 2},
+ {ISD::ADD, MVT::v8i16, 2},
+ {ISD::ADD, MVT::v2i32, 2},
+ {ISD::ADD, MVT::v4i32, 2},
+ {ISD::ADD, MVT::v2i64, 2},
+ {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
+ {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
+ {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
+ {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
+ {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
+ {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
+ {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
+ {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
+ {ISD::XOR, MVT::v16i8, 7},
+ {ISD::XOR, MVT::v4i16, 4},
+ {ISD::XOR, MVT::v8i16, 6},
+ {ISD::XOR, MVT::v2i32, 3},
+ {ISD::XOR, MVT::v4i32, 5},
+ {ISD::XOR, MVT::v2i64, 3},
+ {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
+ {ISD::AND, MVT::v16i8, 7},
+ {ISD::AND, MVT::v4i16, 4},
+ {ISD::AND, MVT::v8i16, 6},
+ {ISD::AND, MVT::v2i32, 3},
+ {ISD::AND, MVT::v4i32, 5},
+ {ISD::AND, MVT::v2i64, 3},
};
switch (ISD) {
default:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index ff0ab68a16a8..b27eb2ef7a39 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -65,16 +65,14 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
// A helper function called by 'getVectorInstrCost'.
//
- // 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse'
- // indicates whether the vector instruction is available in the input IR or
- // just imaginary in vectorizer passes.
- /// \param ScalarUserAndIdx encodes the information about extracts from a
+ // 'Val' and 'Index' are forwarded from 'getVectorInstrCost'.
+ /// \param ScalarUserAndIdx encodes the information about extracts from a
/// vector with 'Scalar' being the value being extracted,'User' being the user
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCostHelper(
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
- bool HasRealUse, const Instruction *I = nullptr, Value *Scalar = nullptr,
+ const Instruction *I = nullptr, Value *Scalar = nullptr,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {}) const;
public:
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 84884d98e6f9..b9d3e1bf835b 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -142,7 +142,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
uint64_t Value, MCContext &Ctx,
const Triple &TheTriple, bool IsResolved) {
int64_t SignedValue = static_cast<int64_t>(Value);
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
default:
llvm_unreachable("Unknown fixup kind!");
case AArch64::fixup_aarch64_pcrel_adr_imm21:
@@ -417,7 +417,7 @@ static bool shouldForceRelocation(const MCFixup &Fixup) {
// same page as the ADRP and the instruction should encode 0x0. Assuming the
// section isn't 0x1000-aligned, we therefore need to delegate this decision
// to the linker -- a relocation!
- return Fixup.getTargetKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21;
+ return Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21;
}
void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
@@ -431,7 +431,7 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
if (mc::isRelocation(Kind))
return;
- if (Fixup.getTargetKind() == FK_Data_8 && TheTriple.isOSBinFormatELF()) {
+ if (Fixup.getKind() == FK_Data_8 && TheTriple.isOSBinFormatELF()) {
auto RefKind = static_cast<AArch64::Specifier>(Target.getSpecifier());
AArch64::Specifier SymLoc = AArch64::getSymbolLoc(RefKind);
if (SymLoc == AArch64::S_AUTH || SymLoc == AArch64::S_AUTHADDR) {
@@ -488,7 +488,7 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
AArch64::Specifier RefKind =
static_cast<AArch64::Specifier>(Target.getSpecifier());
if (AArch64::getSymbolLoc(RefKind) == AArch64::S_SABS ||
- (!RefKind && Fixup.getTargetKind() == AArch64::fixup_aarch64_movw)) {
+ (!RefKind && Fixup.getKind() == AArch64::fixup_aarch64_movw)) {
// If the immediate is negative, generate MOVN else MOVZ.
// (Bit 30 = 0) ==> MOVN, (Bit 30 = 1) ==> MOVZ.
if (SignedValue < 0)
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index c3881fc79ba6..7618a5769186 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -57,7 +57,7 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32)
// assumes IsILP32 is true
bool AArch64ELFObjectWriter::isNonILP32reloc(const MCFixup &Fixup,
AArch64::Specifier RefKind) const {
- if (Fixup.getTargetKind() != AArch64::fixup_aarch64_movw)
+ if (Fixup.getKind() != AArch64::fixup_aarch64_movw)
return false;
switch (RefKind) {
case AArch64::S_ABS_G3:
@@ -84,7 +84,7 @@ bool AArch64ELFObjectWriter::isNonILP32reloc(const MCFixup &Fixup,
unsigned AArch64ELFObjectWriter::getRelocType(const MCFixup &Fixup,
const MCValue &Target,
bool IsPCRel) const {
- unsigned Kind = Fixup.getTargetKind();
+ auto Kind = Fixup.getKind();
AArch64::Specifier RefKind =
static_cast<AArch64::Specifier>(Target.getSpecifier());
AArch64::Specifier SymLoc = AArch64::getSymbolLoc(RefKind);
@@ -212,7 +212,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(const MCFixup &Fixup,
} else {
if (IsILP32 && isNonILP32reloc(Fixup, RefKind))
return ELF::R_AARCH64_NONE;
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
case FK_Data_1:
reportError(Fixup.getLoc(), "1-byte data relocations not supported");
return ELF::R_AARCH64_NONE;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index f2144375fd95..08f547a85073 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -529,11 +529,9 @@ void AArch64TargetELFStreamer::finish() {
static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection());
bool Empty = true;
for (auto &F : *Text) {
- if (auto *DF = dyn_cast<MCDataFragment>(&F)) {
- if (!DF->getContents().empty()) {
- Empty = false;
- break;
- }
+ if (F.getSize()) {
+ Empty = false;
+ break;
}
}
if (Empty)
@@ -561,8 +559,7 @@ void AArch64TargetELFStreamer::finish() {
if (!Sym.isMemtag())
continue;
auto *SRE = MCSymbolRefExpr::create(&Sym, Ctx);
- (void)S.emitRelocDirective(*Zero, "BFD_RELOC_NONE", SRE, SMLoc(),
- *Ctx.getSubtargetInfo());
+ S.emitRelocDirective(*Zero, "BFD_RELOC_NONE", SRE);
}
}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index f918e3cbc7b8..5c8f57664a2c 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -356,7 +356,7 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
else if (TheTriple.isOSBinFormatCOFF())
MAI = new AArch64MCAsmInfoGNUCOFF();
else
- llvm_unreachable("Invalid target"); // FIXME: This is not unreachable
+ reportFatalUsageError("unsupported object format");
// Initial state of the frame pointer is SP.
unsigned Reg = MRI.getDwarfRegNum(AArch64::SP, true);
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index 61458d7c24be..1ac340a1b58a 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -53,7 +53,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED);
Log2Size = ~0U;
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
default:
return false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 23f106a9c1d4..007b481f8496 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -153,6 +153,9 @@ private:
const TargetMachine &TM;
};
+void initializeAMDGPUPrepareAGPRAllocLegacyPass(PassRegistry &);
+extern char &AMDGPUPrepareAGPRAllocLegacyID;
+
void initializeAMDGPUReserveWWMRegsLegacyPass(PassRegistry &);
extern char &AMDGPUReserveWWMRegsLegacyID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 31420caca089..0e0e83b7a6b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -89,6 +89,12 @@ def FeatureEnableFlatScratch : SubtargetFeature<"enable-flat-scratch",
"Use scratch_* flat memory instructions to access scratch"
>;
+def FeatureFlatGVSMode : SubtargetFeature<"flat-gvs-mode",
+ "FlatGVSMode",
+ "true",
+ "Have GVS addressing mode with flat_* instructions"
+>;
+
def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts",
"AddNoCarryInsts",
"true",
@@ -541,6 +547,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16",
"Use true 16-bit registers"
>;
+def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts",
+ "HasBF16TransInsts",
+ "true",
+ "Has bf16 transcendental instructions"
+>;
+
def FeatureBF16ConversionInsts : SubtargetFeature<"bf16-cvt-insts",
"HasBF16ConversionInsts",
"true",
@@ -1106,6 +1118,12 @@ def FeatureBitOp3Insts : SubtargetFeature<"bitop3-insts",
"Has v_bitop3_b32/v_bitop3_b16 instructions"
>;
+def FeatureTanhInsts : SubtargetFeature<"tanh-insts",
+ "HasTanhInsts",
+ "true",
+ "Has v_tanh_f32/f16 instructions"
+>;
+
def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts",
"HasTransposeLoadF4F6Insts",
"true",
@@ -1948,6 +1966,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureShaderCyclesHiLoRegisters,
FeatureArchitectedFlatScratch,
FeatureArchitectedSGPRs,
+ FeatureFlatGVSMode,
FeatureAtomicFaddRtnInsts,
FeatureAtomicFaddNoRtnInsts,
FeatureAtomicDsPkAdd16Insts,
@@ -1966,7 +1985,9 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureScalarDwordx3Loads,
FeatureDPPSrc1SGPR,
FeatureBitOp3Insts,
+ FeatureTanhInsts,
FeatureTransposeLoadF4F6Insts,
+ FeatureBF16TransInsts,
FeatureBF16ConversionInsts,
FeatureCvtPkF16F32Inst,
FeatureMinimum3Maximum3PKF16,
@@ -2374,6 +2395,9 @@ def HasFlatScratchSTMode : Predicate<"Subtarget->hasFlatScratchSTMode()">,
def HasFlatScratchSVSMode : Predicate<"Subtarget->hasFlatScratchSVSMode()">,
AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX11Insts)>;
+def HasFlatGVSMode : Predicate<"Subtarget->hasFlatGVSMode()">,
+ AssemblerPredicate<(all_of FeatureFlatGVSMode)>;
+
def HasGFX10_AEncoding : Predicate<"Subtarget->hasGFX10_AEncoding()">,
AssemblerPredicate<(all_of FeatureGFX10_AEncoding)>;
@@ -2442,6 +2466,9 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
// FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
+def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
+ AssemblerPredicate<(all_of FeatureBF16TransInsts)>;
+
def HasBF16ConversionInsts : Predicate<"Subtarget->hasBF16ConversionInsts()">,
AssemblerPredicate<(all_of FeatureBF16ConversionInsts)>;
@@ -2657,6 +2684,9 @@ def HasDefaultComponentBroadcast
def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">,
AssemblerPredicate<(all_of FeatureDsSrc2Insts)>;
+def HasAddPC64Inst : Predicate<"Subtarget->hasAddPC64Inst()">,
+ AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+
def EnableFlatScratch : Predicate<"Subtarget->enableFlatScratch()">;
def DisableFlatScratch : Predicate<"!Subtarget->enableFlatScratch()">;
@@ -2680,6 +2710,9 @@ def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">,
def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">,
AssemblerPredicate<(all_of FeatureBitOp3Insts)>;
+def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">,
+ AssemblerPredicate<(all_of FeatureTanhInsts)>;
+
def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">,
AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 79cf49f88d6d..dedee46a4423 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -13,11 +13,9 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
-#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 22b921fb2084..5f1983791cfa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -45,12 +45,6 @@ static cl::opt<bool> WidenLoads(
cl::ReallyHidden,
cl::init(false));
-static cl::opt<bool> Widen16BitOps(
- "amdgpu-codegenprepare-widen-16-bit-ops",
- cl::desc(
- "Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
- cl::ReallyHidden, cl::init(false));
-
static cl::opt<bool>
BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
cl::desc("Break large PHI nodes for DAGISel"),
@@ -150,18 +144,6 @@ public:
bool canBreakPHINode(const PHINode &I);
- /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
- /// binary operation \p V.
- ///
- /// \returns Binary operation \p V.
- /// \returns \p T's base element bit width.
- unsigned getBaseElementBitWidth(const Type *T) const;
-
- /// \returns Equivalent 32 bit integer type for given type \p T. For example,
- /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
- /// is returned.
- Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
-
/// \returns True if binary operation \p I is a signed binary operation, false
/// otherwise.
bool isSigned(const BinaryOperator &I) const;
@@ -170,10 +152,6 @@ public:
/// signed 'icmp' operation, false otherwise.
bool isSigned(const SelectInst &I) const;
- /// \returns True if type \p T needs to be promoted to 32 bit integer type,
- /// false otherwise.
- bool needsPromotionToI32(const Type *T) const;
-
/// Return true if \p T is a legal scalar floating point type.
bool isLegalFloatingTy(const Type *T) const;
@@ -188,52 +166,6 @@ public:
computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
}
- /// Promotes uniform binary operation \p I to equivalent 32 bit binary
- /// operation.
- ///
- /// \details \p I's base element bit width must be greater than 1 and less
- /// than or equal 16. Promotion is done by sign or zero extending operands to
- /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
- /// truncating the result of 32 bit binary operation back to \p I's original
- /// type. Division operation is not promoted.
- ///
- /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
- /// false otherwise.
- bool promoteUniformOpToI32(BinaryOperator &I) const;
-
- /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
- ///
- /// \details \p I's base element bit width must be greater than 1 and less
- /// than or equal 16. Promotion is done by sign or zero extending operands to
- /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
- ///
- /// \returns True.
- bool promoteUniformOpToI32(ICmpInst &I) const;
-
- /// Promotes uniform 'select' operation \p I to 32 bit 'select'
- /// operation.
- ///
- /// \details \p I's base element bit width must be greater than 1 and less
- /// than or equal 16. Promotion is done by sign or zero extending operands to
- /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
- /// result of 32 bit 'select' operation back to \p I's original type.
- ///
- /// \returns True.
- bool promoteUniformOpToI32(SelectInst &I) const;
-
- /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
- /// intrinsic.
- ///
- /// \details \p I's base element bit width must be greater than 1 and less
- /// than or equal 16. Promotion is done by zero extending the operand to 32
- /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
- /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
- /// shift amount is 32 minus \p I's base element bit width), and truncating
- /// the result of the shift operation back to \p I's original type.
- ///
- /// \returns True.
- bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
-
/// \returns The minimum number of bits needed to store the value of \Op as an
/// unsigned integer. Truncating to this size and then zero-extending to
/// the original will not change the value.
@@ -320,13 +252,11 @@ public:
bool visitInstruction(Instruction &I) { return false; }
bool visitBinaryOperator(BinaryOperator &I);
bool visitLoadInst(LoadInst &I);
- bool visitICmpInst(ICmpInst &I);
bool visitSelectInst(SelectInst &I);
bool visitPHINode(PHINode &I);
bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
bool visitIntrinsicInst(IntrinsicInst &I);
- bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
bool visitFMinLike(IntrinsicInst &I);
bool visitSqrt(IntrinsicInst &I);
bool run();
@@ -380,22 +310,6 @@ bool AMDGPUCodeGenPrepareImpl::run() {
return MadeChange;
}
-unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
- assert(needsPromotionToI32(T) && "T does not need promotion to i32");
-
- if (T->isIntegerTy())
- return T->getIntegerBitWidth();
- return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
-}
-
-Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const {
- assert(needsPromotionToI32(T) && "T does not need promotion to i32");
-
- if (T->isIntegerTy())
- return B.getInt32Ty();
- return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
-}
-
bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
return I.getOpcode() == Instruction::AShr ||
I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
@@ -406,59 +320,11 @@ bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
cast<ICmpInst>(I.getOperand(0))->isSigned();
}
-bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {
- if (!Widen16BitOps)
- return false;
-
- const IntegerType *IntTy = dyn_cast<IntegerType>(T);
- if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
- return true;
-
- if (const VectorType *VT = dyn_cast<VectorType>(T)) {
- // TODO: The set of packed operations is more limited, so may want to
- // promote some anyway.
- if (ST.hasVOP3PInsts())
- return false;
-
- return needsPromotionToI32(VT->getElementType());
- }
-
- return false;
-}
-
bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
return Ty->isFloatTy() || Ty->isDoubleTy() ||
(Ty->isHalfTy() && ST.has16BitInsts());
}
-// Return true if the op promoted to i32 should have nsw set.
-static bool promotedOpIsNSW(const Instruction &I) {
- switch (I.getOpcode()) {
- case Instruction::Shl:
- case Instruction::Add:
- case Instruction::Sub:
- return true;
- case Instruction::Mul:
- return I.hasNoUnsignedWrap();
- default:
- return false;
- }
-}
-
-// Return true if the op promoted to i32 should have nuw set.
-static bool promotedOpIsNUW(const Instruction &I) {
- switch (I.getOpcode()) {
- case Instruction::Shl:
- case Instruction::Add:
- case Instruction::Mul:
- return true;
- case Instruction::Sub:
- return I.hasNoUnsignedWrap();
- default:
- return false;
- }
-}
-
bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
Type *Ty = I.getType();
int TySize = DL.getTypeSizeInBits(Ty);
@@ -467,134 +333,6 @@ bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I);
}
-bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
- assert(needsPromotionToI32(I.getType()) &&
- "I does not need promotion to i32");
-
- if (I.getOpcode() == Instruction::SDiv ||
- I.getOpcode() == Instruction::UDiv ||
- I.getOpcode() == Instruction::SRem ||
- I.getOpcode() == Instruction::URem)
- return false;
-
- IRBuilder<> Builder(&I);
- Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
- Type *I32Ty = getI32Ty(Builder, I.getType());
- Value *ExtOp0 = nullptr;
- Value *ExtOp1 = nullptr;
- Value *ExtRes = nullptr;
- Value *TruncRes = nullptr;
-
- if (isSigned(I)) {
- ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
- ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
- } else {
- ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
- ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
- }
-
- ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
- if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
- if (promotedOpIsNSW(cast<Instruction>(I)))
- Inst->setHasNoSignedWrap();
-
- if (promotedOpIsNUW(cast<Instruction>(I)))
- Inst->setHasNoUnsignedWrap();
-
- if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
- Inst->setIsExact(ExactOp->isExact());
- }
-
- TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
-
- I.replaceAllUsesWith(TruncRes);
- I.eraseFromParent();
-
- return true;
-}
-
-bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
- assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
- "I does not need promotion to i32");
-
- IRBuilder<> Builder(&I);
- Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
- Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
- Value *ExtOp0 = nullptr;
- Value *ExtOp1 = nullptr;
- Value *NewICmp = nullptr;
-
- if (I.isSigned()) {
- ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
- ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
- } else {
- ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
- ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
- }
- NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
-
- I.replaceAllUsesWith(NewICmp);
- I.eraseFromParent();
-
- return true;
-}
-
-bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
- assert(needsPromotionToI32(I.getType()) &&
- "I does not need promotion to i32");
-
- IRBuilder<> Builder(&I);
- Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
- Type *I32Ty = getI32Ty(Builder, I.getType());
- Value *ExtOp1 = nullptr;
- Value *ExtOp2 = nullptr;
- Value *ExtRes = nullptr;
- Value *TruncRes = nullptr;
-
- if (isSigned(I)) {
- ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
- ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
- } else {
- ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
- ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
- }
- ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
- TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
-
- I.replaceAllUsesWith(TruncRes);
- I.eraseFromParent();
-
- return true;
-}
-
-bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
- IntrinsicInst &I) const {
- assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
- "I must be bitreverse intrinsic");
- assert(needsPromotionToI32(I.getType()) &&
- "I does not need promotion to i32");
-
- IRBuilder<> Builder(&I);
- Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
- Type *I32Ty = getI32Ty(Builder, I.getType());
- Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
- Value *ExtRes =
- Builder.CreateIntrinsic(Intrinsic::bitreverse, {I32Ty}, {ExtOp});
- Value *LShrOp =
- Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
- Value *TruncRes =
- Builder.CreateTrunc(LShrOp, I.getType());
-
- I.replaceAllUsesWith(TruncRes);
- I.eraseFromParent();
-
- return true;
-}
-
unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
return computeKnownBits(Op, DL, AC).countMaxActiveBits();
}
@@ -1635,10 +1373,6 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
if (foldBinOpIntoSelect(I))
return true;
- if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) &&
- UA.isUniform(&I) && promoteUniformOpToI32(I))
- return true;
-
if (UseMul24Intrin && replaceMulWithMul24(I))
return true;
if (tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(),
@@ -1770,16 +1504,6 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
return false;
}
-bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
- bool Changed = false;
-
- if (ST.has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
- UA.isUniform(&I))
- Changed |= promoteUniformOpToI32(I);
-
- return Changed;
-}
-
bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
Value *Cond = I.getCondition();
Value *TrueVal = I.getTrueValue();
@@ -1787,12 +1511,6 @@ bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
Value *CmpVal;
CmpPredicate Pred;
- if (ST.has16BitInsts() && needsPromotionToI32(I.getType())) {
- if (UA.isUniform(&I))
- return promoteUniformOpToI32(I);
- return false;
- }
-
// Match fract pattern with nan check.
if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN())))
return false;
@@ -2196,8 +1914,6 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
switch (I.getIntrinsicID()) {
- case Intrinsic::bitreverse:
- return visitBitreverseIntrinsicInst(I);
case Intrinsic::minnum:
case Intrinsic::minimumnum:
case Intrinsic::minimum:
@@ -2209,16 +1925,6 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
}
}
-bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
- bool Changed = false;
-
- if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) &&
- UA.isUniform(&I))
- Changed |= promoteUniformBitreverseToI32(I);
-
- return Changed;
-}
-
/// Match non-nan fract pattern.
/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
/// minimumnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 1b909568fc55..7b5d4077e85f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -55,6 +55,14 @@ def gi_vop3pmodsneg :
GIComplexOperandMatcher<s32, "selectVOP3PModsNeg">,
GIComplexPatternEquiv<VOP3PModsNeg>;
+def gi_vop3pmodsnegs :
+ GIComplexOperandMatcher<s32, "selectVOP3PModsNegs">,
+ GIComplexPatternEquiv<VOP3PModsNegs>;
+
+def gi_dotiuvop3pmodsnegabs :
+ GIComplexOperandMatcher<s32, "selectVOP3PModsNegAbs">,
+ GIComplexPatternEquiv<VOP3PModsNegAbs>;
+
def gi_wmmaopselvop3pmods :
GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">,
GIComplexPatternEquiv<WMMAOpSelVOP3PMods>;
@@ -83,6 +91,10 @@ def gi_swmmacindex16 :
GIComplexOperandMatcher<s32, "selectSWMMACIndex16">,
GIComplexPatternEquiv<SWMMACIndex16>;
+def gi_swmmacindex32 :
+ GIComplexOperandMatcher<s64, "selectSWMMACIndex32">,
+ GIComplexPatternEquiv<SWMMACIndex32>;
+
def gi_vop3opselmods :
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
GIComplexPatternEquiv<VOP3OpSelMods>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 202693b31612..25672a52345c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -447,6 +447,35 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
return;
}
+ bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
+ if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 &&
+ CurDAG->isConstantValueOfAnyType(SDValue(N, 0))) {
+ uint64_t C = 0;
+ bool AllConst = true;
+ unsigned EltSize = EltVT.getSizeInBits();
+ for (unsigned I = 0; I < NumVectorElts; ++I) {
+ SDValue Op = N->getOperand(I);
+ if (Op.isUndef()) {
+ AllConst = false;
+ break;
+ }
+ uint64_t Val;
+ if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Op)) {
+ Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
+ } else
+ Val = cast<ConstantSDNode>(Op)->getZExtValue();
+ C |= Val << (EltSize * I);
+ }
+ if (AllConst) {
+ SDValue CV = CurDAG->getTargetConstant(C, DL, MVT::i64);
+ MachineSDNode *Copy =
+ CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, VT, CV);
+ CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, VT, SDValue(Copy, 0),
+ RegClass);
+ return;
+ }
+ }
+
assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
"supported yet");
// 32 = Max Num Vector Elements
@@ -454,7 +483,6 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
// 1 = Vector Register Class
SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
- bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
bool IsRegSeq = true;
unsigned NOps = N->getNumOperands();
@@ -676,7 +704,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::Constant:
case ISD::ConstantFP: {
- if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
+ if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N) ||
+ Subtarget->has64BitLiterals())
break;
uint64_t Imm;
@@ -1632,8 +1661,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
SDValue &SRsrc,
SDValue &SOffset,
SDValue &Offset) const {
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
MachineFunction &MF = CurDAG->getMachineFunction();
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
@@ -3245,6 +3273,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
return SelectVOP3PMods(In, Src, SrcMods, true);
}
+// Select neg_lo from the i1 immediate operand.
bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
const ConstantSDNode *C = cast<ConstantSDNode>(In);
// Literal i1 value set in intrinsic, represents SrcMods for the next operand.
@@ -3260,6 +3289,47 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
return true;
}
+// Select both neg_lo and neg_hi from the i1 immediate operand. This is
+// specifically for F16/BF16 operands in WMMA instructions, where neg_lo applies
+// to matrix's even k elements, and neg_hi applies to matrix's odd k elements.
+bool AMDGPUDAGToDAGISel::SelectVOP3PModsNegs(SDValue In, SDValue &Src) const {
+ const ConstantSDNode *C = cast<ConstantSDNode>(In);
+ // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
+ // 1 sets both NEG and NEG_HI on top of OP_SEL_1, 0 leaves only OP_SEL_1.
+ assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
+
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ unsigned SrcSign = C->getZExtValue();
+ if (SrcSign == 1)
+ Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
+
+ Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); // Src receives the modifier mask, not a value.
+ return true;
+}
+
+// Select neg, abs, or both neg and abs from the i16 immediate operand.
+bool AMDGPUDAGToDAGISel::SelectVOP3PModsNegAbs(SDValue In, SDValue &Src) const {
+ const ConstantSDNode *C = cast<ConstantSDNode>(In);
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ unsigned SrcMod = C->getZExtValue(); // 1 = neg, 2 = abs, 3 = neg and abs.
+ switch (SrcMod) {
+ default: // Any other value will be silently ignored (considered as 0).
+ break;
+ case 1:
+ Mods ^= SISrcMods::NEG;
+ break;
+ case 2:
+ Mods ^= SISrcMods::ABS;
+ break;
+ case 3:
+ Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
+ break;
+ }
+
+ Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); // Src receives the modifier mask, not a value.
+ return true;
+}
+
bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
SDValue &Src) const {
const ConstantSDNode *C = cast<ConstantSDNode>(In);
@@ -3611,6 +3681,41 @@ bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src,
+ SDValue &IndexKey) const {
+ unsigned Key = 0; // Key emitted when no pattern below matches.
+ Src = In; // Default: pass the operand through unchanged.
+
+ SDValue InI32; // 32-bit value feeding In, if one is recognized below.
+
+ if (In.getOpcode() == ISD::ANY_EXTEND || In.getOpcode() == ISD::ZERO_EXTEND) {
+ const SDValue &ExtendSrc = In.getOperand(0);
+ if (ExtendSrc.getValueSizeInBits() == 32)
+ InI32 = ExtendSrc; // In is an any/zero extend of a 32-bit value.
+ } else if (In->getOpcode() == ISD::BITCAST) {
+ const SDValue &CastSrc = In.getOperand(0);
+ if (CastSrc.getOpcode() == ISD::BUILD_VECTOR &&
+ CastSrc.getOperand(0).getValueSizeInBits() == 32) {
+ ConstantSDNode *Zero = dyn_cast<ConstantSDNode>(CastSrc.getOperand(1));
+ if (Zero && Zero->getZExtValue() == 0)
+ InI32 = CastSrc.getOperand(0); // In is a bitcast of build_vector(x, 0, ...) with 32-bit x.
+ }
+ }
+
+ if (InI32 && InI32.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ const SDValue &ExtractVecEltSrc = InI32.getOperand(0);
+ ConstantSDNode *EltIdx = dyn_cast<ConstantSDNode>(InI32.getOperand(1));
+ if (ExtractVecEltSrc.getValueSizeInBits() == 64 && EltIdx &&
+ EltIdx->getZExtValue() == 1) {
+ Key = 1; // The 32-bit value is element 1 of a 64-bit vector.
+ Src = ExtractVecEltSrc; // Use the whole 64-bit vector as the source instead.
+ }
+ }
+
+ IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
+ return true; // Always matches; unmatched inputs get Src = In and Key = 0.
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
Src = In;
@@ -3885,10 +3990,8 @@ SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());
- const SIRegisterInfo *SIRI =
- static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
- const SIInstrInfo * SII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+ const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
+ const SIInstrInfo *SII = Subtarget->getInstrInfo();
unsigned Limit = 0;
bool AllUsesAcceptSReg = true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index f3b9364fdb92..9967f46e085e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -222,6 +222,8 @@ private:
bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3PModsNeg(SDValue In, SDValue &Src) const;
+ bool SelectVOP3PModsNegs(SDValue In, SDValue &Src) const;
+ bool SelectVOP3PModsNegAbs(SDValue In, SDValue &Src) const;
bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const;
bool SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
@@ -233,6 +235,7 @@ private:
bool SelectSWMMACIndex8(SDValue In, SDValue &Src, SDValue &IndexKey) const;
bool SelectSWMMACIndex16(SDValue In, SDValue &Src, SDValue &IndexKey) const;
+ bool SelectSWMMACIndex32(SDValue In, SDValue &Src, SDValue &IndexKey) const;
bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e64d2162441a..3d040fb705a8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4006,7 +4006,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_rsq_legacy:
- case Intrinsic::amdgcn_rsq_clamp: {
+ case Intrinsic::amdgcn_rsq_clamp:
+ case Intrinsic::amdgcn_tanh: {
// FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
SDValue Src = N->getOperand(1);
return Src.isUndef() ? Src : SDValue();
@@ -4842,11 +4843,94 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
return SDValue();
}
+// Detect when CMP and SELECT use the same constant and fold them to avoid
+// loading the constant twice. Specifically handles patterns like:
+// %cmp = icmp eq i32 %val, 4242
+// %sel = select i1 %cmp, i32 4242, i32 %other
+// It can be optimized to reuse %val instead of 4242 in select.
+static SDValue
+foldCmpSelectWithSharedConstant(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const AMDGPUSubtarget *ST) {
+ SDValue Cond = N->getOperand(0);
+ SDValue TrueVal = N->getOperand(1);
+ SDValue FalseVal = N->getOperand(2);
+
+ // Check if condition is a comparison.
+ if (Cond.getOpcode() != ISD::SETCC)
+ return SDValue();
+
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
+ bool isInteger = LHS.getValueType().isInteger();
+
+ // Handle simple floating-point and integer types only.
+ if (!isFloatingPoint && !isInteger)
+ return SDValue();
+
+ bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
+ bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
+ if (!isEquality && !isNonEquality)
+ return SDValue();
+
+ SDValue ArgVal, ConstVal;
+ if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
+ (isInteger && isa<ConstantSDNode>(RHS))) {
+ ConstVal = RHS;
+ ArgVal = LHS;
+ } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
+ (isInteger && isa<ConstantSDNode>(LHS))) {
+ ConstVal = LHS;
+ ArgVal = RHS;
+ } else {
+ return SDValue();
+ }
+
+ // Check if constant should not be optimized - early return if not.
+ if (isFloatingPoint) {
+ const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
+ const GCNSubtarget *GCNST = static_cast<const GCNSubtarget *>(ST); // NOTE(review): unchecked downcast from AMDGPUSubtarget; assumes ST is a GCNSubtarget - confirm for all callers.
+
+ // Only optimize normal floating-point values (finite, non-zero, and
+ // non-subnormal as per IEEE 754), skip optimization for inlinable
+ // floating-point constants.
+ if (!Val.isNormal() || GCNST->getInstrInfo()->isInlineConstant(Val))
+ return SDValue();
+ } else {
+ int64_t IntVal = cast<ConstantSDNode>(ConstVal)->getSExtValue();
+
+ // Skip optimization for inlinable integer immediates.
+ // Inlinable immediates include: -16 to 64 (inclusive).
+ if (IntVal >= -16 && IntVal <= 64)
+ return SDValue();
+ }
+
+ // For equality and non-equality comparisons, patterns:
+ // select (setcc x, const), const, y -> select (setcc x, const), x, y
+ // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
+ if (!(isEquality && TrueVal == ConstVal) &&
+ !(isNonEquality && FalseVal == ConstVal))
+ return SDValue();
+
+ SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
+ SDValue SelectRHS =
+ (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
+ return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
+ SelectLHS, SelectRHS);
+}
+
SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
return Folded;
+ // Try to fold CMP + SELECT patterns with shared constants (both FP and
+ // integer).
+ if (SDValue Folded = foldCmpSelectWithSharedConstant(N, DCI, Subtarget))
+ return Folded;
+
SDValue Cond = N->getOperand(0);
if (Cond.getOpcode() != ISD::SETCC)
return SDValue();
@@ -5733,6 +5817,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
+ NODE_NAME_CASE(PC_ADD_REL_OFFSET64)
NODE_NAME_CASE(LDS)
NODE_NAME_CASE(DUMMY_CHAIN)
NODE_NAME_CASE(LOAD_D16_HI)
@@ -6196,7 +6281,8 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_rsq_legacy:
- case Intrinsic::amdgcn_rsq_clamp: {
+ case Intrinsic::amdgcn_rsq_clamp:
+ case Intrinsic::amdgcn_tanh: {
if (SNaN)
return true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 0dd2183b72b2..4e8c6c7ea3b2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -545,6 +545,7 @@ enum NodeType : unsigned {
/// Pointer to the start of the shader's constant data.
CONST_DATA_PTR,
PC_ADD_REL_OFFSET,
+ PC_ADD_REL_OFFSET64,
LDS,
DUMMY_CHAIN,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index 44eaebffb70d..9a90787963d7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -25,6 +25,7 @@ namespace {
class AMDGPUInsertDelayAlu {
public:
+ const GCNSubtarget *ST;
const SIInstrInfo *SII;
const TargetRegisterInfo *TRI;
@@ -65,13 +66,16 @@ public:
// Types of delay that can be encoded in an s_delay_alu instruction.
enum DelayType { VALU, TRANS, SALU, OTHER };
- // Get the delay type for an instruction with the specified TSFlags.
- static DelayType getDelayType(uint64_t TSFlags) {
- if (TSFlags & SIInstrFlags::TRANS)
+ // Get the delay type for a MachineInstr.
+ DelayType getDelayType(const MachineInstr &MI) {
+ if (SIInstrInfo::isTRANS(MI))
return TRANS;
- if (TSFlags & SIInstrFlags::VALU)
+ // WMMA XDL ops are treated the same as TRANS.
+ if (AMDGPU::isGFX1250(*ST) && SII->isXDLWMMA(MI))
+ return TRANS;
+ if (SIInstrInfo::isVALU(MI))
return VALU;
- if (TSFlags & SIInstrFlags::SALU)
+ if (SIInstrInfo::isSALU(MI))
return SALU;
return OTHER;
}
@@ -368,7 +372,7 @@ public:
continue;
}
- DelayType Type = getDelayType(MI.getDesc().TSFlags);
+ DelayType Type = getDelayType(MI);
if (instructionWaitsForSGPRWrites(MI)) {
auto It = State.find(LastSGPRFromVALU);
@@ -456,12 +460,12 @@ public:
LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
<< "\n");
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (!ST.hasDelayAlu())
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ if (!ST->hasDelayAlu())
return false;
- SII = ST.getInstrInfo();
- TRI = ST.getRegisterInfo();
+ SII = ST->getInstrInfo();
+ TRI = ST->getRegisterInfo();
SchedModel = &SII->getSchedModel();
// Calculate the delay state for each basic block, iterating until we reach
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index b8996fb97f1c..e2c2e8912c71 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -700,7 +700,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
break;
}
case Intrinsic::amdgcn_sqrt:
- case Intrinsic::amdgcn_rsq: {
+ case Intrinsic::amdgcn_rsq:
+ case Intrinsic::amdgcn_tanh: {
Value *Src = II.getArgOperand(0);
if (isa<PoisonValue>(Src))
return IC.replaceInstUsesWith(II, Src);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index ea79c57080fa..1a63c48e3666 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3513,6 +3513,25 @@ static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
return Register();
}
+Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
+ Register AnyExtSrc;
+ if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
+ return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register();
+
+ // Match legalized form %anyext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
+ const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
+ if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
+ return Register();
+
+ assert(Def->getNumOperands() == 3 &&
+ MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
+
+ if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef()))
+ return Def->getOperand(1).getReg(); // Second merged source is undefined; return the first.
+
+ return Register(); // No match: return an invalid register.
+}
+
bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
if (!Subtarget->hasVMemToLDSLoad())
return false;
@@ -4904,6 +4923,7 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
return selectVOP3PRetHelper(Root, true);
}
+// Select neg_lo from the i1 immediate operand.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
// Literal i1 value set in intrinsic, represents SrcMods for the next operand.
@@ -4919,6 +4939,50 @@ AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
}};
}
+// Select both neg_lo and neg_hi from the i1 immediate operand. This is
+// specifically for F16/BF16 operands in WMMA instructions, where neg_lo applies
+// to matrix's even k elements, and neg_hi applies to matrix's odd k elements.
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PModsNegs(MachineOperand &Root) const {
+ // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
+ // Value is in Imm operand as i1 sign extended to int64_t.
+ // 1 (seen as -1) sets both NEG and NEG_HI on top of OP_SEL_1, 0 sets neither.
+ assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
+ "expected i1 value");
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ if (Root.getImm() == -1)
+ Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ }};
+}
+
+// Select neg, abs, or both neg and abs from the i16 immediate operand.
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PModsNegAbs(MachineOperand &Root) const {
+
+ assert(Root.isImm() && "Modifier for C must be an immediate");
+
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ switch (Root.getImm()) { // 1 = neg, 2 = abs, 3 = neg and abs.
+ default: // Any other value will be silently ignored (considered as 0).
+ break;
+ case 1:
+ Mods ^= SISrcMods::NEG;
+ break;
+ case 2:
+ Mods ^= SISrcMods::ABS;
+ break;
+ case 3:
+ Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
+ break;
+ }
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ }};
+}
+
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
MachineOperand &Root) const {
@@ -5150,6 +5214,35 @@ AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
+ Register Src =
+ getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
+ unsigned Key = 0; // Key emitted when no pattern below matches.
+
+ Register S32 = matchZeroExtendFromS32(*MRI, Src);
+ if (!S32)
+ S32 = matchAnyExtendFromS32(Src); // Fall back to an any-extend of a 32-bit value.
+
+ if (S32) {
+ const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI);
+ if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
+ assert(Def->getNumOperands() == 3);
+ Register DstReg1 = Def->getOperand(1).getReg();
+ if (mi_match(S32, *MRI,
+ m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) {
+ Src = Def->getOperand(2).getReg(); // S32 is operand 1 of the unmerge; use operand 2 as the source.
+ Key = 1;
+ }
+ }
+ }
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 8e9e573147a8..2cb7904d27cc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -201,6 +201,10 @@ private:
InstructionSelector::ComplexRendererFns
selectVOP3PModsNeg(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectVOP3PModsNegs(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectVOP3PModsNegAbs(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectWMMAOpSelVOP3PMods(MachineOperand &Root) const;
@@ -217,6 +221,8 @@ private:
selectSWMMACIndex8(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectSWMMACIndex16(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectSWMMACIndex32(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectVOP3OpSelMods(MachineOperand &Root) const;
@@ -411,6 +417,9 @@ private:
// shift amount operand's `ShAmtBits` bits is unneeded.
bool isUnneededShiftMask(const MachineInstr &MI, unsigned ShAmtBits) const;
+ /// Match an any extend from a 32-bit value to 64-bit.
+ Register matchAnyExtendFromS32(Register Reg) const;
+
const SIInstrInfo &TII;
const SIRegisterInfo &TRI;
const AMDGPURegisterBankInfo &RBI;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index aa678df675fb..e7bf88d2ee5b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2932,14 +2932,22 @@ bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
- MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
- .addDef(PCReg);
+ if (ST.has64BitLiterals()) {
+ assert(GAFlags != SIInstrInfo::MO_NONE);
- MIB.addGlobalAddress(GV, Offset, GAFlags);
- if (GAFlags == SIInstrInfo::MO_NONE)
- MIB.addImm(0);
- else
- MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
+ MachineInstrBuilder MIB =
+ B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
+ MIB.addGlobalAddress(GV, Offset, GAFlags + 2);
+ } else {
+ MachineInstrBuilder MIB =
+ B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
+
+ MIB.addGlobalAddress(GV, Offset, GAFlags);
+ if (GAFlags == SIInstrInfo::MO_NONE)
+ MIB.addImm(0);
+ else
+ MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
+ }
if (!B.getMRI()->getRegClassOrNull(PCReg))
B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
@@ -2955,6 +2963,15 @@ void AMDGPULegalizerInfo::buildAbsGlobalAddress(
MachineRegisterInfo &MRI) const {
bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
+ if (RequiresHighHalf && ST.has64BitLiterals()) {
+ if (!MRI.getRegClassOrNull(DstReg))
+ MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
+ B.buildInstr(AMDGPU::S_MOV_B64)
+ .addDef(DstReg)
+ .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64);
+ return;
+ }
+
LLT S32 = LLT::scalar(32);
// Use the destination directly, if and only if we store the lower address
@@ -7622,6 +7639,20 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
case Intrinsic::amdgcn_image_bvh8_intersect_ray:
return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B);
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
+ Register Index = MI.getOperand(5).getReg();
+ LLT S64 = LLT::scalar(64);
+ if (MRI.getType(Index) != S64)
+ MI.getOperand(5).setReg(B.buildAnyExt(S64, Index).getReg(0));
+ return true;
+ }
case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
@@ -7636,15 +7667,24 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
return true;
}
+ case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
+ case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
+ case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
Register Index = MI.getOperand(7).getReg();
- LLT S32 = LLT::scalar(32);
- if (MRI.getType(Index) != S32)
- MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
+ LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
+ ? LLT::scalar(64)
+ : LLT::scalar(32);
+ if (MRI.getType(Index) != IdxTy)
+ MI.getOperand(7).setReg(B.buildAnyExt(IdxTy, Index).getReg(0));
return true;
}
+
case Intrinsic::amdgcn_fmed3: {
GISelChangeObserver &Observer = Helper.Observer;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 2dec16de940d..c84a0f6e3138 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -50,6 +50,7 @@ static AMDGPUMCExpr::Specifier getSpecifier(unsigned MOFlags) {
default:
return AMDGPUMCExpr::S_None;
case SIInstrInfo::MO_GOTPCREL:
+ case SIInstrInfo::MO_GOTPCREL64:
return AMDGPUMCExpr::S_GOTPCREL;
case SIInstrInfo::MO_GOTPCREL32_LO:
return AMDGPUMCExpr::S_GOTPCREL32_LO;
@@ -59,10 +60,14 @@ static AMDGPUMCExpr::Specifier getSpecifier(unsigned MOFlags) {
return AMDGPUMCExpr::S_REL32_LO;
case SIInstrInfo::MO_REL32_HI:
return AMDGPUMCExpr::S_REL32_HI;
+ case SIInstrInfo::MO_REL64:
+ return AMDGPUMCExpr::S_REL64;
case SIInstrInfo::MO_ABS32_LO:
return AMDGPUMCExpr::S_ABS32_LO;
case SIInstrInfo::MO_ABS32_HI:
return AMDGPUMCExpr::S_ABS32_HI;
+ case SIInstrInfo::MO_ABS64:
+ return AMDGPUMCExpr::S_ABS64;
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 5d298304c27f..b6c6d927d0e8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -114,7 +114,9 @@ MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUse
MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass())
MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass())
MACHINE_FUNCTION_PASS("amdgpu-preload-kern-arg-prolog", AMDGPUPreloadKernArgPrologPass())
+MACHINE_FUNCTION_PASS("amdgpu-prepare-agpr-alloc", AMDGPUPrepareAGPRAllocPass())
MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass())
+MACHINE_FUNCTION_PASS("amdgpu-wait-sgpr-hazards", AMDGPUWaitSGPRHazardsPass())
MACHINE_FUNCTION_PASS("gcn-create-vopd", GCNCreateVOPDPass())
MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass())
MACHINE_FUNCTION_PASS("si-fix-sgpr-copies", SIFixSGPRCopiesPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp
new file mode 100644
index 000000000000..3b06e9b00ac6
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp
@@ -0,0 +1,108 @@
+//===-- AMDGPUPrepareAGPRAlloc.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Make simple transformations to relax register constraints for cases which can
+// allocate to AGPRs or VGPRs. Replace materialize of inline immediates into
+// AGPR or VGPR with a pseudo with an AV_* class register constraint. This
+// allows later passes to inflate the register class if necessary. The register
+// allocator does not know to replace instructions to relax constraints.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUPrepareAGPRAlloc.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-prepare-agpr-alloc"
+
+namespace {
+
+class AMDGPUPrepareAGPRAllocImpl {
+private:
+ const SIInstrInfo &TII;
+ MachineRegisterInfo &MRI;
+
+public:
+ AMDGPUPrepareAGPRAllocImpl(const GCNSubtarget &ST, MachineRegisterInfo &MRI)
+ : TII(*ST.getInstrInfo()), MRI(MRI) {}
+ bool run(MachineFunction &MF);
+};
+
+class AMDGPUPrepareAGPRAllocLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPUPrepareAGPRAllocLegacy() : MachineFunctionPass(ID) {
+ initializeAMDGPUPrepareAGPRAllocLegacyPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return "AMDGPU Prepare AGPR Alloc"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE,
+ "AMDGPU Prepare AGPR Alloc", false, false)
+INITIALIZE_PASS_END(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE,
+ "AMDGPU Prepare AGPR Alloc", false, false)
+
+char AMDGPUPrepareAGPRAllocLegacy::ID = 0;
+
+char &llvm::AMDGPUPrepareAGPRAllocLegacyID = AMDGPUPrepareAGPRAllocLegacy::ID;
+
+bool AMDGPUPrepareAGPRAllocLegacy::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ return AMDGPUPrepareAGPRAllocImpl(ST, MF.getRegInfo()).run(MF);
+}
+
+PreservedAnalyses
+AMDGPUPrepareAGPRAllocPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ AMDGPUPrepareAGPRAllocImpl(ST, MF.getRegInfo()).run(MF);
+ return PreservedAnalyses::all();
+}
+
+bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) {
+ if (MRI.isReserved(AMDGPU::AGPR0))
+ return false;
+
+ const MCInstrDesc &AVImmPseudo = TII.get(AMDGPU::AV_MOV_B32_IMM_PSEUDO);
+
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if ((MI.getOpcode() == AMDGPU::V_MOV_B32_e32 &&
+ TII.isInlineConstant(MI, 1)) ||
+ (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ MI.getOperand(1).isImm())) {
+ MI.setDesc(AVImmPseudo);
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h
new file mode 100644
index 000000000000..dc598c98f241
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h
@@ -0,0 +1,23 @@
+//===- AMDGPUPrepareAGPRAlloc.h ---------------------------------*- C++- *-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+class AMDGPUPrepareAGPRAllocPass
+ : public PassInfoMixin<AMDGPUPrepareAGPRAllocPass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index 7a2a7fc250e2..f5e14c71b02d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -88,7 +88,7 @@ void AMDGPUPrintfRuntimeBindingImpl::getConversionSpecifiers(
// are %p and %s, which use to know if we
// are either storing a literal string or a
// pointer to the printf buffer.
- static const char ConvSpecifiers[] = "cdieEfgGaosuxXp";
+ static const char ConvSpecifiers[] = "cdieEfFgGaAosuxXp";
size_t CurFmtSpecifierIdx = 0;
size_t PrevFmtSpecifierIdx = 0;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 6a59a28b1d32..411159c8aa33 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -23,7 +23,6 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/Support/AMDGPUAddrSpace.h"
#define DEBUG_TYPE "amdgpu-regbanklegalize"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 1483d97d23fc..bf2f37bddb9e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4546,6 +4546,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_rsq_clamp:
+ case Intrinsic::amdgcn_tanh:
case Intrinsic::amdgcn_fmul_legacy:
case Intrinsic::amdgcn_fma_legacy:
case Intrinsic::amdgcn_frexp_mant:
@@ -4557,6 +4558,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_pk_u16:
case Intrinsic::amdgcn_cvt_pk_f16_fp8:
case Intrinsic::amdgcn_cvt_pk_f16_bf8:
+ case Intrinsic::amdgcn_sat_pk4_i4_i8:
+ case Intrinsic::amdgcn_sat_pk4_u4_u8:
case Intrinsic::amdgcn_fmed3:
case Intrinsic::amdgcn_cubeid:
case Intrinsic::amdgcn_cubema:
@@ -4688,6 +4691,44 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x4_f32:
+ case Intrinsic::amdgcn_wmma_f32_16x16x32_bf16:
+ case Intrinsic::amdgcn_wmma_f32_16x16x32_f16:
+ case Intrinsic::amdgcn_wmma_f16_16x16x32_f16:
+ case Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16:
+ case Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16:
+ case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8:
+ case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8:
+ case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8:
+ case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8:
+ case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8:
+ case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8:
+ case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8:
+ case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8:
+ case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8:
+ case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8:
+ case Intrinsic::amdgcn_wmma_f32_32x16x128_f4:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
+ case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
+ case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 46027b889023..8101c6898624 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -167,77 +167,39 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
Info.UsesVCC =
MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
+ Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass,
+ /*IncludeCalls=*/false);
+ if (ST.hasMAIInsts())
+ Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass,
+ /*IncludeCalls=*/false);
// If there are no calls, MachineRegisterInfo can tell us the used register
// count easily.
// A tail call isn't considered a call for MachineFrameInfo's purposes.
if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
- Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
- Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
- if (ST.hasMAIInsts())
- Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
+ Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass,
+ /*IncludeCalls=*/false);
return Info;
}
int32_t MaxVGPR = -1;
- int32_t MaxAGPR = -1;
- int32_t MaxSGPR = -1;
Info.CalleeSegmentSize = 0;
for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
- // TODO: Check regmasks? Do they occur anywhere except calls?
- for (const MachineOperand &MO : MI.operands()) {
- unsigned Width = 0;
- bool IsSGPR = false;
- bool IsAGPR = false;
+ for (unsigned I = 0; I < MI.getNumOperands(); ++I) {
+ const MachineOperand &MO = MI.getOperand(I);
if (!MO.isReg())
continue;
Register Reg = MO.getReg();
switch (Reg) {
- case AMDGPU::EXEC:
- case AMDGPU::EXEC_LO:
- case AMDGPU::EXEC_HI:
- case AMDGPU::SCC:
- case AMDGPU::M0:
- case AMDGPU::M0_LO16:
- case AMDGPU::M0_HI16:
- case AMDGPU::SRC_SHARED_BASE_LO:
- case AMDGPU::SRC_SHARED_BASE:
- case AMDGPU::SRC_SHARED_LIMIT_LO:
- case AMDGPU::SRC_SHARED_LIMIT:
- case AMDGPU::SRC_PRIVATE_BASE_LO:
- case AMDGPU::SRC_PRIVATE_BASE:
- case AMDGPU::SRC_PRIVATE_LIMIT_LO:
- case AMDGPU::SRC_PRIVATE_LIMIT:
- case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
- case AMDGPU::SGPR_NULL:
- case AMDGPU::SGPR_NULL64:
- case AMDGPU::MODE:
- continue;
-
case AMDGPU::NoRegister:
assert(MI.isDebugInstr() &&
"Instruction uses invalid noreg register");
continue;
- case AMDGPU::VCC:
- case AMDGPU::VCC_LO:
- case AMDGPU::VCC_HI:
- case AMDGPU::VCC_LO_LO16:
- case AMDGPU::VCC_LO_HI16:
- case AMDGPU::VCC_HI_LO16:
- case AMDGPU::VCC_HI_HI16:
- Info.UsesVCC = true;
- continue;
-
- case AMDGPU::FLAT_SCR:
- case AMDGPU::FLAT_SCR_LO:
- case AMDGPU::FLAT_SCR_HI:
- continue;
-
case AMDGPU::XNACK_MASK:
case AMDGPU::XNACK_MASK_LO:
case AMDGPU::XNACK_MASK_HI:
@@ -267,170 +229,22 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
break;
}
- if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
- AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
- AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 1;
- } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
- AMDGPU::VGPR_16RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 1;
- } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
- AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 1;
- } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 2;
- } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 2;
- } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 2;
- } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 3;
- } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 3;
- } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 3;
- } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 4;
- } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 4;
- } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 4;
- } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 5;
- } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 5;
- } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 5;
- } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 6;
- } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 6;
- } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 6;
- } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 7;
- } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 7;
- } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 7;
- } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 8;
- } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 8;
- } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 8;
- } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 9;
- } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 9;
- } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 9;
- } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 10;
- } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 10;
- } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 10;
- } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 11;
- } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 11;
- } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 11;
- } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 12;
- } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 12;
- } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 12;
- } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 16;
- } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 16;
- } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 16;
- } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 32;
- } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 32;
- } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 32;
- } else {
- // We only expect TTMP registers or registers that do not belong to
- // any RC.
- assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
- AMDGPU::TTMP_64RegClass.contains(Reg) ||
- AMDGPU::TTMP_128RegClass.contains(Reg) ||
- AMDGPU::TTMP_256RegClass.contains(Reg) ||
- AMDGPU::TTMP_512RegClass.contains(Reg) ||
- !TRI.getPhysRegBaseClass(Reg)) &&
- "Unknown register class");
- }
+ const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg);
+ assert((!RC || TRI.isVGPRClass(RC) || TRI.isSGPRClass(RC) ||
+ TRI.isAGPRClass(RC) || AMDGPU::TTMP_32RegClass.contains(Reg) ||
+ AMDGPU::TTMP_64RegClass.contains(Reg) ||
+ AMDGPU::TTMP_128RegClass.contains(Reg) ||
+ AMDGPU::TTMP_256RegClass.contains(Reg) ||
+ AMDGPU::TTMP_512RegClass.contains(Reg)) &&
+ "Unknown register class");
+
+ if (!RC || !TRI.isVGPRClass(RC))
+ continue;
+
+ unsigned Width = divideCeil(TRI.getRegSizeInBits(*RC), 32);
unsigned HWReg = TRI.getHWRegIndex(Reg);
int MaxUsed = HWReg + Width - 1;
- if (IsSGPR) {
- MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
- } else if (IsAGPR) {
- MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
- } else {
- MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
- }
+ MaxVGPR = std::max(MaxUsed, MaxVGPR);
}
if (MI.isCall()) {
@@ -492,9 +306,7 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
}
}
- Info.NumExplicitSGPR = MaxSGPR + 1;
Info.NumVGPR = MaxVGPR + 1;
- Info.NumAGPR = MaxAGPR + 1;
return Info;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 1f6002a3c6a2..dfe0cbf18c47 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -341,6 +341,10 @@ foreach intr = AMDGPUWMMAIntrinsicsGFX11 in
def : SourceOfDivergence<intr>;
foreach intr = AMDGPUWMMAIntrinsicsGFX12 in
def : SourceOfDivergence<intr>;
+foreach intr = AMDGPUWMMAIntrinsicsGFX1250 in
+def : SourceOfDivergence<intr>;
+foreach intr = AMDGPUSWMMACIntrinsicsGFX1250 in
+def : SourceOfDivergence<intr>;
def : SourceOfDivergence<int_amdgcn_global_load_tr_b64>;
def : SourceOfDivergence<int_amdgcn_global_load_tr_b128>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 7c24f428d78e..1e44be8e4720 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -59,6 +59,7 @@ protected:
bool HasCvtPkF16F32Inst = false;
bool HasF32ToF16BF16ConversionSRInsts = false;
bool EnableRealTrue16Insts = false;
+ bool HasBF16TransInsts = false;
bool HasBF16ConversionInsts = false;
bool HasMadMixInsts = false;
bool HasMadMacF32Insts = false;
@@ -202,6 +203,8 @@ public:
// supported and the support for fake True16 instructions is removed.
bool useRealTrue16Insts() const;
+ bool hasBF16TransInsts() const { return HasBF16TransInsts; }
+
bool hasBF16ConversionInsts() const {
return HasBF16ConversionInsts;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f4dc4a483181..c865082a1dce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -25,6 +25,7 @@
#include "AMDGPUMacroFusion.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPUPreloadKernArgProlog.h"
+#include "AMDGPUPrepareAGPRAlloc.h"
#include "AMDGPURemoveIncompatibleFunctions.h"
#include "AMDGPUReserveWWMRegs.h"
#include "AMDGPUResourceUsageAnalysis.h"
@@ -499,6 +500,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeGlobalISel(*PR);
initializeAMDGPUAsmPrinterPass(*PR);
initializeAMDGPUDAGToDAGISelLegacyPass(*PR);
+ initializeAMDGPUPrepareAGPRAllocLegacyPass(*PR);
initializeGCNDPPCombineLegacyPass(*PR);
initializeSILowerI1CopiesLegacyPass(*PR);
initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
@@ -1196,6 +1198,7 @@ public:
bool addRegBankSelect() override;
void addPreGlobalInstructionSelect() override;
bool addGlobalInstructionSelect() override;
+ void addPreRegAlloc() override;
void addFastRegAlloc() override;
void addOptimizedRegAlloc() override;
@@ -1539,6 +1542,11 @@ void GCNPassConfig::addFastRegAlloc() {
TargetPassConfig::addFastRegAlloc();
}
+void GCNPassConfig::addPreRegAlloc() {
+ if (getOptLevel() != CodeGenOptLevel::None)
+ addPass(&AMDGPUPrepareAGPRAllocLegacyID);
+}
+
void GCNPassConfig::addOptimizedRegAlloc() {
if (EnableDCEInRA)
insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
@@ -2235,6 +2243,11 @@ void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
Base::addOptimizedRegAlloc(addPass);
}
+void AMDGPUCodeGenPassBuilder::addPreRegAlloc(AddMachinePass &addPass) const {
+ if (getOptLevel() != CodeGenOptLevel::None)
+ addPass(AMDGPUPrepareAGPRAllocPass());
+}
+
Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
AddMachinePass &addPass) const {
// TODO: Check --regalloc-npm option
@@ -2284,6 +2297,12 @@ void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const {
Base::addPostRegAlloc(addPass);
}
+void AMDGPUCodeGenPassBuilder::addPreSched2(AddMachinePass &addPass) const {
+ if (TM.getOptLevel() > CodeGenOptLevel::None)
+ addPass(SIShrinkInstructionsPass());
+ addPass(SIPostRABundlerPass());
+}
+
void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) {
addPass(GCNCreateVOPDPass());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 3c62cd19c6e5..e0f1296ddded 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -181,8 +181,11 @@ public:
void addMachineSSAOptimization(AddMachinePass &) const;
void addPostRegAlloc(AddMachinePass &) const;
void addPreEmitPass(AddMachinePass &) const;
+ void addPreEmitRegAlloc(AddMachinePass &) const;
Error addRegAssignmentOptimized(AddMachinePass &) const;
+ void addPreRegAlloc(AddMachinePass &) const;
void addOptimizedRegAlloc(AddMachinePass &) const;
+ void addPreSched2(AddMachinePass &) const;
/// Check if a pass is enabled given \p Opt option. The option always
/// overrides defaults if explicitly used. Otherwise its default will be used
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 6439230b8769..43d4e8db791b 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -157,6 +157,7 @@ public:
ImmTyNegHi,
ImmTyIndexKey8bit,
ImmTyIndexKey16bit,
+ ImmTyIndexKey32bit,
ImmTyDPP8,
ImmTyDppCtrl,
ImmTyDppRowMask,
@@ -174,8 +175,10 @@ public:
ImmTyWaitEXP,
ImmTyWaitVAVDst,
ImmTyWaitVMVSrc,
- ImmTyByteSel,
ImmTyBitOp3,
+ ImmTyMatrixAReuse,
+ ImmTyMatrixBReuse,
+ ImmTyByteSel,
};
// Immediate operand kind.
@@ -419,6 +422,9 @@ public:
bool isCPol() const { return isImmTy(ImmTyCPol); }
bool isIndexKey8bit() const { return isImmTy(ImmTyIndexKey8bit); }
bool isIndexKey16bit() const { return isImmTy(ImmTyIndexKey16bit); }
+ bool isIndexKey32bit() const { return isImmTy(ImmTyIndexKey32bit); }
+ bool isMatrixAReuse() const { return isImmTy(ImmTyMatrixAReuse); }
+ bool isMatrixBReuse() const { return isImmTy(ImmTyMatrixBReuse); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<7>(getImm()); }
bool isDppFI() const { return isImmTy(ImmTyDppFI); }
@@ -747,6 +753,10 @@ public:
return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f64);
}
+ bool isVISrc_512_f64() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::f64);
+ }
+
bool isVISrc_128B16() const {
return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::i16);
}
@@ -1116,6 +1126,7 @@ public:
case ImmTyCPol: OS << "CPol"; break;
case ImmTyIndexKey8bit: OS << "index_key"; break;
case ImmTyIndexKey16bit: OS << "index_key"; break;
+ case ImmTyIndexKey32bit: OS << "index_key"; break;
case ImmTyTFE: OS << "TFE"; break;
case ImmTyD16: OS << "D16"; break;
case ImmTyFORMAT: OS << "FORMAT"; break;
@@ -1162,8 +1173,10 @@ public:
case ImmTyWaitEXP: OS << "WaitEXP"; break;
case ImmTyWaitVAVDst: OS << "WaitVAVDst"; break;
case ImmTyWaitVMVSrc: OS << "WaitVMVSrc"; break;
- case ImmTyByteSel: OS << "ByteSel" ; break;
case ImmTyBitOp3: OS << "BitOp3"; break;
+ case ImmTyMatrixAReuse: OS << "ImmTyMatrixAReuse"; break;
+ case ImmTyMatrixBReuse: OS << "ImmTyMatrixBReuse"; break;
+ case ImmTyByteSel: OS << "ByteSel" ; break;
}
// clang-format on
}
@@ -1700,6 +1713,7 @@ public:
AMDGPUOperand::ImmTy ImmTy);
ParseStatus parseIndexKey8bit(OperandVector &Operands);
ParseStatus parseIndexKey16bit(OperandVector &Operands);
+ ParseStatus parseIndexKey32bit(OperandVector &Operands);
ParseStatus parseDfmtNfmt(int64_t &Format);
ParseStatus parseUfmt(int64_t &Format);
@@ -3981,8 +3995,8 @@ bool AMDGPUAsmParser::validateVOPD(const MCInst &Inst,
bool AsVOPD3 = MII.get(Opcode).TSFlags & SIInstrFlags::VOPD3;
if (AsVOPD3) {
- for (unsigned I = 0, E = Operands.size(); I != E; ++I) {
- AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ for (const std::unique_ptr<MCParsedAsmOperand> &Operand : Operands) {
+ AMDGPUOperand &Op = (AMDGPUOperand &)*Operand;
if ((Op.isRegKind() || Op.isImmTy(AMDGPUOperand::ImmTyNone)) &&
(Op.getModifiers().getFPModifiersOperand() & SISrcMods::ABS))
Error(Op.getStartLoc(), "ABS not allowed in VOPD3 instructions");
@@ -7153,7 +7167,9 @@ ParseStatus AMDGPUAsmParser::tryParseIndexKey(OperandVector &Operands,
if (!Res.isSuccess())
return Res;
- if (ImmTy == AMDGPUOperand::ImmTyIndexKey16bit && (ImmVal < 0 || ImmVal > 1))
+ if ((ImmTy == AMDGPUOperand::ImmTyIndexKey16bit ||
+ ImmTy == AMDGPUOperand::ImmTyIndexKey32bit) &&
+ (ImmVal < 0 || ImmVal > 1))
return Error(Loc, Twine("out of range ", StringRef(Pref)));
if (ImmTy == AMDGPUOperand::ImmTyIndexKey8bit && (ImmVal < 0 || ImmVal > 3))
@@ -7171,6 +7187,10 @@ ParseStatus AMDGPUAsmParser::parseIndexKey16bit(OperandVector &Operands) {
return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey16bit);
}
+ParseStatus AMDGPUAsmParser::parseIndexKey32bit(OperandVector &Operands) {
+ return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey32bit);
+}
+
// dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their
// values to live in a joint format operand in the MCInst encoding.
ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) {
@@ -9272,6 +9292,14 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
DefaultVal);
}
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::matrix_a_reuse))
+ addOptionalImmOperand(Inst, Operands, OptIdx,
+ AMDGPUOperand::ImmTyMatrixAReuse, 0);
+
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::matrix_b_reuse))
+ addOptionalImmOperand(Inst, Operands, OptIdx,
+ AMDGPUOperand::ImmTyMatrixBReuse, 0);
+
int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo);
if (NegLoIdx != -1)
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo);
@@ -9378,6 +9406,10 @@ void AMDGPUAsmParser::cvtSWMMAC(MCInst &Inst, const OperandVector &Operands) {
addOptionalImmOperand(Inst, Operands, OptIdx,
AMDGPUOperand::ImmTyIndexKey16bit);
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::index_key_32bit))
+ addOptionalImmOperand(Inst, Operands, OptIdx,
+ AMDGPUOperand::ImmTyIndexKey32bit);
+
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyClamp);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index e3519f192137..42edec0d0149 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -74,6 +74,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULowerKernelArguments.cpp
AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp
+ AMDGPUPrepareAGPRAlloc.cpp
AMDGPUSwLowerLDS.cpp
AMDGPUMachineFunction.cpp
AMDGPUMachineModuleInfo.cpp
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 3625db9a4791..c8a4e22ed1da 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -200,6 +200,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
let Inst{95-72} = !if(ps.has_offset, offset, ?);
}
+// TODO: Rename to FlatSaddrTable, it now handles both global and flat GVS addressing mode.
class GlobalSaddrTable <bit is_saddr, string Name = ""> {
bit IsSaddr = is_saddr;
string SaddrOp = Name;
@@ -237,10 +238,18 @@ class FLAT_Load_Pseudo<
let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
}
-multiclass FLAT_Load_Pseudo_t16<string opName> {
- def "" : FLAT_Load_Pseudo<opName, VGPR_32, 1>;
+multiclass FLAT_Flat_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
+ def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput>,
+ GlobalSaddrTable<0, opName>;
+ let OtherPredicates = [HasFlatGVSMode] in
+ def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>,
+ GlobalSaddrTable<1, opName>;
+}
+
+multiclass FLAT_Flat_Load_Pseudo_t16<string opName> {
+ defm "" : FLAT_Flat_Load_Pseudo<opName, VGPR_32, 1>;
let True16Predicate = UseRealTrue16Insts in
- def _t16 : FLAT_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>;
+ defm _t16 : FLAT_Flat_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>;
}
class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
@@ -260,10 +269,26 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
let enabled_saddr = EnableSaddr;
}
-multiclass FLAT_Store_Pseudo_t16<string opName> {
- def "" : FLAT_Store_Pseudo<opName, VGPR_32>;
- let OtherPredicates = [HasTrue16BitInsts] in
- def _t16 : FLAT_Store_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_D16_HI", NAME>;
+multiclass FLAT_Flat_Store_Pseudo<string opName, RegisterClass regClass> {
+ def "" : FLAT_Store_Pseudo<opName, regClass>,
+ GlobalSaddrTable<0, opName>;
+ let OtherPredicates = [HasFlatGVSMode] in
+ def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>,
+ GlobalSaddrTable<1, opName>;
+}
+
+multiclass FLAT_Flat_Store_Pseudo_t16<string opName> {
+ defm "" : FLAT_Flat_Store_Pseudo<opName, VGPR_32>;
+
+ defvar Name16 = opName#"_t16";
+ let OtherPredicates = [HasFlatGVSMode, HasTrue16BitInsts] in {
+ def _t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1>,
+ GlobalSaddrTable<0, Name16>,
+ True16D16Table<NAME#"_D16_HI", NAME>;
+ def _SADDR_t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1, 1>,
+ GlobalSaddrTable<1, Name16>,
+ True16D16Table<NAME#"_D16_HI_SADDR", NAME#"_SADDR">;
+ }
}
multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
@@ -657,6 +682,18 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN<
let FPAtomic = data_vt.isFP;
let AddedComplexity = -1; // Prefer global atomics if available
}
+
+ def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
+ (outs),
+ (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_0:$cpol),
+ " $vaddr, $vdata, $saddr$offset$cpol">,
+ GlobalSaddrTable<1, opName> {
+ let OtherPredicates = [HasFlatGVSMode];
+ let has_saddr = 1;
+ let enabled_saddr = 1;
+ let FPAtomic = data_vt.isFP;
+ let AddedComplexity = -1; // Prefer global atomics if available
+ }
}
multiclass FLAT_Atomic_Pseudo_RTN<
@@ -665,15 +702,29 @@ multiclass FLAT_Atomic_Pseudo_RTN<
ValueType vt,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
+ RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret,
+ RegisterOperand vdst_op = getLdStRegisterOperand<vdst_rc>.ret> {
def _RTN : FLAT_AtomicRet_Pseudo <opName,
- (outs getLdStRegisterOperand<vdst_rc>.ret:$vdst),
+ (outs vdst_op:$vdst),
(ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
" $vdst, $vaddr, $vdata$offset$cpol">,
GlobalSaddrTable<0, opName#"_rtn"> {
let FPAtomic = data_vt.isFP;
let AddedComplexity = -1; // Prefer global atomics if available
}
+
+ def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
+ (outs vdst_op:$vdst),
+ (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol),
+ " $vdst, $vaddr, $vdata, $saddr$offset$cpol">,
+ GlobalSaddrTable<1, opName#"_rtn"> {
+ let OtherPredicates = [HasFlatGVSMode];
+ let has_saddr = 1;
+ let enabled_saddr = 1;
+ let PseudoInstr = NAME#"_SADDR_RTN";
+ let FPAtomic = data_vt.isFP;
+ let AddedComplexity = -1; // Prefer global atomics if available
+ }
}
multiclass FLAT_Atomic_Pseudo<
@@ -762,36 +813,36 @@ multiclass FLAT_Global_Atomic_Pseudo<
// Flat Instructions
//===----------------------------------------------------------------------===//
-def FLAT_LOAD_UBYTE : FLAT_Load_Pseudo <"flat_load_ubyte", VGPR_32>;
-def FLAT_LOAD_SBYTE : FLAT_Load_Pseudo <"flat_load_sbyte", VGPR_32>;
-def FLAT_LOAD_USHORT : FLAT_Load_Pseudo <"flat_load_ushort", VGPR_32>;
-def FLAT_LOAD_SSHORT : FLAT_Load_Pseudo <"flat_load_sshort", VGPR_32>;
-def FLAT_LOAD_DWORD : FLAT_Load_Pseudo <"flat_load_dword", VGPR_32>;
-def FLAT_LOAD_DWORDX2 : FLAT_Load_Pseudo <"flat_load_dwordx2", VReg_64>;
-def FLAT_LOAD_DWORDX4 : FLAT_Load_Pseudo <"flat_load_dwordx4", VReg_128>;
-def FLAT_LOAD_DWORDX3 : FLAT_Load_Pseudo <"flat_load_dwordx3", VReg_96>;
+defm FLAT_LOAD_UBYTE : FLAT_Flat_Load_Pseudo <"flat_load_ubyte", VGPR_32>;
+defm FLAT_LOAD_SBYTE : FLAT_Flat_Load_Pseudo <"flat_load_sbyte", VGPR_32>;
+defm FLAT_LOAD_USHORT : FLAT_Flat_Load_Pseudo <"flat_load_ushort", VGPR_32>;
+defm FLAT_LOAD_SSHORT : FLAT_Flat_Load_Pseudo <"flat_load_sshort", VGPR_32>;
+defm FLAT_LOAD_DWORD : FLAT_Flat_Load_Pseudo <"flat_load_dword", VGPR_32>;
+defm FLAT_LOAD_DWORDX2 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx2", VReg_64>;
+defm FLAT_LOAD_DWORDX4 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx4", VReg_128>;
+defm FLAT_LOAD_DWORDX3 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx3", VReg_96>;
-def FLAT_STORE_DWORD : FLAT_Store_Pseudo <"flat_store_dword", VGPR_32>;
-def FLAT_STORE_DWORDX2 : FLAT_Store_Pseudo <"flat_store_dwordx2", VReg_64>;
-def FLAT_STORE_DWORDX4 : FLAT_Store_Pseudo <"flat_store_dwordx4", VReg_128>;
-def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>;
+defm FLAT_STORE_DWORD : FLAT_Flat_Store_Pseudo <"flat_store_dword", VGPR_32>;
+defm FLAT_STORE_DWORDX2 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx2", VReg_64>;
+defm FLAT_STORE_DWORDX4 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx4", VReg_128>;
+defm FLAT_STORE_DWORDX3 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx3", VReg_96>;
let SubtargetPredicate = HasD16LoadStore in {
let TiedSourceNotRead = 1 in {
-def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>;
-defm FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_ubyte_d16">;
-def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>;
-defm FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_sbyte_d16">;
-def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>;
-defm FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo_t16 <"flat_load_short_d16">;
+defm FLAT_LOAD_UBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>;
+defm FLAT_LOAD_UBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_ubyte_d16">;
+defm FLAT_LOAD_SBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>;
+defm FLAT_LOAD_SBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_sbyte_d16">;
+defm FLAT_LOAD_SHORT_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>;
+defm FLAT_LOAD_SHORT_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_short_d16">;
}
-def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>;
-def FLAT_STORE_SHORT_D16_HI : FLAT_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>;
+defm FLAT_STORE_BYTE_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>;
+defm FLAT_STORE_SHORT_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>;
}
-defm FLAT_STORE_BYTE : FLAT_Store_Pseudo_t16 <"flat_store_byte">;
-defm FLAT_STORE_SHORT : FLAT_Store_Pseudo_t16 <"flat_store_short">;
+defm FLAT_STORE_BYTE : FLAT_Flat_Store_Pseudo_t16 <"flat_store_byte">;
+defm FLAT_STORE_SHORT : FLAT_Flat_Store_Pseudo_t16 <"flat_store_short">;
defm FLAT_ATOMIC_CMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap",
VGPR_32, i32, v2i32, VReg_64>;
@@ -1200,6 +1251,16 @@ class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueTyp
(inst $saddr, $voffset, $offset, 0, $in)
>;
+class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)),
+ (inst $saddr, $voffset, $offset, (i32 0), $in)
+>;
+
+class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))),
+ (inst $saddr, $voffset, $offset, (i32 0))
+>;
+
class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))),
(inst $saddr, $voffset, $offset, (i32 0))
@@ -1210,13 +1271,13 @@ class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
(inst $vaddr, $offset)
>;
-class GlobalLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))),
(inst $saddr, $voffset, $offset, 0)
>;
-class GlobalStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
- ValueType vt> : GCNPat <
+class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
+ ValueType vt> : GCNPat <
(node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset)),
(inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
>;
@@ -1394,7 +1455,7 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp
let AddedComplexity = 10;
}
- def : GlobalLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 11;
}
}
@@ -1404,7 +1465,7 @@ multiclass GlobalFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, Valu
let AddedComplexity = 10;
}
- def : GlobalLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ def : FlatLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 11;
}
}
@@ -1425,7 +1486,7 @@ multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
let AddedComplexity = 10;
}
- def : GlobalStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 11;
}
}
@@ -1435,7 +1496,7 @@ multiclass GlobalFLATStorePats_D16_t16<string inst, SDPatternOperator node, Valu
let AddedComplexity = 10;
}
- def : GlobalStoreSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_t16"), node, vt> {
+ def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_t16"), node, vt> {
let AddedComplexity = 11;
}
}
@@ -1568,80 +1629,129 @@ multiclass ScratchFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Valu
}
}
+multiclass FlatLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadPat <inst, node, vt>;
+
+ def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
+}
+
+multiclass FlatLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadPat_D16 <inst, node, vt>;
+
+ def : FlatLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
+}
+
+multiclass FlatLoadPats_D16_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadPat_D16_t16 <inst, node, vt>;
+
+ def : FlatLoadSaddrPat_D16_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
+}
+
+multiclass FlatStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatStorePat <inst, node, vt>;
+
+ def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
+}
+
+multiclass FlatStorePats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatStorePat <!cast<FLAT_Pseudo>(!cast<string>(inst)#"_t16"), node, vt>;
+
+ def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR_t16"), node, vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
+}
+
let OtherPredicates = [HasFlatAddressSpace] in {
-def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
+defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i16>;
+defm : FlatLoadPats <FLAT_LOAD_UBYTE, extloadi8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
let True16Predicate = p in {
- def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
- def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
- def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
- def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i16>;
- def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_USHORT, load_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
+ defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
+ defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>;
+ defm : FlatStorePats <FLAT_STORE_BYTE, atomic_store_8_flat, i16>;
+ defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;
}
let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in {
- def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>;
- def : FlatStorePat <FLAT_STORE_BYTE_t16, truncstorei8_flat, i16>;
- def : FlatStorePat <FLAT_STORE_SHORT_t16, store_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>;
+ defm : FlatStorePats_t16 <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
+ defm : FlatStorePats_t16 <FLAT_STORE_SHORT, store_flat, i16>;
def : FlatStorePat <FLAT_STORE_BYTE_t16, atomic_store_8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT_t16, atomic_store_16_flat, i16>;
} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts
-def : FlatLoadPat <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>;
+defm : FlatLoadPats <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, v2i32>;
-def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
-def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
+defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
+defm : FlatStorePats <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
foreach vt = Reg32Types.types in {
-def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, vt>;
-def : FlatStorePat <FLAT_STORE_DWORD, store_flat, vt>;
+defm : FlatLoadPats <FLAT_LOAD_DWORD, load_flat, vt>;
+defm : FlatStorePats <FLAT_STORE_DWORD, store_flat, vt>;
}
foreach vt = VReg_64.RegTypes in {
-def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, vt>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, vt>;
+defm : FlatStorePats <FLAT_STORE_DWORDX2, store_flat, vt>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX2, load_flat, vt>;
}
-def : FlatStorePat <FLAT_STORE_DWORDX3, store_flat, v3i32>;
+defm : FlatStorePats <FLAT_STORE_DWORDX3, store_flat, v3i32>;
foreach vt = VReg_128.RegTypes in {
-def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, vt>;
-def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, vt>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX4, load_flat, vt>;
+defm : FlatStorePats <FLAT_STORE_DWORDX4, store_flat, vt>;
}
-def : FlatStorePat <FLAT_STORE_DWORD, atomic_store_32_flat, i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>;
-def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i32>;
-def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i32>;
+defm : FlatStorePats <FLAT_STORE_DWORD, atomic_store_32_flat, i32>;
+defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>;
+defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, v2i32>;
+defm : FlatStorePats <FLAT_STORE_BYTE, atomic_store_8_flat, i32>;
+defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i32>;
+
foreach as = [ "flat", "global" ] in {
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>;
@@ -1684,6 +1794,9 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
} // end foreach as
+defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
+defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>;
+
let SubtargetPredicate = isGFX12Plus in {
defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32 >;
@@ -1692,25 +1805,25 @@ let SubtargetPredicate = isGFX12Plus in {
}
let OtherPredicates = [HasD16LoadStore] in {
-def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
-def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
+defm : FlatStorePats <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
+defm : FlatStorePats <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
}
let OtherPredicates = [D16PreservesUnusedBits] in {
// TODO: Handle atomic loads
-def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
}
} // End OtherPredicates = [HasFlatAddressSpace]
@@ -1782,6 +1895,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX4, store_global, vt>;
// appropriate waits.
defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORD, atomic_load_nonext_32_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, i64>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, v2i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>;
@@ -1821,6 +1935,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, v2i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>;
@@ -2832,14 +2947,7 @@ multiclass VFLAT_Real_Base_gfx12<bits<8> op,
VFLAT_Aliases_gfx12<name, alias>,
VFLAT_Real_gfx12<op, name>;
-multiclass VFLAT_Real_Atomics_gfx12<bits<8> op,
- string name = get_FLAT_ps<NAME>.Mnemonic,
- string alias = name> :
- VFLAT_Real_Base_gfx12<op, name, alias> {
- defm _RTN : VFLAT_Real_gfx12<op, name>;
-}
-
-multiclass VGLOBAL_Real_AllAddr_gfx12<bits<8> op,
+multiclass VFLAT_Real_AllAddr_gfx12<bits<8> op,
string name = get_FLAT_ps<NAME>.Mnemonic,
string alias = name> :
VFLAT_Real_Base_gfx12<op, name, alias> {
@@ -2853,7 +2961,7 @@ multiclass VGLOBAL_Real_AllAddr_gfx1200<bits<8> op> {
}
}
-multiclass VGLOBAL_Real_AllAddr_gfx12_w64<bits<8> op,
+multiclass VFLAT_Real_AllAddr_gfx12_w64<bits<8> op,
string name = get_FLAT_ps<NAME>.Mnemonic> :
VFLAT_Aliases_gfx12<name> {
let DecoderNamespace = "GFX12W64" in {
@@ -2862,10 +2970,10 @@ multiclass VGLOBAL_Real_AllAddr_gfx12_w64<bits<8> op,
}
}
-multiclass VGLOBAL_Real_Atomics_gfx12<bits<8> op,
+multiclass VFLAT_Real_Atomics_gfx12<bits<8> op,
string name = get_FLAT_ps<NAME>.Mnemonic,
string alias = name> :
- VGLOBAL_Real_AllAddr_gfx12<op, name, alias> {
+ VFLAT_Real_AllAddr_gfx12<op, name, alias> {
defm _RTN : VFLAT_Real_gfx12<op, name>;
defm _SADDR_RTN : VFLAT_Real_gfx12<op, name>;
}
@@ -2879,28 +2987,28 @@ multiclass VSCRATCH_Real_AllAddr_gfx12<bits<8> op,
}
// ENC_VFLAT.
-defm FLAT_LOAD_UBYTE : VFLAT_Real_Base_gfx12<0x010, "flat_load_u8">;
-defm FLAT_LOAD_SBYTE : VFLAT_Real_Base_gfx12<0x011, "flat_load_i8">;
-defm FLAT_LOAD_USHORT : VFLAT_Real_Base_gfx12<0x012, "flat_load_u16">;
-defm FLAT_LOAD_SSHORT : VFLAT_Real_Base_gfx12<0x013, "flat_load_i16">;
-defm FLAT_LOAD_DWORD : VFLAT_Real_Base_gfx12<0x014, "flat_load_b32">;
-defm FLAT_LOAD_DWORDX2 : VFLAT_Real_Base_gfx12<0x015, "flat_load_b64">;
-defm FLAT_LOAD_DWORDX3 : VFLAT_Real_Base_gfx12<0x016, "flat_load_b96">;
-defm FLAT_LOAD_DWORDX4 : VFLAT_Real_Base_gfx12<0x017, "flat_load_b128">;
-defm FLAT_STORE_BYTE : VFLAT_Real_Base_gfx12<0x018, "flat_store_b8">;
-defm FLAT_STORE_SHORT : VFLAT_Real_Base_gfx12<0x019, "flat_store_b16">;
-defm FLAT_STORE_DWORD : VFLAT_Real_Base_gfx12<0x01a, "flat_store_b32">;
-defm FLAT_STORE_DWORDX2 : VFLAT_Real_Base_gfx12<0x01b, "flat_store_b64">;
-defm FLAT_STORE_DWORDX3 : VFLAT_Real_Base_gfx12<0x01c, "flat_store_b96">;
-defm FLAT_STORE_DWORDX4 : VFLAT_Real_Base_gfx12<0x01d, "flat_store_b128">;
-defm FLAT_LOAD_UBYTE_D16 : VFLAT_Real_Base_gfx12<0x01e, "flat_load_d16_u8">;
-defm FLAT_LOAD_SBYTE_D16 : VFLAT_Real_Base_gfx12<0x01f, "flat_load_d16_i8">;
-defm FLAT_LOAD_SHORT_D16 : VFLAT_Real_Base_gfx12<0x020, "flat_load_d16_b16">;
-defm FLAT_LOAD_UBYTE_D16_HI : VFLAT_Real_Base_gfx12<0x021, "flat_load_d16_hi_u8">;
-defm FLAT_LOAD_SBYTE_D16_HI : VFLAT_Real_Base_gfx12<0x022, "flat_load_d16_hi_i8">;
-defm FLAT_LOAD_SHORT_D16_HI : VFLAT_Real_Base_gfx12<0x023, "flat_load_d16_hi_b16">;
-defm FLAT_STORE_BYTE_D16_HI : VFLAT_Real_Base_gfx12<0x024, "flat_store_d16_hi_b8">;
-defm FLAT_STORE_SHORT_D16_HI : VFLAT_Real_Base_gfx12<0x025, "flat_store_d16_hi_b16">;
+defm FLAT_LOAD_UBYTE : VFLAT_Real_AllAddr_gfx12<0x010, "flat_load_u8">;
+defm FLAT_LOAD_SBYTE : VFLAT_Real_AllAddr_gfx12<0x011, "flat_load_i8">;
+defm FLAT_LOAD_USHORT : VFLAT_Real_AllAddr_gfx12<0x012, "flat_load_u16">;
+defm FLAT_LOAD_SSHORT : VFLAT_Real_AllAddr_gfx12<0x013, "flat_load_i16">;
+defm FLAT_LOAD_DWORD : VFLAT_Real_AllAddr_gfx12<0x014, "flat_load_b32">;
+defm FLAT_LOAD_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x015, "flat_load_b64">;
+defm FLAT_LOAD_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x016, "flat_load_b96">;
+defm FLAT_LOAD_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x017, "flat_load_b128">;
+defm FLAT_STORE_BYTE : VFLAT_Real_AllAddr_gfx12<0x018, "flat_store_b8">;
+defm FLAT_STORE_SHORT : VFLAT_Real_AllAddr_gfx12<0x019, "flat_store_b16">;
+defm FLAT_STORE_DWORD : VFLAT_Real_AllAddr_gfx12<0x01a, "flat_store_b32">;
+defm FLAT_STORE_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x01b, "flat_store_b64">;
+defm FLAT_STORE_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x01c, "flat_store_b96">;
+defm FLAT_STORE_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x01d, "flat_store_b128">;
+defm FLAT_LOAD_UBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01e, "flat_load_d16_u8">;
+defm FLAT_LOAD_SBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01f, "flat_load_d16_i8">;
+defm FLAT_LOAD_SHORT_D16 : VFLAT_Real_AllAddr_gfx12<0x020, "flat_load_d16_b16">;
+defm FLAT_LOAD_UBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x021, "flat_load_d16_hi_u8">;
+defm FLAT_LOAD_SBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x022, "flat_load_d16_hi_i8">;
+defm FLAT_LOAD_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x023, "flat_load_d16_hi_b16">;
+defm FLAT_STORE_BYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x024, "flat_store_d16_hi_b8">;
+defm FLAT_STORE_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x025, "flat_store_d16_hi_b16">;
defm FLAT_ATOMIC_SWAP : VFLAT_Real_Atomics_gfx12<0x033, "flat_atomic_swap_b32">;
defm FLAT_ATOMIC_CMPSWAP : VFLAT_Real_Atomics_gfx12<0x034, "flat_atomic_cmpswap_b32">;
defm FLAT_ATOMIC_ADD : VFLAT_Real_Atomics_gfx12<0x035, "flat_atomic_add_u32">;
@@ -2936,74 +3044,74 @@ defm FLAT_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059>;
defm FLAT_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a>;
// ENC_VGLOBAL.
-defm GLOBAL_LOAD_UBYTE : VGLOBAL_Real_AllAddr_gfx12<0x010, "global_load_u8">;
-defm GLOBAL_LOAD_SBYTE : VGLOBAL_Real_AllAddr_gfx12<0x011, "global_load_i8">;
-defm GLOBAL_LOAD_USHORT : VGLOBAL_Real_AllAddr_gfx12<0x012, "global_load_u16">;
-defm GLOBAL_LOAD_SSHORT : VGLOBAL_Real_AllAddr_gfx12<0x013, "global_load_i16">;
-defm GLOBAL_LOAD_DWORD : VGLOBAL_Real_AllAddr_gfx12<0x014, "global_load_b32">;
-defm GLOBAL_LOAD_DWORDX2 : VGLOBAL_Real_AllAddr_gfx12<0x015, "global_load_b64">;
-defm GLOBAL_LOAD_DWORDX3 : VGLOBAL_Real_AllAddr_gfx12<0x016, "global_load_b96">;
-defm GLOBAL_LOAD_DWORDX4 : VGLOBAL_Real_AllAddr_gfx12<0x017, "global_load_b128">;
-defm GLOBAL_STORE_BYTE : VGLOBAL_Real_AllAddr_gfx12<0x018, "global_store_b8">;
-defm GLOBAL_STORE_SHORT : VGLOBAL_Real_AllAddr_gfx12<0x019, "global_store_b16">;
-defm GLOBAL_STORE_DWORD : VGLOBAL_Real_AllAddr_gfx12<0x01a, "global_store_b32">;
-defm GLOBAL_STORE_DWORDX2 : VGLOBAL_Real_AllAddr_gfx12<0x01b, "global_store_b64">;
-defm GLOBAL_STORE_DWORDX3 : VGLOBAL_Real_AllAddr_gfx12<0x01c, "global_store_b96">;
-defm GLOBAL_STORE_DWORDX4 : VGLOBAL_Real_AllAddr_gfx12<0x01d, "global_store_b128">;
-defm GLOBAL_LOAD_UBYTE_D16 : VGLOBAL_Real_AllAddr_gfx12<0x01e, "global_load_d16_u8">;
-defm GLOBAL_LOAD_SBYTE_D16 : VGLOBAL_Real_AllAddr_gfx12<0x01f, "global_load_d16_i8">;
-defm GLOBAL_LOAD_SHORT_D16 : VGLOBAL_Real_AllAddr_gfx12<0x020, "global_load_d16_b16">;
-defm GLOBAL_LOAD_UBYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x021, "global_load_d16_hi_u8">;
-defm GLOBAL_LOAD_SBYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x022, "global_load_d16_hi_i8">;
-defm GLOBAL_LOAD_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x023, "global_load_d16_hi_b16">;
-defm GLOBAL_STORE_BYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x024, "global_store_d16_hi_b8">;
-defm GLOBAL_STORE_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x025, "global_store_d16_hi_b16">;
-defm GLOBAL_LOAD_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">;
-defm GLOBAL_STORE_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">;
-defm GLOBAL_LOAD_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x053>;
-defm GLOBAL_STORE_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x054>;
-
-defm GLOBAL_ATOMIC_SWAP : VGLOBAL_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">;
-defm GLOBAL_ATOMIC_CMPSWAP : VGLOBAL_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">;
-defm GLOBAL_ATOMIC_ADD : VGLOBAL_Real_Atomics_gfx12<0x035, "global_atomic_add_u32">;
-defm GLOBAL_ATOMIC_SUB : VGLOBAL_Real_Atomics_gfx12<0x036, "global_atomic_sub_u32">;
-defm GLOBAL_ATOMIC_CSUB : VGLOBAL_Real_Atomics_gfx12<0x037, "global_atomic_sub_clamp_u32", "global_atomic_csub_u32">;
-defm GLOBAL_ATOMIC_SMIN : VGLOBAL_Real_Atomics_gfx12<0x038, "global_atomic_min_i32">;
-defm GLOBAL_ATOMIC_UMIN : VGLOBAL_Real_Atomics_gfx12<0x039, "global_atomic_min_u32">;
-defm GLOBAL_ATOMIC_SMAX : VGLOBAL_Real_Atomics_gfx12<0x03a, "global_atomic_max_i32">;
-defm GLOBAL_ATOMIC_UMAX : VGLOBAL_Real_Atomics_gfx12<0x03b, "global_atomic_max_u32">;
-defm GLOBAL_ATOMIC_AND : VGLOBAL_Real_Atomics_gfx12<0x03c, "global_atomic_and_b32">;
-defm GLOBAL_ATOMIC_OR : VGLOBAL_Real_Atomics_gfx12<0x03d, "global_atomic_or_b32">;
-defm GLOBAL_ATOMIC_XOR : VGLOBAL_Real_Atomics_gfx12<0x03e, "global_atomic_xor_b32">;
-defm GLOBAL_ATOMIC_INC : VGLOBAL_Real_Atomics_gfx12<0x03f, "global_atomic_inc_u32">;
-defm GLOBAL_ATOMIC_DEC : VGLOBAL_Real_Atomics_gfx12<0x040, "global_atomic_dec_u32">;
-defm GLOBAL_ATOMIC_SWAP_X2 : VGLOBAL_Real_Atomics_gfx12<0x041, "global_atomic_swap_b64">;
-defm GLOBAL_ATOMIC_CMPSWAP_X2 : VGLOBAL_Real_Atomics_gfx12<0x042, "global_atomic_cmpswap_b64">;
-defm GLOBAL_ATOMIC_ADD_X2 : VGLOBAL_Real_Atomics_gfx12<0x043, "global_atomic_add_u64">;
-defm GLOBAL_ATOMIC_SUB_X2 : VGLOBAL_Real_Atomics_gfx12<0x044, "global_atomic_sub_u64">;
-defm GLOBAL_ATOMIC_SMIN_X2 : VGLOBAL_Real_Atomics_gfx12<0x045, "global_atomic_min_i64">;
-defm GLOBAL_ATOMIC_UMIN_X2 : VGLOBAL_Real_Atomics_gfx12<0x046, "global_atomic_min_u64">;
-defm GLOBAL_ATOMIC_SMAX_X2 : VGLOBAL_Real_Atomics_gfx12<0x047, "global_atomic_max_i64">;
-defm GLOBAL_ATOMIC_UMAX_X2 : VGLOBAL_Real_Atomics_gfx12<0x048, "global_atomic_max_u64">;
-defm GLOBAL_ATOMIC_AND_X2 : VGLOBAL_Real_Atomics_gfx12<0x049, "global_atomic_and_b64">;
-defm GLOBAL_ATOMIC_OR_X2 : VGLOBAL_Real_Atomics_gfx12<0x04a, "global_atomic_or_b64">;
-defm GLOBAL_ATOMIC_XOR_X2 : VGLOBAL_Real_Atomics_gfx12<0x04b, "global_atomic_xor_b64">;
-defm GLOBAL_ATOMIC_INC_X2 : VGLOBAL_Real_Atomics_gfx12<0x04c, "global_atomic_inc_u64">;
-defm GLOBAL_ATOMIC_DEC_X2 : VGLOBAL_Real_Atomics_gfx12<0x04d, "global_atomic_dec_u64">;
-defm GLOBAL_ATOMIC_COND_SUB_U32 : VGLOBAL_Real_Atomics_gfx12<0x050>;
-defm GLOBAL_ATOMIC_FMIN : VGLOBAL_Real_Atomics_gfx12<0x051, "global_atomic_min_num_f32", "global_atomic_min_f32">;
-defm GLOBAL_ATOMIC_FMAX : VGLOBAL_Real_Atomics_gfx12<0x052, "global_atomic_max_num_f32", "global_atomic_max_f32">;
-defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056>;
+defm GLOBAL_LOAD_UBYTE : VFLAT_Real_AllAddr_gfx12<0x010, "global_load_u8">;
+defm GLOBAL_LOAD_SBYTE : VFLAT_Real_AllAddr_gfx12<0x011, "global_load_i8">;
+defm GLOBAL_LOAD_USHORT : VFLAT_Real_AllAddr_gfx12<0x012, "global_load_u16">;
+defm GLOBAL_LOAD_SSHORT : VFLAT_Real_AllAddr_gfx12<0x013, "global_load_i16">;
+defm GLOBAL_LOAD_DWORD : VFLAT_Real_AllAddr_gfx12<0x014, "global_load_b32">;
+defm GLOBAL_LOAD_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x015, "global_load_b64">;
+defm GLOBAL_LOAD_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x016, "global_load_b96">;
+defm GLOBAL_LOAD_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x017, "global_load_b128">;
+defm GLOBAL_STORE_BYTE : VFLAT_Real_AllAddr_gfx12<0x018, "global_store_b8">;
+defm GLOBAL_STORE_SHORT : VFLAT_Real_AllAddr_gfx12<0x019, "global_store_b16">;
+defm GLOBAL_STORE_DWORD : VFLAT_Real_AllAddr_gfx12<0x01a, "global_store_b32">;
+defm GLOBAL_STORE_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x01b, "global_store_b64">;
+defm GLOBAL_STORE_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x01c, "global_store_b96">;
+defm GLOBAL_STORE_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x01d, "global_store_b128">;
+defm GLOBAL_LOAD_UBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01e, "global_load_d16_u8">;
+defm GLOBAL_LOAD_SBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01f, "global_load_d16_i8">;
+defm GLOBAL_LOAD_SHORT_D16 : VFLAT_Real_AllAddr_gfx12<0x020, "global_load_d16_b16">;
+defm GLOBAL_LOAD_UBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x021, "global_load_d16_hi_u8">;
+defm GLOBAL_LOAD_SBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x022, "global_load_d16_hi_i8">;
+defm GLOBAL_LOAD_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x023, "global_load_d16_hi_b16">;
+defm GLOBAL_STORE_BYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x024, "global_store_d16_hi_b8">;
+defm GLOBAL_STORE_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x025, "global_store_d16_hi_b16">;
+defm GLOBAL_LOAD_DWORD_ADDTID : VFLAT_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">;
+defm GLOBAL_STORE_DWORD_ADDTID : VFLAT_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">;
+defm GLOBAL_LOAD_BLOCK : VFLAT_Real_AllAddr_gfx12<0x053>;
+defm GLOBAL_STORE_BLOCK : VFLAT_Real_AllAddr_gfx12<0x054>;
+
+defm GLOBAL_ATOMIC_SWAP : VFLAT_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">;
+defm GLOBAL_ATOMIC_CMPSWAP : VFLAT_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">;
+defm GLOBAL_ATOMIC_ADD : VFLAT_Real_Atomics_gfx12<0x035, "global_atomic_add_u32">;
+defm GLOBAL_ATOMIC_SUB : VFLAT_Real_Atomics_gfx12<0x036, "global_atomic_sub_u32">;
+defm GLOBAL_ATOMIC_CSUB : VFLAT_Real_Atomics_gfx12<0x037, "global_atomic_sub_clamp_u32", "global_atomic_csub_u32">;
+defm GLOBAL_ATOMIC_SMIN : VFLAT_Real_Atomics_gfx12<0x038, "global_atomic_min_i32">;
+defm GLOBAL_ATOMIC_UMIN : VFLAT_Real_Atomics_gfx12<0x039, "global_atomic_min_u32">;
+defm GLOBAL_ATOMIC_SMAX : VFLAT_Real_Atomics_gfx12<0x03a, "global_atomic_max_i32">;
+defm GLOBAL_ATOMIC_UMAX : VFLAT_Real_Atomics_gfx12<0x03b, "global_atomic_max_u32">;
+defm GLOBAL_ATOMIC_AND : VFLAT_Real_Atomics_gfx12<0x03c, "global_atomic_and_b32">;
+defm GLOBAL_ATOMIC_OR : VFLAT_Real_Atomics_gfx12<0x03d, "global_atomic_or_b32">;
+defm GLOBAL_ATOMIC_XOR : VFLAT_Real_Atomics_gfx12<0x03e, "global_atomic_xor_b32">;
+defm GLOBAL_ATOMIC_INC : VFLAT_Real_Atomics_gfx12<0x03f, "global_atomic_inc_u32">;
+defm GLOBAL_ATOMIC_DEC : VFLAT_Real_Atomics_gfx12<0x040, "global_atomic_dec_u32">;
+defm GLOBAL_ATOMIC_SWAP_X2 : VFLAT_Real_Atomics_gfx12<0x041, "global_atomic_swap_b64">;
+defm GLOBAL_ATOMIC_CMPSWAP_X2 : VFLAT_Real_Atomics_gfx12<0x042, "global_atomic_cmpswap_b64">;
+defm GLOBAL_ATOMIC_ADD_X2 : VFLAT_Real_Atomics_gfx12<0x043, "global_atomic_add_u64">;
+defm GLOBAL_ATOMIC_SUB_X2 : VFLAT_Real_Atomics_gfx12<0x044, "global_atomic_sub_u64">;
+defm GLOBAL_ATOMIC_SMIN_X2 : VFLAT_Real_Atomics_gfx12<0x045, "global_atomic_min_i64">;
+defm GLOBAL_ATOMIC_UMIN_X2 : VFLAT_Real_Atomics_gfx12<0x046, "global_atomic_min_u64">;
+defm GLOBAL_ATOMIC_SMAX_X2 : VFLAT_Real_Atomics_gfx12<0x047, "global_atomic_max_i64">;
+defm GLOBAL_ATOMIC_UMAX_X2 : VFLAT_Real_Atomics_gfx12<0x048, "global_atomic_max_u64">;
+defm GLOBAL_ATOMIC_AND_X2 : VFLAT_Real_Atomics_gfx12<0x049, "global_atomic_and_b64">;
+defm GLOBAL_ATOMIC_OR_X2 : VFLAT_Real_Atomics_gfx12<0x04a, "global_atomic_or_b64">;
+defm GLOBAL_ATOMIC_XOR_X2 : VFLAT_Real_Atomics_gfx12<0x04b, "global_atomic_xor_b64">;
+defm GLOBAL_ATOMIC_INC_X2 : VFLAT_Real_Atomics_gfx12<0x04c, "global_atomic_inc_u64">;
+defm GLOBAL_ATOMIC_DEC_X2 : VFLAT_Real_Atomics_gfx12<0x04d, "global_atomic_dec_u64">;
+defm GLOBAL_ATOMIC_COND_SUB_U32 : VFLAT_Real_Atomics_gfx12<0x050>;
+defm GLOBAL_ATOMIC_FMIN : VFLAT_Real_Atomics_gfx12<0x051, "global_atomic_min_num_f32", "global_atomic_min_f32">;
+defm GLOBAL_ATOMIC_FMAX : VFLAT_Real_Atomics_gfx12<0x052, "global_atomic_max_num_f32", "global_atomic_max_f32">;
+defm GLOBAL_ATOMIC_ADD_F32 : VFLAT_Real_Atomics_gfx12<0x056>;
defm GLOBAL_LOAD_TR_B128_w32 : VGLOBAL_Real_AllAddr_gfx1200<0x057>;
defm GLOBAL_LOAD_TR_B64_w32 : VGLOBAL_Real_AllAddr_gfx1200<0x058>;
-defm GLOBAL_LOAD_TR_B128_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x057>;
-defm GLOBAL_LOAD_TR_B64_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x058>;
+defm GLOBAL_LOAD_TR_B128_w64 : VFLAT_Real_AllAddr_gfx12_w64<0x057>;
+defm GLOBAL_LOAD_TR_B64_w64 : VFLAT_Real_AllAddr_gfx12_w64<0x058>;
-defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073>;
-defm GLOBAL_ATOMIC_PK_ADD_F16 : VGLOBAL_Real_Atomics_gfx12<0x059>;
-defm GLOBAL_ATOMIC_PK_ADD_BF16 : VGLOBAL_Real_Atomics_gfx12<0x05a>;
+defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VFLAT_Real_Atomics_gfx12<0x073>;
+defm GLOBAL_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059>;
+defm GLOBAL_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a>;
defm GLOBAL_INV : VFLAT_Real_Base_gfx12<0x02b>;
defm GLOBAL_WB : VFLAT_Real_Base_gfx12<0x02c>;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 0976fccf78d8..bbed828b4fed 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1189,6 +1189,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
}
fixVALUPartialForwardingHazard(MI);
fixVALUTransUseHazard(MI);
+ fixVALUTransCoexecutionHazards(MI);
fixWMMAHazards(MI);
fixShift64HighRegBug(MI);
fixVALUMaskWriteHazard(MI);
@@ -1809,6 +1810,51 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
return true;
}
+bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
+ if (!AMDGPU::isGFX1250(ST) || // Coexecution disabled.
+ !SIInstrInfo::isVALU(*MI) || SIInstrInfo::isTRANS(*MI))
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
+ if (!SIInstrInfo::isTRANS(I))
+ return false;
+
+ // RAW: Trans(I) writes, VALU(MI) reads.
+ Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
+ for (const MachineOperand &ValuUse : MI->explicit_uses()) {
+ if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
+ return true;
+ }
+
+ auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
+ if (!ValuDst || !ValuDst->isReg())
+ return false;
+
+ // WAR: Trans(I) reads, VALU(MI) writes.
+ Register ValuDef = ValuDst->getReg();
+ for (const MachineOperand &TransUse : I.explicit_uses()) {
+ if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
+ return true;
+ }
+
+ return false;
+ };
+
+ auto IsExpiredFn = [](const MachineInstr &I, int) {
+ return SIInstrInfo::isVALU(I);
+ };
+
+ const int HasVALU = std::numeric_limits<int>::max();
+ if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)
+ return false;
+
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
+ return true;
+}
+
bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
return false;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index bbc55851bf96..ef6ddd874f58 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -104,6 +104,7 @@ private:
bool fixLdsDirectVMEMHazard(MachineInstr *MI);
bool fixVALUPartialForwardingHazard(MachineInstr *MI);
bool fixVALUTransUseHazard(MachineInstr *MI);
+ bool fixVALUTransCoexecutionHazards(MachineInstr *MI);
bool fixWMMAHazards(MachineInstr *MI);
bool fixShift64HighRegBug(MachineInstr *MI);
bool fixVALUMaskWriteHazard(MachineInstr *MI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index fce8f36d4596..a6553083d722 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -803,7 +803,8 @@ void GCNScheduleDAGMILive::schedule() {
GCNRegPressure
GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
GCNDownwardRPTracker RPTracker(*LIS);
- RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
+ RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second,
+ &LiveIns[RegionIdx]);
return RPTracker.moveMaxPressure();
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index e6dd98a10420..268162bcada4 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -214,6 +214,7 @@ protected:
bool FlatInstOffsets = false;
bool FlatGlobalInsts = false;
bool FlatScratchInsts = false;
+ bool FlatGVSMode = false;
bool ScalarFlatScratchInsts = false;
bool HasArchitectedFlatScratch = false;
bool EnableFlatScratch = false;
@@ -233,6 +234,7 @@ protected:
bool HasRestrictedSOffset = false;
bool Has64BitLiterals = false;
bool HasBitOp3Insts = false;
+ bool HasTanhInsts = false;
bool HasTransposeLoadF4F6Insts = false;
bool HasPrngInst = false;
bool HasBVHDualAndBVH8Insts = false;
@@ -1156,10 +1158,12 @@ public:
bool hasMadF16() const;
- bool hasMovB64() const { return GFX940Insts; }
+ bool hasMovB64() const { return GFX940Insts || GFX1250Insts; }
bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; }
+ bool hasFlatGVSMode() const { return FlatGVSMode; }
+
bool enableSIScheduler() const {
return EnableSIScheduler;
}
@@ -1377,6 +1381,10 @@ public:
return HasMinimum3Maximum3F16;
}
+ bool hasTanhInsts() const { return HasTanhInsts; }
+
+ bool hasAddPC64Inst() const { return GFX1250Insts; }
+
bool hasMinimum3Maximum3PKF16() const {
return HasMinimum3Maximum3PKF16;
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index e7d0e1838fa6..2a920f6feb1c 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -108,7 +108,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
MCContext *Ctx) {
int64_t SignedValue = static_cast<int64_t>(Value);
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
case AMDGPU::fixup_si_sopp_br: {
int64_t BrImm = (SignedValue - 4) / 4;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 22ae5f4e7191..0d5a8be6220d 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -64,6 +64,8 @@ unsigned AMDGPUELFObjectWriter::getRelocType(const MCFixup &Fixup,
return ELF::R_AMDGPU_ABS32_LO;
case AMDGPUMCExpr::S_ABS32_HI:
return ELF::R_AMDGPU_ABS32_HI;
+ case AMDGPUMCExpr::S_ABS64:
+ return ELF::R_AMDGPU_ABS64;
}
MCFixupKind Kind = Fixup.getKind();
@@ -76,7 +78,7 @@ unsigned AMDGPUELFObjectWriter::getRelocType(const MCFixup &Fixup,
return IsPCRel ? ELF::R_AMDGPU_REL64 : ELF::R_AMDGPU_ABS64;
}
- if (Fixup.getTargetKind() == AMDGPU::fixup_si_sopp_br) {
+ if (Fixup.getKind() == AMDGPU::fixup_si_sopp_br) {
const auto *SymA = Target.getAddSym();
assert(SymA);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index cb6319ed627c..ec9248b972ec 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1332,6 +1332,16 @@ void AMDGPUInstPrinter::printIndexKey16bit(const MCInst *MI, unsigned OpNo,
O << " index_key:" << Imm;
}
+void AMDGPUInstPrinter::printIndexKey32bit(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ auto Imm = MI->getOperand(OpNo).getImm() & 0x7;
+ if (Imm == 0)
+ return;
+
+ O << " index_key:" << Imm;
+}
+
void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index fb803b1f8134..e3299a618e88 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -132,6 +132,8 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printIndexKey16bit(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printIndexKey32bit(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpSlot(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpAttr(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 31dd373e54fb..ffdac8b8ce32 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -25,6 +25,7 @@ const MCAsmInfo::AtSpecifier atSpecifiers[] = {
{AMDGPUMCExpr::S_REL64, "rel64"},
{AMDGPUMCExpr::S_ABS32_LO, "abs32@lo"},
{AMDGPUMCExpr::S_ABS32_HI, "abs32@hi"},
+ {AMDGPUMCExpr::S_ABS64, "abs64"},
};
AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index 4bb3942936f0..f48739fe0181 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -381,9 +381,11 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
// Set unused op_sel_hi bits to 1 for VOP3P and MAI instructions.
// Note that accvgpr_read/write are MAI, have src0, but do not use op_sel.
- if ((Desc.TSFlags & SIInstrFlags::VOP3P) ||
- Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi ||
- Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) {
+ if (((Desc.TSFlags & SIInstrFlags::VOP3P) ||
+ Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi ||
+ Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) &&
+ // Matrix B reuse operand reuses op_sel_hi.
+ !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_reuse)) {
Encoding |= getImplicitOpSelHiEncoding(Opcode);
}
@@ -562,7 +564,8 @@ static bool needsPCRel(const MCExpr *Expr) {
case MCExpr::SymbolRef: {
auto *SE = cast<MCSymbolRefExpr>(Expr);
auto Spec = AMDGPU::getSpecifier(SE);
- return Spec != AMDGPUMCExpr::S_ABS32_LO && Spec != AMDGPUMCExpr::S_ABS32_HI;
+ return Spec != AMDGPUMCExpr::S_ABS32_LO &&
+ Spec != AMDGPUMCExpr::S_ABS32_HI && Spec != AMDGPUMCExpr::S_ABS64;
}
case MCExpr::Binary: {
auto *BE = cast<MCBinaryExpr>(Expr);
@@ -685,7 +688,12 @@ void AMDGPUMCCodeEmitter::getMachineOpValueCommon(
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
uint32_t Offset = Desc.getSize();
assert(Offset == 4 || Offset == 8);
- addFixup(Fixups, Offset, MO.getExpr(), FK_Data_4, PCRel);
+ auto OpType = Desc.operands()[OpNo].OperandType;
+ MCFixupKind Kind = (STI.hasFeature(AMDGPU::Feature64BitLiterals) &&
+ OpType == AMDGPU::OPERAND_REG_IMM_INT64)
+ ? FK_Data_8
+ : FK_Data_4;
+ addFixup(Fixups, Offset, MO.getExpr(), Kind, PCRel);
}
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
index e1b9720cdbfc..bc6fdf7f2e4c 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@@ -50,6 +50,7 @@ public:
S_REL64, // symbol@rel64
S_ABS32_LO, // symbol@abs32@lo
S_ABS32_HI, // symbol@abs32@hi
+ S_ABS64, // symbol@abs64
};
private:
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 9b5a46395695..f018f77bc83e 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -378,6 +378,7 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
default:
return false;
case AMDGPU::V_MOV_B32_e32:
+ case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
SMovOp = AMDGPU::S_MOV_B32;
break;
case AMDGPU::V_MOV_B64_PSEUDO:
@@ -946,13 +947,18 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
// Copies and REG_SEQUENCE do not contribute to the final assembly
// So, skip them but take care of the SGPR to VGPR copies bookkeeping.
- if (Inst->isCopy() || Inst->isRegSequence()) {
- if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
- if (!Inst->isCopy() ||
- !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
- Info.NumSVCopies++;
- continue;
- }
+ if (Inst->isRegSequence() &&
+ TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
+ Info.NumSVCopies++;
+ continue;
+ }
+ if (Inst->isCopy()) {
+ const TargetRegisterClass *SrcRC, *DstRC;
+ std::tie(SrcRC, DstRC) = getCopyRegClasses(*Inst, *TRI, *MRI);
+ if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI) &&
+ !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
+ Info.NumSVCopies++;
+ continue;
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 0ed06c37507a..e172c0b63189 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1761,6 +1761,7 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
for (MachineInstr *Copy : CopiesToReplace)
Copy->addImplicitDefUseOperands(*MF);
+ SetVector<MachineInstr *> ConstantFoldCandidates;
for (FoldCandidate &Fold : FoldList) {
assert(!Fold.isReg() || Fold.Def.OpToFold);
if (Fold.isReg() && Fold.getReg().isVirtual()) {
@@ -1783,16 +1784,21 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
<< static_cast<int>(Fold.UseOpNo) << " of "
<< *Fold.UseMI);
- if (Fold.isImm() && tryConstantFoldOp(Fold.UseMI)) {
- LLVM_DEBUG(dbgs() << "Constant folded " << *Fold.UseMI);
- Changed = true;
- }
+ if (Fold.isImm())
+ ConstantFoldCandidates.insert(Fold.UseMI);
} else if (Fold.Commuted) {
// Restoring instruction's original operand order if fold has failed.
TII->commuteInstruction(*Fold.UseMI, false);
}
}
+
+ for (MachineInstr *MI : ConstantFoldCandidates) {
+ if (tryConstantFoldOp(MI)) {
+ LLVM_DEBUG(dbgs() << "Constant folded " << *MI);
+ Changed = true;
+ }
+ }
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e2a10be4c2c7..0c76ff2ec5ea 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -33,6 +33,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -46,6 +47,7 @@
#include <optional>
using namespace llvm;
+using namespace llvm::SDPatternMatch;
#define DEBUG_TYPE "si-lower"
@@ -938,6 +940,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
}
+ if (Subtarget->hasBF16TransInsts()) {
+ setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
+ }
+
if (Subtarget->hasCvtPkF16F32Inst()) {
setOperationAction(ISD::FP_ROUND,
{MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
@@ -3893,7 +3899,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// arguments to begin at SP+0. Completely unused for non-tail calls.
int32_t FPDiff = 0;
MachineFrameInfo &MFI = MF.getFrameInfo();
- auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+ auto *TRI = Subtarget->getRegisterInfo();
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
@@ -8162,6 +8168,14 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
// $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
// which is a 64-bit pc-relative offset from the encoding of the $symbol
// operand to the global variable.
+ if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
+ assert(GAFlags != SIInstrInfo::MO_NONE);
+
+ SDValue Ptr =
+ DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
+ return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
+ }
+
SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
SDValue PtrHi;
if (GAFlags == SIInstrInfo::MO_NONE)
@@ -8211,6 +8225,13 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
}
if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
+ if (Subtarget->has64BitLiterals()) {
+ SDValue Addr = DAG.getTargetGlobalAddress(
+ GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
+ return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
+ 0);
+ }
+
SDValue AddrLo = DAG.getTargetGlobalAddress(
GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
@@ -9289,7 +9310,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_reloc_constant: {
- Module *M = const_cast<Module *>(MF.getFunction().getParent());
+ Module *M = MF.getFunction().getParent();
const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
auto *RelocSymbol = cast<GlobalVariable>(
@@ -9315,6 +9336,44 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
Op.getOperand(3), IndexKeyi32);
}
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
+ if (Op.getOperand(4).getValueType() == MVT::i64)
+ return SDValue();
+
+ SDLoc SL(Op);
+ auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
+ {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
+ Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
+ Op.getOperand(6)});
+ }
+ case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
+ case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
+ case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
+ EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
+ ? MVT::i64
+ : MVT::i32;
+ if (Op.getOperand(6).getValueType() == IndexKeyTy)
+ return SDValue();
+
+ SDLoc SL(Op);
+ auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
+ {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
+ Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
+ IndexKey, Op.getOperand(7),
+ Op.getOperand(8)}); // No clamp operand
+ }
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
@@ -11074,7 +11133,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
assert(VT.getSizeInBits() == 64);
SDLoc DL(Op);
- SDValue Cond = Op.getOperand(0);
+ SDValue Cond = DAG.getFreeze(Op.getOperand(0));
SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
SDValue One = DAG.getConstant(1, DL, MVT::i32);
@@ -12155,6 +12214,11 @@ SDValue SITargetLowering::splitBinaryBitConstantOp(
if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
bitOpWithConstantIsReducible(Opc, ValHi)) ||
(CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
+ // We have 64-bit scalar and/or/xor, but do not have vector forms.
+ if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
+ !CRHS->user_begin()->isDivergent())
+ return SDValue();
+
// If we need to materialize a 64-bit immediate, it will be split up later
// anyway. Avoid creating the harder to understand 64-bit immediate
// materialization.
@@ -13660,6 +13724,7 @@ bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
case Intrinsic::amdgcn_frexp_mant:
case Intrinsic::amdgcn_fdot2:
case Intrinsic::amdgcn_trig_preop:
+ case Intrinsic::amdgcn_tanh:
return true;
default:
break;
@@ -14498,7 +14563,7 @@ static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
// instead of a tree.
SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
DAGCombinerInfo &DCI) const {
- assert(N->getOpcode() == ISD::ADD);
+ assert(N->isAnyAdd());
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
@@ -14531,7 +14596,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
for (SDNode *User : LHS->users()) {
// There is a use that does not feed into addition, so the multiply can't
// be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
- if (User->getOpcode() != ISD::ADD)
+ if (!User->isAnyAdd())
return SDValue();
// We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
@@ -14643,8 +14708,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
SDValue Hi = getHiHalf64(LHS, DAG);
SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
+ unsigned Opcode = N->getOpcode();
+ if (Opcode == ISD::PTRADD)
+ Opcode = ISD::ADD;
SDValue AddHi =
- DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
+ DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
@@ -15118,42 +15186,123 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
+ EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- if (N1.getOpcode() == ISD::ADD) {
- // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
- // y is not, and (add y, z) is used only once.
- // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
- // z is not, and (add y, z) is used only once.
- // The goal is to move constant offsets to the outermost ptradd, to create
- // more opportunities to fold offsets into memory instructions.
- // Together with the generic combines in DAGCombiner.cpp, this also
- // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
- //
- // This transform is here instead of in the general DAGCombiner as it can
- // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
- // AArch64's CPA.
- SDValue X = N0;
- SDValue Y = N1.getOperand(0);
- SDValue Z = N1.getOperand(1);
- if (N1.hasOneUse()) {
- bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
- bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
- if (ZIsConstant != YIsConstant) {
- // If both additions in the original were NUW, the new ones are as well.
- SDNodeFlags Flags =
- (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
- if (YIsConstant)
- std::swap(Y, Z);
+ // The following folds transform PTRADDs into regular arithmetic in cases
+ // where the PTRADD wouldn't be folded as an immediate offset into memory
+ // instructions anyway. They are target-specific in that other targets might
+ // prefer to not lose information about the pointer arithmetic.
+
+ // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
+ // Adapted from DAGCombiner::visitADDLikeCommutative.
+ SDValue V, K;
+ if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
+ SDNodeFlags ShlFlags = N1->getFlags();
+ // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
+ // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
+ // preserved.
+ SDNodeFlags NewShlFlags =
+ ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
+ ? SDNodeFlags::NoSignedWrap
+ : SDNodeFlags();
+ SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
+ DCI.AddToWorklist(Inner.getNode());
+ return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
+ }
+
+ // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
+ // performAddCombine.
+ if (N1.getOpcode() == ISD::MUL) {
+ if (Subtarget->hasMad64_32()) {
+ if (SDValue Folded = tryFoldToMad64_32(N, DCI))
+ return Folded;
+ }
+ }
+
+ // If the 32 low bits of the constant are all zero, there is nothing to fold
+ // into an immediate offset, so it's better to eliminate the unnecessary
+ // addition for the lower 32 bits than to preserve the PTRADD.
+ // Analogous to a fold in performAddCombine.
+ if (VT == MVT::i64) {
+ if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
+ return Folded;
+ }
- SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags);
+ if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
+ // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with
+ // global address GA and constant c, such that c can be folded into GA.
+ SDValue GAValue = N0.getOperand(0);
+ if (const GlobalAddressSDNode *GA =
+ dyn_cast<GlobalAddressSDNode>(GAValue)) {
+ if (DCI.isBeforeLegalizeOps() && isOffsetFoldingLegal(GA)) {
+ // If both additions in the original were NUW, reassociation preserves
+ // that.
+ SDNodeFlags Flags =
+ (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
+ SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
DCI.AddToWorklist(Inner.getNode());
- return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags);
+ return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
}
}
}
+ if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
+ return SDValue();
+
+ // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
+ // y is not, and (add y, z) is used only once.
+ // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
+ // z is not, and (add y, z) is used only once.
+ // The goal is to move constant offsets to the outermost ptradd, to create
+ // more opportunities to fold offsets into memory instructions.
+ // Together with the generic combines in DAGCombiner.cpp, this also
+ // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
+ //
+ // This transform is here instead of in the general DAGCombiner as it can
+ // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
+ // AArch64's CPA.
+ SDValue X = N0;
+ SDValue Y = N1.getOperand(0);
+ SDValue Z = N1.getOperand(1);
+ bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
+ bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
+
+ // If both additions in the original were NUW, reassociation preserves that.
+ SDNodeFlags ReassocFlags =
+ (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
+
+ if (ZIsConstant != YIsConstant) {
+ if (YIsConstant)
+ std::swap(Y, Z);
+ SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
+ DCI.AddToWorklist(Inner.getNode());
+ return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
+ }
+
+ // If one of Y and Z is constant, they have been handled above. If both were
+ // constant, the addition would have been folded in SelectionDAG::getNode
+ // already. This ensures that the generic DAG combines won't undo the
+ // following reassociation.
+ assert(!YIsConstant && !ZIsConstant);
+
+ if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
+ // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
+ // y are uniform and z isn't.
+ // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
+ // z are uniform and y isn't.
+ // The goal is to push uniform operands up in the computation, so that they
+ // can be handled with scalar operations. We can't use reassociateScalarOps
+ // for this since it requires two identical commutative operations to
+ // reassociate.
+ if (Y->isDivergent())
+ std::swap(Y, Z);
+ SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
+ DCI.AddToWorklist(UniformInner.getNode());
+ return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
+ }
+
return SDValue();
}
@@ -16847,12 +16996,63 @@ static void knownBitsForWorkitemID(const GCNSubtarget &ST,
Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
}
+static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
+ KnownBits &Known, const APInt &DemandedElts,
+ unsigned BFEWidth, bool SExt, unsigned Depth) {
+ const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo();
+ const MachineOperand &Src1 = MI.getOperand(2);
+
+ unsigned Src1Cst = 0;
+ if (Src1.isImm()) {
+ Src1Cst = Src1.getImm();
+ } else if (Src1.isReg()) {
+ auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
+ if (!Cst)
+ return;
+ Src1Cst = Cst->Value.getZExtValue();
+ } else {
+ return;
+ }
+
+ // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
+ // Width is always [22:16].
+ const unsigned Offset =
+ Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
+ const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
+
+ if (Width >= BFEWidth) // Ill-formed.
+ return;
+
+ VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
+ Depth + 1);
+
+ Known = Known.extractBits(Width, Offset);
+
+ if (SExt)
+ Known = Known.sext(BFEWidth);
+ else
+ Known = Known.zext(BFEWidth);
+}
+
void SITargetLowering::computeKnownBitsForTargetInstr(
GISelValueTracking &VT, Register R, KnownBits &Known,
const APInt &DemandedElts, const MachineRegisterInfo &MRI,
unsigned Depth) const {
+ Known.resetAll();
const MachineInstr *MI = MRI.getVRegDef(R);
switch (MI->getOpcode()) {
+ case AMDGPU::S_BFE_I32:
+ return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
+ /*SExt=*/true, Depth);
+ case AMDGPU::S_BFE_U32:
+ return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
+ /*SExt=*/false, Depth);
+ case AMDGPU::S_BFE_I64:
+ return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
+ /*SExt=*/true, Depth);
+ case AMDGPU::S_BFE_U64:
+ return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
+ /*SExt=*/false, Depth);
case AMDGPU::G_INTRINSIC:
case AMDGPU::G_INTRINSIC_CONVERGENT: {
Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 7ce1359f03da..2af0a575a888 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -260,240 +260,7 @@ InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
llvm_unreachable("event type has no associated counter");
}
-// This objects maintains the current score brackets of each wait counter, and
-// a per-register scoreboard for each wait counter.
-//
-// We also maintain the latest score for every event type that can change the
-// waitcnt in order to know if there are multiple types of events within
-// the brackets. When multiple types of event happen in the bracket,
-// wait count may get decreased out of order, therefore we need to put in
-// "s_waitcnt 0" before use.
-class WaitcntBrackets {
-public:
- WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
- HardwareLimits Limits, const unsigned *WaitEventMaskForInst,
- InstCounterType SmemAccessCounter)
- : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
- WaitEventMaskForInst(WaitEventMaskForInst),
- SmemAccessCounter(SmemAccessCounter) {}
-
- unsigned getWaitCountMax(InstCounterType T) const {
- switch (T) {
- case LOAD_CNT:
- return Limits.LoadcntMax;
- case DS_CNT:
- return Limits.DscntMax;
- case EXP_CNT:
- return Limits.ExpcntMax;
- case STORE_CNT:
- return Limits.StorecntMax;
- case SAMPLE_CNT:
- return Limits.SamplecntMax;
- case BVH_CNT:
- return Limits.BvhcntMax;
- case KM_CNT:
- return Limits.KmcntMax;
- case X_CNT:
- return Limits.XcntMax;
- default:
- break;
- }
- return 0;
- }
-
- bool isSmemCounter(InstCounterType T) const {
- return T == SmemAccessCounter || T == X_CNT;
- }
-
- unsigned getSgprScoresIdx(InstCounterType T) const {
- assert(isSmemCounter(T) && "Invalid SMEM counter");
- return T == X_CNT ? 1 : 0;
- }
-
- unsigned getScoreLB(InstCounterType T) const {
- assert(T < NUM_INST_CNTS);
- return ScoreLBs[T];
- }
-
- unsigned getScoreUB(InstCounterType T) const {
- assert(T < NUM_INST_CNTS);
- return ScoreUBs[T];
- }
-
- unsigned getScoreRange(InstCounterType T) const {
- return getScoreUB(T) - getScoreLB(T);
- }
-
- unsigned getRegScore(int GprNo, InstCounterType T) const {
- if (GprNo < NUM_ALL_VGPRS)
- return VgprScores[T][GprNo];
- return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
- }
-
- bool merge(const WaitcntBrackets &Other);
-
- RegInterval getRegInterval(const MachineInstr *MI,
- const MachineRegisterInfo *MRI,
- const SIRegisterInfo *TRI,
- const MachineOperand &Op) const;
-
- bool counterOutOfOrder(InstCounterType T) const;
- void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
- void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
-
- void determineWait(InstCounterType T, RegInterval Interval,
- AMDGPU::Waitcnt &Wait) const;
- void determineWait(InstCounterType T, int RegNo,
- AMDGPU::Waitcnt &Wait) const {
- determineWait(T, {RegNo, RegNo + 1}, Wait);
- }
-
- void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
- void applyWaitcnt(InstCounterType T, unsigned Count);
- void applyXcnt(const AMDGPU::Waitcnt &Wait);
- void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
- const MachineRegisterInfo *MRI, WaitEventType E,
- MachineInstr &MI);
-
- unsigned hasPendingEvent() const { return PendingEvents; }
- unsigned hasPendingEvent(WaitEventType E) const {
- return PendingEvents & (1 << E);
- }
- unsigned hasPendingEvent(InstCounterType T) const {
- unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
- assert((HasPending != 0) == (getScoreRange(T) != 0));
- return HasPending;
- }
-
- bool hasMixedPendingEvents(InstCounterType T) const {
- unsigned Events = hasPendingEvent(T);
- // Return true if more than one bit is set in Events.
- return Events & (Events - 1);
- }
-
- bool hasPendingFlat() const {
- return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
- LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
- (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
- LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
- }
-
- void setPendingFlat() {
- LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
- LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
- }
-
- bool hasPendingGDS() const {
- return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
- }
-
- unsigned getPendingGDSWait() const {
- return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1);
- }
-
- void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
-
- // Return true if there might be pending writes to the vgpr-interval by VMEM
- // instructions with types different from V.
- bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
- for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- assert(RegNo < NUM_ALL_VGPRS);
- if (VgprVmemTypes[RegNo] & ~(1 << V))
- return true;
- }
- return false;
- }
-
- void clearVgprVmemTypes(RegInterval Interval) {
- for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- assert(RegNo < NUM_ALL_VGPRS);
- VgprVmemTypes[RegNo] = 0;
- }
- }
-
- void setStateOnFunctionEntryOrReturn() {
- setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
- PendingEvents |= WaitEventMaskForInst[STORE_CNT];
- }
-
- ArrayRef<const MachineInstr *> getLDSDMAStores() const {
- return LDSDMAStores;
- }
-
- bool hasPointSampleAccel(const MachineInstr &MI) const;
- bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
- RegInterval Interval) const;
-
- void print(raw_ostream &) const;
- void dump() const { print(dbgs()); }
-
-private:
- struct MergeInfo {
- unsigned OldLB;
- unsigned OtherLB;
- unsigned MyShift;
- unsigned OtherShift;
- };
- static bool mergeScore(const MergeInfo &M, unsigned &Score,
- unsigned OtherScore);
-
- void setScoreLB(InstCounterType T, unsigned Val) {
- assert(T < NUM_INST_CNTS);
- ScoreLBs[T] = Val;
- }
-
- void setScoreUB(InstCounterType T, unsigned Val) {
- assert(T < NUM_INST_CNTS);
- ScoreUBs[T] = Val;
-
- if (T != EXP_CNT)
- return;
-
- if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
- ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
- }
-
- void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
- setScoreByInterval({GprNo, GprNo + 1}, T, Val);
- }
-
- void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
- unsigned Score);
-
- void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI,
- const MachineRegisterInfo *MRI,
- const MachineOperand &Op, InstCounterType CntTy,
- unsigned Val);
-
- const GCNSubtarget *ST = nullptr;
- InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
- HardwareLimits Limits = {};
- const unsigned *WaitEventMaskForInst;
- InstCounterType SmemAccessCounter;
- unsigned ScoreLBs[NUM_INST_CNTS] = {0};
- unsigned ScoreUBs[NUM_INST_CNTS] = {0};
- unsigned PendingEvents = 0;
- // Remember the last flat memory operation.
- unsigned LastFlat[NUM_INST_CNTS] = {0};
- // Remember the last GDS operation.
- unsigned LastGDS = 0;
- // wait_cnt scores for every vgpr.
- // Keep track of the VgprUB and SgprUB to make merge at join efficient.
- int VgprUB = -1;
- int SgprUB = -1;
- unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
- // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
- // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
- // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
- // X_CNT score.
- unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
- // Bitmask of the VmemTypes of VMEM instructions that might have a pending
- // write to each vgpr.
- unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
- // Store representative LDS DMA operations. The only useful info here is
- // alias info. One store is kept per unique AAInfo.
- SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores;
-};
+class WaitcntBrackets;
// This abstracts the logic for generating and updating S_WAIT* instructions
// away from the analysis that determines where they are needed. This was
@@ -640,8 +407,13 @@ public:
};
class SIInsertWaitcnts {
+public:
+ const GCNSubtarget *ST;
+ InstCounterType SmemAccessCounter;
+ InstCounterType MaxCounter;
+ const unsigned *WaitEventMaskForInst;
+
private:
- const GCNSubtarget *ST = nullptr;
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
const MachineRegisterInfo *MRI = nullptr;
@@ -657,8 +429,6 @@ private:
bool Dirty = true;
};
- InstCounterType SmemAccessCounter;
-
MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
bool ForceEmitWaitcnt[NUM_INST_CNTS];
@@ -675,7 +445,7 @@ private:
// message.
DenseSet<MachineInstr *> ReleaseVGPRInsts;
- InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
+ HardwareLimits Limits;
public:
SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
@@ -686,6 +456,30 @@ public:
(void)ForceVMCounter;
}
+ unsigned getWaitCountMax(InstCounterType T) const {
+ switch (T) {
+ case LOAD_CNT:
+ return Limits.LoadcntMax;
+ case DS_CNT:
+ return Limits.DscntMax;
+ case EXP_CNT:
+ return Limits.ExpcntMax;
+ case STORE_CNT:
+ return Limits.StorecntMax;
+ case SAMPLE_CNT:
+ return Limits.SamplecntMax;
+ case BVH_CNT:
+ return Limits.BvhcntMax;
+ case KM_CNT:
+ return Limits.KmcntMax;
+ case X_CNT:
+ return Limits.XcntMax;
+ default:
+ break;
+ }
+ return 0;
+ }
+
bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
bool isPreheaderToFlush(MachineBasicBlock &MBB,
const WaitcntBrackets &ScoreBrackets);
@@ -791,6 +585,211 @@ public:
WaitcntBrackets &ScoreBrackets);
};
+// This objects maintains the current score brackets of each wait counter, and
+// a per-register scoreboard for each wait counter.
+//
+// We also maintain the latest score for every event type that can change the
+// waitcnt in order to know if there are multiple types of events within
+// the brackets. When multiple types of event happen in the bracket,
+// wait count may get decreased out of order, therefore we need to put in
+// "s_waitcnt 0" before use.
+class WaitcntBrackets {
+public:
+ WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {}
+
+ bool isSmemCounter(InstCounterType T) const {
+ return T == Context->SmemAccessCounter || T == X_CNT;
+ }
+
+ unsigned getSgprScoresIdx(InstCounterType T) const {
+ assert(isSmemCounter(T) && "Invalid SMEM counter");
+ return T == X_CNT ? 1 : 0;
+ }
+
+ unsigned getScoreLB(InstCounterType T) const {
+ assert(T < NUM_INST_CNTS);
+ return ScoreLBs[T];
+ }
+
+ unsigned getScoreUB(InstCounterType T) const {
+ assert(T < NUM_INST_CNTS);
+ return ScoreUBs[T];
+ }
+
+ unsigned getScoreRange(InstCounterType T) const {
+ return getScoreUB(T) - getScoreLB(T);
+ }
+
+ unsigned getRegScore(int GprNo, InstCounterType T) const {
+ if (GprNo < NUM_ALL_VGPRS)
+ return VgprScores[T][GprNo];
+ return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
+ }
+
+ bool merge(const WaitcntBrackets &Other);
+
+ RegInterval getRegInterval(const MachineInstr *MI,
+ const MachineRegisterInfo *MRI,
+ const SIRegisterInfo *TRI,
+ const MachineOperand &Op) const;
+
+ bool counterOutOfOrder(InstCounterType T) const;
+ void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
+ void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
+
+ void determineWait(InstCounterType T, RegInterval Interval,
+ AMDGPU::Waitcnt &Wait) const;
+ void determineWait(InstCounterType T, int RegNo,
+ AMDGPU::Waitcnt &Wait) const {
+ determineWait(T, {RegNo, RegNo + 1}, Wait);
+ }
+
+ void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
+ void applyWaitcnt(InstCounterType T, unsigned Count);
+ void applyXcnt(const AMDGPU::Waitcnt &Wait);
+ void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
+ const MachineRegisterInfo *MRI, WaitEventType E,
+ MachineInstr &MI);
+
+ unsigned hasPendingEvent() const { return PendingEvents; }
+ unsigned hasPendingEvent(WaitEventType E) const {
+ return PendingEvents & (1 << E);
+ }
+ unsigned hasPendingEvent(InstCounterType T) const {
+ unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T];
+ assert((HasPending != 0) == (getScoreRange(T) != 0));
+ return HasPending;
+ }
+
+ bool hasMixedPendingEvents(InstCounterType T) const {
+ unsigned Events = hasPendingEvent(T);
+ // Return true if more than one bit is set in Events.
+ return Events & (Events - 1);
+ }
+
+ bool hasPendingFlat() const {
+ return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
+ LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
+ (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
+ LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
+ }
+
+ void setPendingFlat() {
+ LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
+ LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
+ }
+
+ bool hasPendingGDS() const {
+ return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
+ }
+
+ unsigned getPendingGDSWait() const {
+ return std::min(getScoreUB(DS_CNT) - LastGDS,
+ Context->getWaitCountMax(DS_CNT) - 1);
+ }
+
+ void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
+
+ // Return true if there might be pending writes to the vgpr-interval by VMEM
+ // instructions with types different from V.
+ bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ assert(RegNo < NUM_ALL_VGPRS);
+ if (VgprVmemTypes[RegNo] & ~(1 << V))
+ return true;
+ }
+ return false;
+ }
+
+ void clearVgprVmemTypes(RegInterval Interval) {
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ assert(RegNo < NUM_ALL_VGPRS);
+ VgprVmemTypes[RegNo] = 0;
+ }
+ }
+
+ void setStateOnFunctionEntryOrReturn() {
+ setScoreUB(STORE_CNT,
+ getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT));
+ PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];
+ }
+
+ ArrayRef<const MachineInstr *> getLDSDMAStores() const {
+ return LDSDMAStores;
+ }
+
+ bool hasPointSampleAccel(const MachineInstr &MI) const;
+ bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
+ RegInterval Interval) const;
+
+ void print(raw_ostream &) const;
+ void dump() const { print(dbgs()); }
+
+private:
+ struct MergeInfo {
+ unsigned OldLB;
+ unsigned OtherLB;
+ unsigned MyShift;
+ unsigned OtherShift;
+ };
+ static bool mergeScore(const MergeInfo &M, unsigned &Score,
+ unsigned OtherScore);
+
+ void setScoreLB(InstCounterType T, unsigned Val) {
+ assert(T < NUM_INST_CNTS);
+ ScoreLBs[T] = Val;
+ }
+
+ void setScoreUB(InstCounterType T, unsigned Val) {
+ assert(T < NUM_INST_CNTS);
+ ScoreUBs[T] = Val;
+
+ if (T != EXP_CNT)
+ return;
+
+ if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT))
+ ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT);
+ }
+
+ void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
+ setScoreByInterval({GprNo, GprNo + 1}, T, Val);
+ }
+
+ void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
+ unsigned Score);
+
+ void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI,
+ const MachineRegisterInfo *MRI,
+ const MachineOperand &Op, InstCounterType CntTy,
+ unsigned Val);
+
+ const SIInsertWaitcnts *Context;
+
+ unsigned ScoreLBs[NUM_INST_CNTS] = {0};
+ unsigned ScoreUBs[NUM_INST_CNTS] = {0};
+ unsigned PendingEvents = 0;
+ // Remember the last flat memory operation.
+ unsigned LastFlat[NUM_INST_CNTS] = {0};
+ // Remember the last GDS operation.
+ unsigned LastGDS = 0;
+ // wait_cnt scores for every vgpr.
+ // Keep track of the VgprUB and SgprUB to make merge at join efficient.
+ int VgprUB = -1;
+ int SgprUB = -1;
+ unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
+ // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
+ // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
+ // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
+ // X_CNT score.
+ unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
+ // Bitmask of the VmemTypes of VMEM instructions that might have a pending
+ // write to each vgpr.
+ unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
+ // Store representative LDS DMA operations. The only useful info here is
+ // alias info. One store is kept per unique AAInfo.
+ SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores;
+};
+
class SIInsertWaitcntsLegacy : public MachineFunctionPass {
public:
static char ID;
@@ -827,7 +826,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
RegInterval Result;
- MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *ST);
+ MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST);
unsigned RegIdx = TRI->getHWRegIndex(MCReg);
assert(isUInt<8>(RegIdx));
@@ -885,7 +884,7 @@ void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
// this at compile time, so we have to assume it might be applied if the
// instruction supports it).
bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
- if (!ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
+ if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
return false;
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
@@ -911,7 +910,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI,
WaitEventType E, MachineInstr &Inst) {
- InstCounterType T = eventCounter(WaitEventMaskForInst, E);
+ InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);
unsigned UB = getScoreUB(T);
unsigned CurrScore = UB + 1;
@@ -1080,8 +1079,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
}
void WaitcntBrackets::print(raw_ostream &OS) const {
+ const GCNSubtarget *ST = Context->ST;
+
OS << '\n';
- for (auto T : inst_counter_types(MaxCounter)) {
+ for (auto T : inst_counter_types(Context->MaxCounter)) {
unsigned SR = getScoreRange(T);
switch (T) {
@@ -1195,7 +1196,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
// s_waitcnt instruction.
if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
- !ST->hasFlatLgkmVMemCountInOrder()) {
+ !Context->ST->hasFlatLgkmVMemCountInOrder()) {
// If there is a pending FLAT operation, and this is a VMem or LGKM
// waitcnt and the target can report early completion, then we need
// to force a waitcnt 0.
@@ -1209,7 +1210,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
// If a counter has been maxed out avoid overflow by waiting for
// MAX(CounterType) - 1 instead.
unsigned NeededWait =
- std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
+ std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1);
addWait(Wait, T, NeededWait);
}
}
@@ -1237,7 +1238,7 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
setScoreLB(T, std::max(getScoreLB(T), UB - Count));
} else {
setScoreLB(T, UB);
- PendingEvents &= ~WaitEventMaskForInst[T];
+ PendingEvents &= ~Context->WaitEventMaskForInst[T];
}
}
@@ -1262,7 +1263,7 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
// the decrement may go out of order.
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
// Scalar memory read always can go out of order.
- if ((T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
+ if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
(T == X_CNT && hasPendingEvent(SMEM_GROUP)))
return true;
return hasMixedPendingEvents(T);
@@ -2386,8 +2387,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
VgprUB = std::max(VgprUB, Other.VgprUB);
SgprUB = std::max(SgprUB, Other.SgprUB);
- for (auto T : inst_counter_types(MaxCounter)) {
+ for (auto T : inst_counter_types(Context->MaxCounter)) {
// Merge event flags for this counter
+ const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst;
const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
if (OtherEvents & ~OldEvents)
@@ -2746,11 +2748,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
for (auto T : inst_counter_types())
ForceEmitWaitcnt[T] = false;
- const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
+ WaitEventMaskForInst = WCG->getWaitEventMask();
SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
- HardwareLimits Limits = {};
if (ST->hasExtendedWaitCounts()) {
Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
@@ -2807,8 +2808,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
}
- auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
- ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
+ auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
NonKernelInitialState->setStateOnFunctionEntryOrReturn();
BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
@@ -2839,15 +2839,13 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
*Brackets = *BI.Incoming;
} else {
if (!Brackets) {
- Brackets = std::make_unique<WaitcntBrackets>(
- ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
+ Brackets = std::make_unique<WaitcntBrackets>(this);
} else {
// Reinitialize in-place. N.B. do not do this by assigning from a
// temporary because the WaitcntBrackets class is large and it could
// cause this function to use an unreasonable amount of stack space.
Brackets->~WaitcntBrackets();
- new (Brackets.get()) WaitcntBrackets(
- ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
+ new (Brackets.get()) WaitcntBrackets(this);
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index ca3af3b48a60..c8935f0cb603 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -687,7 +687,8 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
if (!SafeToPropagate)
break;
- DefOp.setIsKill(false);
+ for (auto I = Def; I != MI; ++I)
+ I->clearRegisterKills(DefOp.getReg(), &RI);
}
MachineInstrBuilder Builder =
@@ -1625,41 +1626,6 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
}
}
-static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
- switch (Size) {
- case 4:
- return AMDGPU::SI_SPILL_A32_SAVE;
- case 8:
- return AMDGPU::SI_SPILL_A64_SAVE;
- case 12:
- return AMDGPU::SI_SPILL_A96_SAVE;
- case 16:
- return AMDGPU::SI_SPILL_A128_SAVE;
- case 20:
- return AMDGPU::SI_SPILL_A160_SAVE;
- case 24:
- return AMDGPU::SI_SPILL_A192_SAVE;
- case 28:
- return AMDGPU::SI_SPILL_A224_SAVE;
- case 32:
- return AMDGPU::SI_SPILL_A256_SAVE;
- case 36:
- return AMDGPU::SI_SPILL_A288_SAVE;
- case 40:
- return AMDGPU::SI_SPILL_A320_SAVE;
- case 44:
- return AMDGPU::SI_SPILL_A352_SAVE;
- case 48:
- return AMDGPU::SI_SPILL_A384_SAVE;
- case 64:
- return AMDGPU::SI_SPILL_A512_SAVE;
- case 128:
- return AMDGPU::SI_SPILL_A1024_SAVE;
- default:
- llvm_unreachable("unknown register size");
- }
-}
-
static unsigned getAVSpillSaveOpcode(unsigned Size) {
switch (Size) {
case 4:
@@ -1707,22 +1673,20 @@ static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
return AMDGPU::SI_SPILL_WWM_V32_SAVE;
}
-static unsigned getVectorRegSpillSaveOpcode(Register Reg,
- const TargetRegisterClass *RC,
- unsigned Size,
- const SIRegisterInfo &TRI,
- const SIMachineFunctionInfo &MFI) {
- bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
+unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
+ Register Reg, const TargetRegisterClass *RC, unsigned Size,
+ const SIMachineFunctionInfo &MFI) const {
+ bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
// Choose the right opcode if spilling a WWM register.
if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
- if (IsVectorSuperClass)
+ // TODO: Check if AGPRs are available
+ if (ST.hasMAIInsts())
return getAVSpillSaveOpcode(Size);
- return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
- : getVGPRSpillSaveOpcode(Size);
+ return getVGPRSpillSaveOpcode(Size);
}
void SIInstrInfo::storeRegToStackSlot(
@@ -1770,8 +1734,8 @@ void SIInstrInfo::storeRegToStackSlot(
return;
}
- unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
- SpillSize, RI, *MFI);
+ unsigned Opcode =
+ getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
MFI->setHasSpilledVGPRs();
BuildMI(MBB, MI, DL, get(Opcode))
@@ -1854,41 +1818,6 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
}
}
-static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
- switch (Size) {
- case 4:
- return AMDGPU::SI_SPILL_A32_RESTORE;
- case 8:
- return AMDGPU::SI_SPILL_A64_RESTORE;
- case 12:
- return AMDGPU::SI_SPILL_A96_RESTORE;
- case 16:
- return AMDGPU::SI_SPILL_A128_RESTORE;
- case 20:
- return AMDGPU::SI_SPILL_A160_RESTORE;
- case 24:
- return AMDGPU::SI_SPILL_A192_RESTORE;
- case 28:
- return AMDGPU::SI_SPILL_A224_RESTORE;
- case 32:
- return AMDGPU::SI_SPILL_A256_RESTORE;
- case 36:
- return AMDGPU::SI_SPILL_A288_RESTORE;
- case 40:
- return AMDGPU::SI_SPILL_A320_RESTORE;
- case 44:
- return AMDGPU::SI_SPILL_A352_RESTORE;
- case 48:
- return AMDGPU::SI_SPILL_A384_RESTORE;
- case 64:
- return AMDGPU::SI_SPILL_A512_RESTORE;
- case 128:
- return AMDGPU::SI_SPILL_A1024_RESTORE;
- default:
- llvm_unreachable("unknown register size");
- }
-}
-
static unsigned getAVSpillRestoreOpcode(unsigned Size) {
switch (Size) {
case 4:
@@ -1930,27 +1859,27 @@ static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
if (Size != 4)
llvm_unreachable("unknown wwm register spill size");
- if (IsVectorSuperClass)
+ if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
}
-static unsigned
-getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
- unsigned Size, const SIRegisterInfo &TRI,
- const SIMachineFunctionInfo &MFI) {
- bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
+unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
+ Register Reg, const TargetRegisterClass *RC, unsigned Size,
+ const SIMachineFunctionInfo &MFI) const {
+ bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
// Choose the right opcode if restoring a WWM register.
if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
- if (IsVectorSuperClass)
+ // TODO: Check if AGPRs are available
+ if (ST.hasMAIInsts())
return getAVSpillRestoreOpcode(Size);
- return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
- : getVGPRSpillRestoreOpcode(Size);
+ assert(!RI.isAGPRClass(RC));
+ return getVGPRSpillRestoreOpcode(Size);
}
void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
@@ -1998,7 +1927,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
}
unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
- SpillSize, RI, *MFI);
+ SpillSize, *MFI);
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
.addFrameIndex(FrameIndex) // vaddr
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
@@ -2214,7 +2143,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
if (ST.hasMovB64()) {
MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
- isUInt<32>(SrcOp.getImm()))
+ isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
break;
}
if (SrcOp.isImm()) {
@@ -2273,6 +2202,12 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
const MachineOperand &SrcOp = MI.getOperand(1);
assert(!SrcOp.isFPImm());
+
+ if (ST.has64BitLiterals()) {
+ MI.setDesc(get(AMDGPU::S_MOV_B64));
+ break;
+ }
+
APInt Imm(64, SrcOp.getImm());
if (Imm.isIntN(32) || isInlineConstant(Imm)) {
MI.setDesc(get(AMDGPU::S_MOV_B64));
@@ -2492,6 +2427,25 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
+ case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
+ MachineFunction &MF = *MBB.getParent();
+ Register Reg = MI.getOperand(0).getReg();
+ MachineOperand Op = MI.getOperand(1);
+
+ // Create a bundle so these instructions won't be re-ordered by the
+ // post-RA scheduler.
+ MIBundleBuilder Bundler(MBB, MI);
+ Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
+ if (Op.isGlobal())
+ Op.setOffset(Op.getOffset() + 4);
+ Bundler.append(
+ BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
+
+ finalizeBundle(MBB, Bundler.begin());
+
+ MI.eraseFromParent();
+ break;
+ }
case AMDGPU::ENTER_STRICT_WWM: {
// This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
// Whole Wave Mode is entered.
@@ -2807,12 +2761,14 @@ bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
if ((int)OpIdx1 != Src0Idx && MO0->isReg()) {
if (!DefinedRC1)
return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
- return isLegalRegOperand(MI, OpIdx1, *MO0);
+ return isLegalRegOperand(MI, OpIdx1, *MO0) &&
+ (!MO1->isReg() || isLegalRegOperand(MI, OpIdx0, *MO1));
}
if ((int)OpIdx0 != Src0Idx && MO1->isReg()) {
if (!DefinedRC0)
return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
- return isLegalRegOperand(MI, OpIdx0, *MO1);
+ return (!MO0->isReg() || isLegalRegOperand(MI, OpIdx1, *MO0)) &&
+ isLegalRegOperand(MI, OpIdx0, *MO1);
}
// No need to check 64-bit literals since swapping does not bring new
@@ -2903,9 +2859,9 @@ bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
int64_t BrOffset) const {
- // BranchRelaxation should never have to check s_setpc_b64 because its dest
- // block is unanalyzable.
- assert(BranchOp != AMDGPU::S_SETPC_B64);
+ // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
+ // because their dest blocks are unanalyzable.
+ assert(isSOPP(BranchOp) || isSOPK(BranchOp));
// Convert to dwords.
BrOffset /= 4;
@@ -2946,13 +2902,30 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ auto I = MBB.end();
+ auto &MCCtx = MF->getContext();
+
+ if (ST.hasAddPC64Inst()) {
+ MCSymbol *Offset =
+ MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
+ auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
+ .addSym(Offset, MO_FAR_BRANCH_OFFSET);
+ MCSymbol *PostAddPCLabel =
+ MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
+ AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
+ auto *OffsetExpr = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
+ MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
+ Offset->setVariableValue(OffsetExpr);
+ return;
+ }
+
+ assert(RS && "RegScavenger required for long branching");
// FIXME: Virtual register workaround for RegScavenger not working with empty
// blocks.
Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- auto I = MBB.end();
-
// Note: as this is used after hazard recognizer we need to apply some hazard
// workarounds directly.
const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
@@ -2968,7 +2941,6 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
ApplyHazardWorkarounds();
- auto &MCCtx = MF->getContext();
MCSymbol *PostGetPCLabel =
MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
@@ -3507,6 +3479,10 @@ static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
? AMDGPU::V_FMAAK_F16_t16
: AMDGPU::V_FMAAK_F16_fake16
: AMDGPU::V_FMAAK_F16;
+ case AMDGPU::V_FMAC_F64_e32:
+ case AMDGPU::V_FMAC_F64_e64:
+ case AMDGPU::V_FMA_F64_e64:
+ return AMDGPU::V_FMAAK_F64;
default:
llvm_unreachable("invalid instruction");
}
@@ -3535,6 +3511,10 @@ static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
? AMDGPU::V_FMAMK_F16_t16
: AMDGPU::V_FMAMK_F16_fake16
: AMDGPU::V_FMAMK_F16;
+ case AMDGPU::V_FMAC_F64_e32:
+ case AMDGPU::V_FMAC_F64_e64:
+ case AMDGPU::V_FMA_F64_e64:
+ return AMDGPU::V_FMAMK_F64;
default:
llvm_unreachable("invalid instruction");
}
@@ -3613,7 +3593,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
- Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
+ Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
+ Opc == AMDGPU::V_FMAC_F64_e64) {
// Don't fold if we are using source or output modifiers. The new VOP2
// instructions don't have them.
if (hasAnyModifiersSet(UseMI))
@@ -3685,7 +3666,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
- Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
+ Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3753,7 +3735,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
- Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
+ Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -4074,8 +4057,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
- if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
- !IsLegacy &&
+ if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
+ (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
// If we have an SGPR input, we will violate the constant bus restriction.
(ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
!RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
@@ -6099,14 +6082,18 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
if (Is64BitOp &&
!AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
- if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
+ if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
+ (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
return false;
// FIXME: We can use sign extended 64-bit literals, but only for signed
// operands. At the moment we do not know if an operand is signed.
// Such operand will be encoded as its low 32 bits and then either
// correctly sign extended or incorrectly zero extended by HW.
- if (!Is64BitFPOp && (int32_t)Imm < 0)
+ // If 64-bit literals are supported and the literal will be encoded
+ // as a full 64-bit value, we can still use it.
+ if (!Is64BitFPOp && (int32_t)Imm < 0 &&
+ (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
return false;
}
}
@@ -6402,7 +6389,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
if (OldSAddrIdx < 0)
return false;
- assert(isSegmentSpecificFLAT(Inst));
+ assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
if (NewOpc < 0)
@@ -6426,7 +6413,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
if (OldVAddrIdx >= 0) {
MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
- if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
+ if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
!VAddrDef->getOperand(1).isImm() ||
VAddrDef->getOperand(1).getImm() != 0)
return false;
@@ -6479,7 +6466,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
// FIXME: Remove this when SelectionDAG is obsoleted.
void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
MachineInstr &MI) const {
- if (!isSegmentSpecificFLAT(MI))
+ if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
return;
// Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
@@ -9178,15 +9165,30 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
if (isDPP(MI))
return DescSize;
bool HasLiteral = false;
+ unsigned LiteralSize = 4;
for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
const MachineOperand &Op = MI.getOperand(I);
const MCOperandInfo &OpInfo = Desc.operands()[I];
if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
HasLiteral = true;
+ if (ST.has64BitLiterals()) {
+ switch (OpInfo.OperandType) {
+ default:
+ break;
+ case AMDGPU::OPERAND_REG_IMM_FP64:
+ if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
+ LiteralSize = 8;
+ break;
+ case AMDGPU::OPERAND_REG_IMM_INT64:
+ if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
+ LiteralSize = 8;
+ break;
+ }
+ }
break;
}
}
- return HasLiteral ? DescSize + 4 : DescSize;
+ return HasLiteral ? DescSize + LiteralSize : DescSize;
}
// Check whether we have extra NSA words.
@@ -9277,13 +9279,16 @@ SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
ArrayRef<std::pair<unsigned, const char *>>
SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
static const std::pair<unsigned, const char *> TargetFlags[] = {
- { MO_GOTPCREL, "amdgpu-gotprel" },
- { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
- { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
- { MO_REL32_LO, "amdgpu-rel32-lo" },
- { MO_REL32_HI, "amdgpu-rel32-hi" },
- { MO_ABS32_LO, "amdgpu-abs32-lo" },
- { MO_ABS32_HI, "amdgpu-abs32-hi" },
+ {MO_GOTPCREL, "amdgpu-gotprel"},
+ {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
+ {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
+ {MO_GOTPCREL64, "amdgpu-gotprel64"},
+ {MO_REL32_LO, "amdgpu-rel32-lo"},
+ {MO_REL32_HI, "amdgpu-rel32-hi"},
+ {MO_REL64, "amdgpu-rel64"},
+ {MO_ABS32_LO, "amdgpu-abs32-lo"},
+ {MO_ABS32_HI, "amdgpu-abs32-hi"},
+ {MO_ABS64, "amdgpu-abs64"},
};
return ArrayRef(TargetFlags);
@@ -10390,10 +10395,23 @@ bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
return TargetInstrInfo::isGlobalMemoryObject(MI);
}
+bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
+ if (!isWMMA(MI) && !isSWMMAC(MI))
+ return false;
+
+ if (AMDGPU::isGFX1250(ST))
+ return AMDGPU::getWMMAIsXDL(MI.getOpcode());
+
+ return true;
+}
+
bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
- if (!SIInstrInfo::isMAI(MI) || isDGEMM(Opcode) ||
+ if (AMDGPU::isGFX12Plus(ST))
+ return isDOT(MI) || isXDLWMMA(MI);
+
+ if (!isMAI(MI) || isDGEMM(Opcode) ||
Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 9e84822bfc27..5e92921f3ea2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -33,6 +33,7 @@ class LiveVariables;
class MachineDominatorTree;
class MachineRegisterInfo;
class RegScavenger;
+class SIMachineFunctionInfo;
class TargetRegisterClass;
class ScheduleHazardRecognizer;
@@ -214,16 +215,20 @@ public:
MO_GOTPCREL32_LO = 2,
// MO_GOTPCREL32_HI -> symbol@gotpcrel32@hi -> R_AMDGPU_GOTPCREL32_HI.
MO_GOTPCREL32_HI = 3,
+ // MO_GOTPCREL64 -> symbol@GOTPCREL -> R_AMDGPU_GOTPCREL.
+ MO_GOTPCREL64 = 4,
// MO_REL32_LO -> symbol@rel32@lo -> R_AMDGPU_REL32_LO.
- MO_REL32 = 4,
- MO_REL32_LO = 4,
+ MO_REL32 = 5,
+ MO_REL32_LO = 5,
// MO_REL32_HI -> symbol@rel32@hi -> R_AMDGPU_REL32_HI.
- MO_REL32_HI = 5,
+ MO_REL32_HI = 6,
+ MO_REL64 = 7,
- MO_FAR_BRANCH_OFFSET = 6,
+ MO_FAR_BRANCH_OFFSET = 8,
- MO_ABS32_LO = 8,
- MO_ABS32_HI = 9,
+ MO_ABS32_LO = 9,
+ MO_ABS32_HI = 10,
+ MO_ABS64 = 11,
};
explicit SIInstrInfo(const GCNSubtarget &ST);
@@ -283,6 +288,15 @@ public:
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg,
int64_t &ImmVal) const override;
+ unsigned getVectorRegSpillSaveOpcode(Register Reg,
+ const TargetRegisterClass *RC,
+ unsigned Size,
+ const SIMachineFunctionInfo &MFI) const;
+ unsigned
+ getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
+ unsigned Size,
+ const SIMachineFunctionInfo &MFI) const;
+
void storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
bool isKill, int FrameIndex, const TargetRegisterClass *RC,
@@ -863,6 +877,8 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::IsDOT;
}
+ bool isXDLWMMA(const MachineInstr &MI) const;
+
bool isXDL(const MachineInstr &MI) const;
static bool isDGEMM(unsigned Opcode) { return AMDGPU::getMAIIsDGEMM(Opcode); }
@@ -1097,7 +1113,6 @@ public:
// that will not require an additional 4-bytes; this function assumes that it
// will.
bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const {
- assert(!MO.isReg() && "isInlineConstant called on register operand!");
if (!MO.isImm())
return false;
return isInlineConstant(MO.getImm(), OperandType);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 5e41f875d980..9e1951e2946c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -268,6 +268,10 @@ def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>
>;
+def SIpc_add_rel_offset64 : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET64",
+ SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]>
+>;
+
def SIlds : SDNode<"AMDGPUISD::LDS",
SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]>
>;
@@ -1247,6 +1251,7 @@ def op_sel_hi0 : ArrayOperand0<"op_sel_hi", "OpSelHi">;
def neg_lo0 : ArrayOperand0<"neg_lo", "NegLo">;
def neg_hi0 : ArrayOperand0<"neg_hi", "NegHi">;
+def IndexKey32bit : CustomOperand<i32, 1>;
def IndexKey16bit : CustomOperand<i32, 1>;
def IndexKey8bit : CustomOperand<i32, 1>;
@@ -1302,6 +1307,9 @@ let PrintMethod = "printBitOp3" in
def BitOp3 : NamedIntOperand<"bitop3">;
def bitop3_0 : DefaultOperand<BitOp3, 0>;
+def MatrixAReuse : NamedBitOperand<"matrix_a_reuse">;
+def MatrixBReuse : NamedBitOperand<"matrix_b_reuse">;
+
class KImmFPOperand<ValueType vt> : ImmOperand<vt> {
let OperandNamespace = "AMDGPU";
let OperandType = "OPERAND_KIMM"#vt.Size;
@@ -1633,6 +1641,8 @@ def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">;
def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">;
def VOP3PModsNeg : ComplexPattern<untyped, 1, "SelectVOP3PModsNeg">;
+def VOP3PModsNegs : ComplexPattern<untyped, 1, "SelectVOP3PModsNegs">; // chfang: could this avoid using a ComplexPattern?
+def VOP3PModsNegAbs : ComplexPattern<untyped, 1, "SelectVOP3PModsNegAbs">;
def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">;
def WMMAModsF32NegAbs : ComplexPattern<untyped, 2, "SelectWMMAModsF32NegAbs">;
@@ -1641,6 +1651,7 @@ def WMMAModsF16NegAbs : ComplexPattern<untyped, 2, "SelectWMMAModsF16NegAbs">;
def WMMAVISrc : ComplexPattern<untyped, 1, "SelectWMMAVISrc">;
def SWMMACIndex8 : ComplexPattern<untyped, 2, "SelectSWMMACIndex8">;
def SWMMACIndex16 : ComplexPattern<untyped, 2, "SelectSWMMACIndex16">;
+def SWMMACIndex32 : ComplexPattern<untyped, 2, "SelectSWMMACIndex32">;
def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">;
@@ -2654,6 +2665,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
isModifierType<Src2VT>.ret,
HasOMod);
field bit HasNeg = HasModifiers;
+ field bit HasMatrixReuse = 0;
field bit HasSrc0Mods = HasModifiers;
field bit HasSrc1Mods = !if(HasModifiers, !or(HasSrc1FloatMods, HasSrc1IntMods), 0);
@@ -2837,6 +2849,8 @@ def VOP_F16_F16 : VOPProfile<[f16, f16, untyped, untyped]>;
def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>;
def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>;
def VOP_I16_I16 : VOPProfile <[i16, i16, untyped, untyped]>;
+def VOP_BF16_BF16 : VOPProfile<[bf16, bf16, untyped, untyped]>;
+def VOP1_I16_I32 : VOPProfile<[i16, i32, untyped, untyped]>;
def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 4419ce00b473..991d9f83e92e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1144,6 +1144,14 @@ def : GCNPat <
(SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0))
>;
+def SI_PC_ADD_REL_OFFSET64 : SPseudoInstSI <
+ (outs SReg_64:$dst),
+ (ins si_ga:$ptr),
+ [(set SReg_64:$dst,
+ (i64 (SIpc_add_rel_offset64 tglobaladdr:$ptr)))]> {
+ let SubtargetPredicate = Has64BitLiterals;
+}
+
def : GCNPat<
(AMDGPUtrap timm:$trapid),
(S_TRAP $trapid)
@@ -2465,7 +2473,6 @@ def : AMDGPUPat <
>;
let True16Predicate = NotHasTrue16BitInsts in {
-let SubtargetPredicate = isNotGFX9Plus in {
def : ROTRPattern <V_ALIGNBIT_B32_e64>;
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
@@ -2475,35 +2482,6 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
-} // isNotGFX9Plus
-
-let SubtargetPredicate = isGFX9GFX10 in {
-def : GCNPat <
- (rotr i32:$src0, i32:$src1),
- (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0,
- /* src1_modifiers */ 0, $src0,
- /* src2_modifiers */ 0,
- $src1, /* clamp */ 0, /* op_sel */ 0)
->;
-
-foreach pat = [(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
- (i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in
-def : GCNPat<pat,
- (V_ALIGNBIT_B32_opsel_e64 0, /* src0_modifiers */
- (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
- 0, /* src1_modifiers */
- (i32 (EXTRACT_SUBREG (i64 $src0), sub0)),
- 0, /* src2_modifiers */
- $src1, /* clamp */ 0, /* op_sel */ 0)
->;
-
-def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
- (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0,
- /* src1_modifiers */ 0, $src1,
- /* src2_modifiers */ 0,
- $src2, /* clamp */ 0, /* op_sel */ 0)
->;
-} // isGFX9GFX10
} // end True16Predicate = NotHasTrue16BitInsts
let True16Predicate = UseRealTrue16Insts in {
@@ -3104,8 +3082,6 @@ def : GCNPat <
(i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
>;
-// This pattern for bswap is used for pre-GFX8. For GFX8+, bswap is mapped
-// to V_PERM_B32.
let True16Predicate = NotHasTrue16BitInsts in
def : GCNPat <
(i32 (bswap i32:$a)),
@@ -3451,30 +3427,32 @@ def : GCNPat <
(S_LSHL_B32 SReg_32:$src1, (i16 16))
>;
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat <
(v2i16 (DivergentBinFrag<build_vector> (i16 0), (i16 VGPR_32:$src1))),
(v2i16 (V_LSHLREV_B32_e64 (i16 16), VGPR_32:$src1))
>;
-
def : GCNPat <
- (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))),
- (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
+ (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))),
+ (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
>;
def : GCNPat <
- (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))),
- (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
+ (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))),
+ (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
>;
+}
def : GCNPat <
- (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
+ (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))),
(S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
>;
def : GCNPat <
- (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))),
- (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
+ (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
+ (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
>;
foreach vecTy = [v2i16, v2f16, v2bf16] in {
@@ -3581,20 +3559,15 @@ def : GCNPat <
// Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
// Special case, can use V_ALIGNBIT (always uses encoded literal)
-let True16Predicate = NotHasTrue16BitInsts in {
-defvar BuildVectorToAlignBitPat =
+let True16Predicate = NotHasTrue16BitInsts in
+def : GCNPat <
(vecTy (DivergentBinFrag<build_vector>
(Ty !if(!eq(Ty, i16),
(Ty (trunc (srl VGPR_32:$a, (i32 16)))),
(Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))),
- (Ty VGPR_32:$b)));
-
-let SubtargetPredicate = isNotGFX9Plus in
-def : GCNPat<BuildVectorToAlignBitPat, (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16))>;
-
-let SubtargetPredicate = isGFX9GFX10 in
-def : GCNPat<BuildVectorToAlignBitPat, (V_ALIGNBIT_B32_opsel_e64 0, VGPR_32:$b, 0, VGPR_32:$a, 0, (i32 16), 0, 0)>;
-} //True16Predicate = NotHasTrue16BitInsts
+ (Ty VGPR_32:$b))),
+ (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16))
+>;
let True16Predicate = UseFakeTrue16Insts in
def : GCNPat <
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index b0d6fd95cd27..5097ac03954d 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -2225,8 +2225,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
MachineBasicBlock::iterator E = MBB->end();
MachineBasicBlock::iterator MBBI = MI.getIterator();
++MBBI;
- const SITargetLowering *TLI =
- static_cast<const SITargetLowering *>(STM->getTargetLowering());
+ const SITargetLowering *TLI = STM->getTargetLowering();
for ( ; MBBI != E; ++MBBI) {
MachineInstr &MINext = *MBBI;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 67ad28661da4..75ce67c00228 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -42,7 +42,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) {
- const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
+ const GCNSubtarget &ST = *STI;
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
WavesPerEU = ST.getWavesPerEU(F);
MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 9173041a7bcc..fa2b8db6ba55 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -4052,11 +4052,11 @@ SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
return 0;
}
-unsigned
-SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
- const TargetRegisterClass &RC) const {
+unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
+ const TargetRegisterClass &RC,
+ bool IncludeCalls) const {
for (MCPhysReg Reg : reverse(RC.getRegisters()))
- if (MRI.isPhysRegUsed(Reg))
+ if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
return getHWRegIndex(Reg) + 1;
return 0;
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 06a7a17b0246..0008e5f8cf3b 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -486,9 +486,11 @@ public:
unsigned SubReg) const;
// \returns a number of registers of a given \p RC used in a function.
- // Does not go inside function calls.
+ // Does not go inside function calls. If \p IncludeCalls is true, it will
+ // include registers that may be clobbered by calls.
unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
- const TargetRegisterClass &RC) const;
+ const TargetRegisterClass &RC,
+ bool IncludeCalls = true) const;
std::optional<uint8_t> getVRegFlagValue(StringRef Name) const override {
return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index d24c301fc1e5..c194e5c255d4 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1294,6 +1294,7 @@ def VISrc_256_f32 : SrcRegOrImm9 <VReg_256, "OPERAND_REG_INLINE_C_FP32">;
def VISrc_256_f64 : SrcRegOrImm9 <VReg_256, "OPERAND_REG_INLINE_C_FP64">;
def VISrc_512_b32 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_INT32">;
def VISrc_512_f32 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_FP32">;
+def VISrc_512_f64 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_FP64">;
def VISrc_1024_b32 : SrcRegOrImm9 <VReg_1024, "OPERAND_REG_INLINE_C_INT32">;
def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024, "OPERAND_REG_INLINE_C_FP32">;
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index 1679cee32006..ef8faffa5f55 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -66,6 +66,13 @@ def Write4PassDGEMM : SchedWrite;
def Write8PassDGEMM : SchedWrite;
def Write16PassDGEMM : SchedWrite;
+// WMMA/SWMMA instructions
+def WriteXDL2PassWMMA : SchedWrite;
+def WriteXDL4PassWMMA : SchedWrite;
+def Write4PassWMMA : SchedWrite;
+def Write8PassWMMA : SchedWrite;
+def Write16PassWMMA : SchedWrite;
+
// Scalar float instructions
def WriteSFPU : SchedWrite;
@@ -459,6 +466,15 @@ def : InstRW<[WriteCopy], (instrs COPY)>;
multiclass GFX125xCommonWriteRes {
+let ReleaseAtCycles = [8] in
+def : HWWriteRes<WriteXDL2PassWMMA, [HWXDL], 8>;
+let ReleaseAtCycles = [16] in
+def : HWWriteRes<WriteXDL4PassWMMA, [HWXDL], 16>;
+
+def : HWWriteRes<Write4PassWMMA, [HWVALU], 16>;
+def : HWWriteRes<Write8PassWMMA, [HWVALU], 32>;
+def : HWWriteRes<Write16PassWMMA, [HWVALU], 64>;
+
def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>;
def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>;
def : HWWriteRes<WriteTrans32, [HWTransVALU, HWRC], 7>;
@@ -476,6 +492,11 @@ def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
def : InstRW<[WriteCopy], (instrs COPY)>;
+
+def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(FP8|BF8|BF16|F16)_w32")>;
+def : InstRW<[WriteXDL4PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(IU8|IU4)_w32")>;
+def : InstRW<[Write4PassWMMA], (instregex "^V_WMMA_F32_16X16X4_F32_w32")>;
+def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_WMMA.*_F32_32X16X128_F4")>;
} // End GFX125xCommonWriteRes
let SchedModel = GFX1250SpeedModel in {
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index fd39b8a1350c..7a519117f248 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -463,6 +463,10 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
NewOpcode = AMDGPU::V_FMAAK_F16_fake16;
break;
+ case AMDGPU::V_FMA_F64_e64:
+ if (ST->hasFmaakFmamkF64Insts())
+ NewOpcode = AMDGPU::V_FMAAK_F64;
+ break;
}
}
@@ -497,6 +501,10 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
NewOpcode = AMDGPU::V_FMAMK_F16_fake16;
break;
+ case AMDGPU::V_FMA_F64_e64:
+ if (ST->hasFmaakFmamkF64Insts())
+ NewOpcode = AMDGPU::V_FMAMK_F64;
+ break;
}
}
@@ -961,7 +969,9 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 ||
- MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) {
+ MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64 ||
+ (MI.getOpcode() == AMDGPU::V_FMA_F64_e64 &&
+ ST->hasFmaakFmamkF64Insts())) {
shrinkMadFma(MI);
continue;
}
@@ -1058,7 +1068,11 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
// fold an immediate into the shrunk instruction as a literal operand. In
// GFX10 VOP3 instructions can take a literal operand anyway, so there is
// no advantage to doing this.
- if (ST->hasVOP3Literal() && !IsPostRA)
+ // However, if 64-bit literals are allowed we still need to shrink the
+ // instruction so that such a literal can be folded.
+ if (ST->hasVOP3Literal() &&
+ (!ST->has64BitLiterals() || AMDGPU::isTrue16Inst(MI.getOpcode())) &&
+ !IsPostRA)
continue;
if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) &&
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 2472b76fcf02..e103ccc2f00e 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -154,6 +154,10 @@ class SOP1_1 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
let has_sdst = 0;
}
+class SOP1_1_REGIMM64 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < // SOP1 pseudo with no outputs and a single SSrc_b64 source operand.
+ opName, (outs), (ins SSrc_b64:$src0), "$src0", pattern> { // Asm string prints only $src0.
+ let has_sdst = 0; // Same setting as SOP1_1; the (outs) list is empty here.
+}
class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
(ops node:$src0),
@@ -317,6 +321,9 @@ let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in {
let isBranch = 1, isIndirectBranch = 1 in {
def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">;
+
+let SubtargetPredicate = HasAddPC64Inst in
+def S_ADD_PC_I64 : SOP1_1_REGIMM64 <"s_add_pc_i64">;
} // End isBranch = 1, isIndirectBranch = 1
let isReturn = 1 in {
@@ -2130,6 +2137,9 @@ defm S_GET_BARRIER_STATE_IMM : SOP1_IMM_Real_gfx12<0x050>;
defm S_ALLOC_VGPR : SOP1_Real_gfx12<0x053>;
defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>;
+// GFX1250
+defm S_ADD_PC_I64 : SOP1_Real_gfx12<0x04b>;
+
//===----------------------------------------------------------------------===//
// SOP1 - GFX1150, GFX12
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index a32078cc403e..77258810dd68 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -296,6 +296,7 @@ unsigned getCompletionActionImplicitArgPosition(unsigned CodeObjectVersion) {
#define GET_MIMGOffsetMappingTable_IMPL
#define GET_MIMGG16MappingTable_IMPL
#define GET_MAIInstInfoTable_IMPL
+#define GET_WMMAInstInfoTable_IMPL
#include "AMDGPUGenSearchableTables.inc"
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
@@ -568,6 +569,11 @@ bool getMAIIsGFX940XDL(unsigned Opc) {
return Info && Info->is_gfx940_xdl;
}
+bool getWMMAIsXDL(unsigned Opc) {
+ // An opcode without a WMMAInstInfo table entry is reported as non-XDL.
+ const WMMAInstInfo *Entry = getWMMAInstInfoHelper(Opc);
+ return Entry && Entry->is_wmma_xdl;
+}
+
uint8_t mfmaScaleF8F6F4FormatToNumRegs(unsigned EncodingVal) {
switch (EncodingVal) {
case MFMAScaleFormats::FP6_E2M3:
@@ -639,6 +645,7 @@ bool isMAC(unsigned Opc) {
Opc == AMDGPU::V_MAC_LEGACY_F32_e64_gfx10 ||
Opc == AMDGPU::V_MAC_F16_e64_vi ||
Opc == AMDGPU::V_FMAC_F64_e64_gfx90a ||
+ Opc == AMDGPU::V_FMAC_F64_e64_gfx12 ||
Opc == AMDGPU::V_FMAC_F32_e64_gfx10 ||
Opc == AMDGPU::V_FMAC_F32_e64_gfx11 ||
Opc == AMDGPU::V_FMAC_F32_e64_gfx12 ||
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 6708e0a3f454..c9d2c286bf23 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -119,6 +119,11 @@ struct True16D16Info {
unsigned LoOp;
};
+struct WMMAInstInfo { // Row type of the generated WMMAInstInfoTable searchable table.
+ uint16_t Opcode; // Lookup key (the table's PrimaryKey).
+ bool is_wmma_xdl; // Flag reported by getWMMAIsXDL() for this opcode.
+};
+
#define GET_MIMGBaseOpcode_DECL
#define GET_MIMGDim_DECL
#define GET_MIMGEncoding_DECL
@@ -129,6 +134,7 @@ struct True16D16Info {
#define GET_isMFMA_F8F6F4Table_DECL
#define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL
#define GET_True16D16Table_DECL
+#define GET_WMMAInstInfoTable_DECL
#include "AMDGPUGenSearchableTables.inc"
namespace IsaInfo {
@@ -593,6 +599,9 @@ bool getMAIIsDGEMM(unsigned Opc);
LLVM_READONLY
bool getMAIIsGFX940XDL(unsigned Opc);
+LLVM_READONLY
+bool getWMMAIsXDL(unsigned Opc);
+
// Get an equivalent BitOp3 for a binary logical \p Opc.
// \returns BitOp3 modifier for the logical operation or zero.
// Used in VOPD3 conversion.
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 211112e5262a..f621f8581f77 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -366,6 +366,9 @@ defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, int_amdgcn_sqrt>;
let TRANS = 1, SchedRW = [WriteTrans32] in {
defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>;
defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>;
+
+let SubtargetPredicate = HasTanhInsts in
+defm V_TANH_F32 : VOP1Inst <"v_tanh_f32", VOP_F32_F32, int_amdgcn_tanh>;
} // End TRANS = 1, SchedRW = [WriteTrans32]
defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
@@ -526,6 +529,21 @@ defm V_LOG_F16 : VOP1Inst_t16 <"v_log_f16", VOP_F16_F16, AMDGPUlogf16>;
defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, AMDGPUexpf16>;
defm V_SIN_F16 : VOP1Inst_t16 <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
defm V_COS_F16 : VOP1Inst_t16 <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
+
+let SubtargetPredicate = HasTanhInsts in {
+defm V_TANH_F16 : VOP1Inst_t16 <"v_tanh_f16", VOP_F16_F16, int_amdgcn_tanh>;
+}
+
+let SubtargetPredicate = HasBF16TransInsts in {
+defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>;
+defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>;
+defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>;
+defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>;
+defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>;
+defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>;
+defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>;
+defm V_COS_BF16 : VOP1Inst_t16 <"v_cos_bf16", VOP_BF16_BF16, AMDGPUcos>;
+}
} // End TRANS = 1, SchedRW = [WriteTrans32]
defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
defm V_FREXP_EXP_I16_F16 : VOP1Inst_t16_with_profiles <"v_frexp_exp_i16_f16",
@@ -785,6 +803,9 @@ let SubtargetPredicate = isGFX1250Plus in {
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f16_fp8, V_CVT_F16_FP8_fake16_e64, 1>;
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f16_bf8, V_CVT_F16_BF8_fake16_e64, 1>;
}
+
+ defm V_SAT_PK4_I4_I8 : VOP1Inst_t16<"v_sat_pk4_i4_i8", VOP1_I16_I32, int_amdgcn_sat_pk4_i4_i8>;
+ defm V_SAT_PK4_U4_U8 : VOP1Inst_t16<"v_sat_pk4_u4_u8", VOP1_I16_I32, int_amdgcn_sat_pk4_u4_u8>;
} // End SubtargetPredicate = isGFX1250Plus
let SubtargetPredicate = isGFX10Plus in {
@@ -1062,6 +1083,13 @@ multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250<
VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_fake16", asmName>;
}
+multiclass VOP1_Real_OpSelIsDPP_gfx1250<bits<9> op> : VOP1_Real_e32<GFX1250Gen, op> { // Extends VOP1_Real_e32 with one extra _e64_gfx1250 def.
+ defvar ps = !cast<VOP_Pseudo>(NAME#"_e64"); // Looks up the existing <NAME>_e64 pseudo record.
+ def _e64_gfx1250 :
+ VOP3_Real_Gen<ps, GFX1250Gen>,
+ VOP3OpSelIsDPP_gfx12<{0, 1, 1, op{6-0}}, ps.Pfl>; // 10-bit opcode: fixed bits 0,1,1 followed by the low 7 bits of op.
+}
+
defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX12Not12_50Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX1250Gen, 0x06c, "V_CVT_F32_FP8_gfx1250", "v_cvt_f32_fp8">;
@@ -1127,11 +1155,25 @@ defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>;
defm V_MOV_B64 : VOP1_Real_FULL <GFX1250Gen, 0x1d>;
+defm V_TANH_F32 : VOP1_Real_FULL<GFX1250Gen, 0x01e>;
+defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>;
+defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>;
+defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
+defm V_PRNG_B32 : VOP1_Real_FULL<GFX1250Gen, 0x04b>;
defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
+defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x073>;
+defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x074>;
defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>;
defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x076>;
defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x077>;
defm V_CVT_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x078>;
+defm V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x079>;
+defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07a>;
+defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>;
+defm V_LOG_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07c>;
+defm V_EXP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07d>;
+defm V_SIN_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07e>;
+defm V_COS_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07f>;
//===----------------------------------------------------------------------===//
// GFX10.
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 25c6cbc3e1ab..030a6e1e978c 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -175,10 +175,14 @@ multiclass VOP2Inst_e64<string opName,
def _e64 : VOP3InstBase <opName, P, node, 1>,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
- let SubtargetPredicate = isGFX11Plus in {
- if P.HasExtVOP3DPP then
- def _e64_dpp : VOP3_DPP_Pseudo <opName, P>;
- } // End SubtargetPredicate = isGFX11Plus
+ if P.HasExtVOP3DPP then
+ def _e64_dpp : VOP3_DPP_Pseudo <opName, P> {
+ let SubtargetPredicate = isGFX11Plus;
+ }
+ else if P.HasExt64BitDPP then
+ def _e64_dpp : VOP3_DPP_Pseudo <opName, P> {
+ let OtherPredicates = [HasDPALU_DPP];
+ }
}
multiclass VOP2Inst_e64_VOPD<string opName,
@@ -1492,7 +1496,9 @@ class Base_VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps,
VOP2_DPP<op, ps, opName, p, 1> {
let AssemblerPredicate = HasDPP16;
let SubtargetPredicate = ps.SubtargetPredicate;
- let OtherPredicates = ps.OtherPredicates;
+ let OtherPredicates = !listconcat(ps.OtherPredicates,
+ !if(p.HasExt64BitDPP, [HasDPALU_DPP], []),
+ !if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], []));
}
class VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps, int subtarget,
@@ -1832,6 +1838,9 @@ let SubtargetPredicate = isGFX12Plus in {
V_SUBBREV_U32_e32, V_SUBREV_CO_CI_U32_e32_gfx12, "v_subrev_co_ci_u32">;
} // End SubtargetPredicate = isGFX12Plus
+let SubtargetPredicate = HasFmacF64Inst in
+defm V_FMAC_F64 : VOP2_Real_FULL<GFX12Gen, 0x17>;
+
defm V_FMAMK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x23>;
defm V_FMAAK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x24>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 75c531913ded..2e7f25b67fb6 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -224,12 +224,6 @@ defm V_ALIGNBIT_B32 : VOP3Inst_t16_with_profiles <"v_alignbit_b32",
fshr, null_frag>;
defm V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>;
-
-// In gfx9 and 10, opsel is allowed for V_ALIGNBIT_B32 and V_ALIGNBYTE_B32.
-// Hardware uses opsel[1:0] to byte-select src2. Other opsel bits are ignored.
-defm V_ALIGNBIT_B32_opsel : VOP3Inst <"v_alignbit_b32_opsel", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_OPSEL>>;
-defm V_ALIGNBYTE_B32_opsel : VOP3Inst <"v_alignbyte_b32_opsel", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_OPSEL>>;
-
let True16Predicate = UseRealTrue16Insts in
defm V_ALIGNBYTE_B32_t16 : VOP3Inst <"v_alignbyte_b32_t16", VOP3_Profile_True16<VOP_I32_I32_I32_I16, VOP3_OPSEL>>;
let True16Predicate = UseFakeTrue16Insts in
@@ -1960,9 +1954,6 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
}
} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
-defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14e, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">;
-defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14f, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">;
-
defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>;
let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in {
@@ -2113,8 +2104,8 @@ defm V_BFI_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14a>;
defm V_FMA_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x14b>;
defm V_FMA_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x14c>;
defm V_LERP_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x14d>;
-defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7<0x14e>;
-defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7<0x14f>;
+defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14e>;
+defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14f>;
defm V_MULLIT_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x150>;
defm V_MIN3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x151>;
defm V_MIN3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x152>;
@@ -2257,17 +2248,6 @@ multiclass VOP3_Real_BITOP3_gfx9<bits<10> op, string AsmName, bit isSingle = 0>
}
}
-// Instructions such as v_alignbyte_b32 allows op_sel in gfx9, but not in vi.
-// The following is created to support that.
-multiclass VOP3OpSel_Real_gfx9_with_name<bits<10> op, string opName, string AsmName> {
- defvar psName = opName#"_e64";
- def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(psName), SIEncodingFamily.VI>, // note: encoding family is VI
- VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(psName).Pfl> {
- VOP3_Pseudo ps = !cast<VOP3_Pseudo>(psName);
- let AsmString = AsmName # ps.AsmOperands;
- }
-}
-
} // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9"
defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>;
@@ -2287,10 +2267,8 @@ defm V_BFI_B32 : VOP3_Real_vi <0x1ca>;
defm V_FMA_F32 : VOP3_Real_vi <0x1cb>;
defm V_FMA_F64 : VOP3_Real_vi <0x1cc>;
defm V_LERP_U8 : VOP3_Real_vi <0x1cd>;
-let SubtargetPredicate = isGFX8Only in {
defm V_ALIGNBIT_B32 : VOP3_Real_vi <0x1ce>;
defm V_ALIGNBYTE_B32 : VOP3_Real_vi <0x1cf>;
-}
defm V_MIN3_F32 : VOP3_Real_vi <0x1d0>;
defm V_MIN3_I32 : VOP3_Real_vi <0x1d1>;
defm V_MIN3_U32 : VOP3_Real_vi <0x1d2>;
@@ -2335,9 +2313,6 @@ defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16"
defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">;
defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">;
-defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1ce, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">;
-defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1cf, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">;
-
defm V_MAD_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">;
defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">;
defm V_MAD_I16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x205, "v_mad_i16">;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 31997f803dfc..e51e9574f8de 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1223,6 +1223,8 @@ class WMMAOpcodeMapping<Instruction TwoAddr, Instruction ThreeAddr> {
Instruction Opcode2Addr = TwoAddr;
Instruction Opcode3Addr = ThreeAddr;
Predicate WaveSizePredicate;
+ Predicate SubtargetPredicate;
+ field bit is_wmma_xdl;
}
def WMMAOpcode : GenericEnum {
@@ -1315,28 +1317,39 @@ let WaveSizePredicate = isWave64 in {
}
class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
- bit _IsIU, bit _IsFP8BF8>
+ bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0,
+ bit _HasMatrixReuse = 0, bit _IsF4 = 0>
: VOP3P_Profile<VOPProfile<ArgTy>> {
bit IsIU = _IsIU;
- bit IsFP8BF8 = _IsFP8BF8;
- bit IsF16BF16 = !not(!or(IsIU, IsFP8BF8));
+ bit NoABMods = !or(_IsFP8BF8XF32, _IsF4); // No IMOD support for A and B
+ bit IsXF32 = !and(_IsFP8BF8XF32, !eq(ArgTy[1], v8f32));
int IndexType = _IndexType;
+ let HasMatrixReuse = _HasMatrixReuse;
+ bit HasIModOp = _Has_ImodOp;
+ let HasClamp = !and(IsIU, !not(HasIModOp));
let IsPacked = 1;
let IsWMMA = !not(_IsSWMMAC);
let IsSWMMAC = _IsSWMMAC;
- bit IsAB_F16 = !and(IsF16BF16, ArgTy[1].isFP);
- bit IsAB_BF16 = !and(IsF16BF16, isIntType<ArgTy[1]>.ret);
+ bit IsAB_F64 = !or(!eq(ArgTy[1], v2f64), !eq(ArgTy[1], v4f64));
+ bit IsAB_F32 = !eq(ArgTy[1], v2f32);
+ bit IsAB_F16 = !or(!eq(ArgTy[1], v16f16), !eq(ArgTy[1], v8f16), !eq(ArgTy[1], v4f16));
+ bit IsAB_BF16 = !or(!eq(ArgTy[1], v16i16), !eq(ArgTy[1], v8i16), !eq(ArgTy[1], v4i16),
+ !eq(ArgTy[1], v16bf16), !eq(ArgTy[1], v8bf16), !eq(ArgTy[1], v4bf16));
+ bit IsF16BF16 = !or(IsAB_F16, IsAB_BF16);
+
+ bit IsC_F64 = !eq(ArgTy[3], v8f64);
bit IsC_F32 = !or(!eq(ArgTy[3], v8f32), !eq(ArgTy[3], v4f32));
- bit IsC_BF16 = !or(!eq(ArgTy[3], v8i16), !eq(ArgTy[3], v4i16));
+ bit IsC_BF16 = !or(!eq(ArgTy[3], v8i16), !eq(ArgTy[3], v4i16),
+ !eq(ArgTy[3], v8bf16), !eq(ArgTy[3], v4bf16));
bit IsC_F16 = !or(!eq(ArgTy[3], v8f16), !eq(ArgTy[3], v4f16));
- bit NegLo01 = !or(IsF16BF16, IsIU);
- bit NegLo2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA);
- bit NegHi01 = IsF16BF16;
- bit NegHi2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA);
+ bit NegLo01 = !not(NoABMods);
+ bit NegLo2 = !and(!not(IsIU), !not(IsXF32), IsWMMA);
+ bit NegHi01 = IsF16BF16; // Only F16BF16 can have neg_hi[0:1]
+ bit NegHi2 = !and(!not(IsIU), !not(IsXF32), IsWMMA);
bit NegLoAny = !or(NegLo01, NegLo2);
bit NegHiAny = !or(NegHi01, NegHi2);
@@ -1345,19 +1358,29 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
let Src1RC64 = !cast<RegisterOperand>("VRegSrc_"#ArgTy[2].Size);
let Src2RC64 = !if(IsSWMMAC, DstRC,
!cast<RegisterOperand>("VISrc_"#ArgTy[3].Size#
- !cond(IsC_F32: "_f32",
- IsC_F16: "_f16",
+ !cond(IsC_F64: "_f64",
+ IsC_F32: "_f32",
+ IsC_F16: "_f16",
IsC_BF16: "_bf16",
1: "_b32")));
// For f16 and bf16 matrices A and B, each element can be modified by
- // fneg(neg_lo,neg_hi = 1). For iu4 and iu8 matrices A and B neg_lo is
+ // fneg(neg_lo,neg_hi = 1). For f32 and f64, neg_lo[0:1] is allowed, but
+ // neg_hi[0:1] is ignored. For iu4 and iu8 matrices A and B neg_lo is
// overloaded to mean unsigned/signed: neg_lo = 0 (u4 and u8) unsigned(zext)
- // neg_lo = 1 (i4 and i8) signed(sext). For f16, bf16 and f32 matrix C each
- // element can be modified by fneg(neg_lo = 1) or fabs(neg_hi = 1).
+ // neg_lo = 1 (i4 and i8) signed(sext). For f16, bf16, f32 and f64 matrix C
+ // each element can be modified by fneg(neg_lo = 1) or fabs(neg_hi = 1).
// Opcode | src0/src1 - matrix A/B | src2 - matrix C or Index
// ---------------------------------------------------------------------------
+ // wmma f64_f64 | neg_lo for neg A/B | neg_lo = 1 neg C(f64)
+ // | neg_hi ignored | neg_hi = 1 abs C(f64)
+ // ---------------------------------------------------------------------------
+ // wmma f32_f32 | neg_lo for neg A/B | neg_lo = 1 neg C(f32)
+ // | neg_hi ignored | neg_hi = 1 abs C(f32)
+ // ---------------------------------------------------------------------------
+ // wmma f32_xf32 | not allowed for xf32 | not allowed
+ // ---------------------------------------------------------------------------
// wmma f32_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f32)
// wmma f32_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f32)
// ---------------------------------------------------------------------------
@@ -1368,7 +1391,10 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
// | neg_lo = 1 i4/i8(sext) | i32 matrices
// ---------------------------------------------------------------------------
// wmma f32_fp8/bf8 | not allowed for | neg_lo = 1 neg C(f32)
- // (4 instructions) | f8 and bf8 matrices | neg_hi = 1 abs C(f32)
+ // | fp8 and bf8 matrices | neg_hi = 1 abs C(f32)
+ // ---------------------------------------------------------------------------
+ // wmma f16_fp8/bf8 | not allowed for | neg_lo = 1 neg C(f16)
+ // | fp8 and bf8 matrices | neg_hi = 1 abs C(f16)
// ---------------------------------------------------------------------------
// swmmac f32_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix
// swmmac f32_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst
@@ -1380,103 +1406,153 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
// | neg_lo = 1 i4/i8(sext) | A Index - matrix C is in dst
// ---------------------------------------------------------------------------
// swmmac f32_fp8/bf8 | not allowed for | not allowed for sparse matrix
- // (4 instructions) | f8 and bf8 matrices | A Index - matrix C is in dst
+ // swmmac f16_fp8/bf8 | f8 and bf8 matrices | A Index - matrix C is in dst
+ // ---------------------------------------------------------------------------
// pseudo
- // fp8bf8 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16
+ // fp8bf8 and xf32 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16
// use neg_lo and neg_hi. iu wmmas (C is i32) don't use src 2 modifiers,
// remaining wmmas(f16, bf16 and f8bf8) use neg_lo and neg_hi for C (C is f32
// f16 or bf16). swmmac use index_key and don't use src 2 modifiers.
-
- dag Src0Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src0_modifiers));
- dag Src1Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src1_modifiers));
- dag Src2Mods = !if(IsIU, (ins), (ins PackedF16InputMods:$src2_modifiers));
+ dag Src0Mods = !if(NoABMods, (ins), (ins PackedF16InputMods:$src0_modifiers));
+ dag Src1Mods = !if(NoABMods, (ins), (ins PackedF16InputMods:$src1_modifiers));
+ dag Src2Mods = !if(!or(IsIU, IsXF32, IsSWMMAC), (ins), (ins PackedF16InputMods:$src2_modifiers));
dag IndexKey = !cond(!eq(IndexType, 0) : (ins),
!eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit),
- !eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit));
- dag Clamp = !if(IsIU, (ins Clamp0:$clamp), (ins));
+ !eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit),
+ !eq(IndexType, 32): (ins IndexKey32bit:$index_key_32bit));
+
+ dag MatrixReuse = !if(HasMatrixReuse, (ins MatrixAReuse:$matrix_a_reuse, MatrixBReuse:$matrix_b_reuse), (ins));
+ dag Clamp = !if(HasClamp, (ins Clamp0:$clamp), (ins));
dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi),
!and(NegLoAny, !not(NegHiAny)) : (ins neg_lo0:$neg_lo),
!and(!not(NegLoAny), !not(NegHiAny)) : (ins));
let InsVOP3P = !con(Src0Mods, (ins Src0RC64:$src0), Src1Mods, (ins Src1RC64:$src1),
!cond(IsWMMA : !con(Src2Mods, (ins Src2RC64:$src2)),
- IsSWMMAC : !con((ins DstRC:$srcTiedDef), (ins VRegSrc_32:$src2), IndexKey)),
- Clamp, Neg);
+ IsSWMMAC : !con((ins DstRC:$srcTiedDef),
+ !if(!eq(IndexType, 32),
+ (ins VRegSrc_64:$src2),
+ (ins VRegSrc_32:$src2)),
+ IndexKey)),
+ MatrixReuse, Clamp, Neg);
// asm
string IndexKeyAsm = !cond(!eq(IndexType, 0) : "",
!eq(IndexType, 8) : "$index_key_8bit",
- !eq(IndexType, 16) : "$index_key_16bit");
- string ClampAsm = !if(IsIU, "$clamp", "");
+ !eq(IndexType, 16) : "$index_key_16bit",
+ !eq(IndexType, 32) : "$index_key_32bit");
+ string MatrixReuseAsm = !if(HasMatrixReuse, "$matrix_a_reuse$matrix_b_reuse", "");
+ string ClampAsm = !if(HasClamp, "$clamp", "");
string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi",
!and(NegLoAny, !not(NegHiAny)) : "$neg_lo",
!and(!not(NegLoAny), !not(NegHiAny)) : "");
- let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#NegAsm#ClampAsm;
+ let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrixReuseAsm#NegAsm#ClampAsm;
// isel patterns
+ bit IsAB_BF16_IMod0 = !and(IsAB_BF16, !not(HasIModOp));
+ bit IsAB_F16_IMod0 = !and(IsAB_F16, !not(HasIModOp));
+ bit IsAB_F32F64_IMod1 = !and(!or(IsAB_F64, IsAB_F32), HasIModOp);
+ bit IsAB_F16BF16_IMod1 = !and(!or(IsAB_F16, IsAB_BF16), HasIModOp);
+ dag Src0InPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0),
+ IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs i32:$src0_modifiers), Src0VT:$src0),
+ IsAB_F16_IMod0 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))),
+ IsAB_BF16_IMod0 : (ins Src0VT:$src0),
+ IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0),
+ NoABMods : (ins Src0VT:$src0));
+ dag Src0OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0),
+ IsAB_F16BF16_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0),
+ IsAB_F16_IMod0 : (ins i32:$src0_modifiers, Src0VT:$src0),
+ IsAB_BF16_IMod0 : (ins (i32 8), Src0VT:$src0),
+ IsIU : (ins i32:$src0_modifiers, Src0VT:$src0),
+ NoABMods : (ins Src0VT:$src0));
+ dag Src1InPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1),
+ IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs i32:$src1_modifiers), Src1VT:$src1),
+ IsAB_F16_IMod0 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))),
+ IsAB_BF16_IMod0 : (ins Src1VT:$src1),
+ IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1),
+ NoABMods : (ins Src1VT:$src1));
+ dag Src1OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1),
+ IsAB_F16BF16_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1),
+ IsAB_F16_IMod0 : (ins i32:$src1_modifiers, Src1VT:$src1),
+ IsAB_BF16_IMod0 : (ins (i32 8), Src1VT:$src1),
+ IsIU : (ins i32:$src1_modifiers, Src1VT:$src1),
+ NoABMods : (ins Src1VT:$src1));
+ bit IsC_IMod1 = !and(HasIModOp, IsWMMA, !not(IsIU), !not(IsXF32));
+ bit IsC_F32_IMod0 = !and(IsC_F32, !not(HasIModOp));
+ bit IsC_F16_IMod0 = !and(IsC_F16, !not(HasIModOp));
+ bit IsC_BF16_IMod0 = !and(IsC_BF16, !not(HasIModOp));
+ bit IsIUXF32 = !or(IsIU, IsXF32);
+ dag Src2InPatWmma = !cond(IsC_IMod1 : (ins (VOP3PModsNegAbs i32:$src2_modifiers), Src2VT:$src2),
+ IsC_F32_IMod0 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))),
+ IsC_F16_IMod0 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))),
+ IsC_BF16_IMod0 : (ins Src2VT:$src2),
+ IsIUXF32 : (ins Src2VT:$src2),
+ IsSWMMAC : (ins));
+ dag Src2OutPatWmma = !cond(IsC_IMod1 : (ins i32:$src2_modifiers, Src2VT:$src2),
+ IsC_F32_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2),
+ IsC_F16_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2),
+ IsC_BF16_IMod0 : (ins (i32 8), Src2VT:$src2),
+ IsIUXF32 : (ins Src2VT:$src2),
+ IsSWMMAC : (ins));
+ dag ClampPat = !if(HasClamp, (ins i1:$clamp), (ins));
- dag Src0InPat = !cond(IsAB_F16 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))),
- IsAB_BF16 : (ins Src0VT:$src0),
- IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0),
- IsFP8BF8 : (ins Src0VT:$src0));
- dag Src0OutPat = !cond(IsAB_F16 : (ins i32:$src0_modifiers, Src0VT:$src0),
- IsAB_BF16 : (ins (i32 8), Src0VT:$src0),
- IsIU : (ins i32:$src0_modifiers, Src0VT:$src0),
- IsFP8BF8 : (ins Src0VT:$src0));
- dag Src1InPat = !cond(IsAB_F16 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))),
- IsAB_BF16 : (ins Src1VT:$src1),
- IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1),
- IsFP8BF8 : (ins Src1VT:$src1));
- dag Src1OutPat = !cond(IsAB_F16 : (ins i32:$src1_modifiers, Src1VT:$src1),
- IsAB_BF16 : (ins (i32 8), Src1VT:$src1),
- IsIU : (ins i32:$src1_modifiers, Src1VT:$src1),
- IsFP8BF8 : (ins Src1VT:$src1));
- dag Src2InPatWmma = !cond(IsC_F32 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))),
- IsC_F16 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))),
- IsC_BF16 : (ins Src2VT:$src2),
- IsIU : (ins Src2VT:$src2),
- IsSWMMAC : (ins));
- dag Src2OutPatWmma = !cond(IsC_F32 : (ins i32:$src2_modifiers, Src2VT:$src2),
- IsC_F16 : (ins i32:$src2_modifiers, Src2VT:$src2),
- IsC_BF16 : (ins (i32 8), Src2VT:$src2),
- IsIU : (ins Src2VT:$src2),
- IsSWMMAC : (ins));
- dag ClampPat = !if(IsIU, (ins i1:$clamp), (ins));
dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2),
!eq(IndexType, 8) : (ins (i32 (SWMMACIndex8 i32:$src2, i32:$index_key_8bit))),
- !eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit))));
+ !eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit))),
+ !eq(IndexType, 32): (ins (i64 (SWMMACIndex32 i64:$src2, i32:$index_key_32bit))));
dag IndexOutPat = !cond(!eq(IndexType, 0) : (ins i32:$src2),
!eq(IndexType, 8) : (ins i32:$src2, i32:$index_key_8bit),
- !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit));
- dag Src2InlineInPat = (ins (Src2VT (WMMAVISrc Src2VT:$src2)));
- dag Src2InlineOutPat = !con(!if(IsIU, (ins), (ins (i32 8))), (ins Src2VT:$src2));
+ !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit),
+ !eq(IndexType, 32): (ins i64:$src2, i32:$index_key_32bit));
+ dag Src2InlineInPat = !con(!if(IsC_IMod1, (ins (VOP3PModsNegAbs i32:$src2_modifiers)), (ins)), (ins (Src2VT (WMMAVISrc Src2VT:$src2))));
+ dag Src2InlineOutPat = !con(!if(IsIUXF32, (ins), !if(IsC_IMod1, (ins i32:$src2_modifiers), (ins (i32 8)))), (ins Src2VT:$src2));
+ dag MatrixReuseInPat = !if(HasMatrixReuse, (ins timm:$matrix_a_reuse, timm:$matrix_b_reuse), (ins));
+ dag MatrixReuseOutModPat = !if(HasMatrixReuse, (ins i1:$matrix_a_reuse, i1:$matrix_b_reuse), (ins));
- dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, ClampPat);
- dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, ClampPat);
+ dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixReuseInPat, ClampPat);
+ dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixReuseOutModPat, ClampPat);
- dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, ClampPat);
- dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, ClampPat);
+ dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, MatrixReuseInPat, ClampPat);
+ dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, MatrixReuseOutModPat, ClampPat);
// wmma pattern where src2 is inline imm uses _threeaddr pseudo,
// can't use _twoaddr since it would violate src2 tied to vdst constraint.
- dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, ClampPat);
- dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, ClampPat);
+ dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixReuseInPat, ClampPat);
+ dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixReuseOutModPat, ClampPat);
}
-multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string PseudoInstrSuffix> {
+def WMMAInstInfoTable : GenericTable { // Searchable table holding one WMMAInstInfo row per opcode.
+ let FilterClass = "WMMAInstInfo"; // Rows are the records that derive from class WMMAInstInfo.
+ let CppTypeName = "WMMAInstInfo"; // Name of the C++ row struct.
+ let Fields = ["Opcode", "is_wmma_xdl"]; // Columns emitted for each row.
+
+ let PrimaryKey = ["Opcode"]; // Rows are looked up by Opcode.
+ let PrimaryKeyName = "getWMMAInstInfoHelper"; // Name of the generated lookup function.
+}
+
+class WMMAInstInfo { // Mixin: records deriving from it are picked up by WMMAInstInfoTable's FilterClass.
+ Instruction Opcode = !cast<Instruction>(NAME); // Resolves to the record that mixes this class in.
+ bit is_wmma_xdl = 0; // Default value; presumably overridden on XDL instructions (not visible in this hunk).
+}
+
+multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string PseudoInstrSuffix, bit DiffVdstSrc2 = 0> {
+
+ defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2");
+ defvar WMMAConstraints3Addr = "@earlyclobber $vdst";
+
let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
- let Constraints = "@earlyclobber $vdst,$vdst = $src2", isConvertibleToThreeAddress = 1 in
- def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>{
+ let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in
+ def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo {
let PseudoInstr = Instr#PseudoInstrSuffix;
}
- let Constraints = "@earlyclobber $vdst", SchedRW = [Write32Bit, Write32Bit] in
- def _threeaddr : VOP3P_Pseudo<Instr, WMMAProfile>{
+ let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in
+ def _threeaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo {
let PseudoInstr = Instr#PseudoInstrSuffix;
}
@@ -1486,7 +1562,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse
}
multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string PseudoInstrSuffix> {
- def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>{
+ def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo {
let Mnemonic = Instr;
let PseudoInstr = Instr#PseudoInstrSuffix;
let mayRaiseFPException = 0;
@@ -1556,6 +1632,76 @@ def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1,
// *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored
// for matrix A, index is i16; Matrix B uses all lanes
+def F64_F64X4_WMMA_w32 : VOP3PWMMA_Profile<[v8f64, v2f64, v2f64, v8f64], 0, 0, 0, 0, 1>;
+def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 1>;
+def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 1>;
+def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 1>;
+def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 1>;
+def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 1>;
+def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 1>;
+def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1>;
+def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1>;
+def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 1>;
+def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 1>;
+def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 1>;
+def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 1>;
+def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 1>;
+def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 1>;
+def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 1>;
+def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 1>;
+def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 1>;
+def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 1>;
+def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 1>;
+
+let WaveSizePredicate = isWave32 in {
+let SubtargetPredicate = isGFX125xOnly in {
+defm V_WMMA_F32_16X16X4_F32_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x4_f32", F32_F32_WMMA_w32, "_w32">;
+
+let is_wmma_xdl = 1 in {
+defm V_WMMA_F32_16X16X32_BF16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x32_bf16", F32_BF16X32_WMMA_w32, "_w32">;
+defm V_WMMA_BF16_16X16X32_BF16_w32 : WMMAInstGFX12<"v_wmma_bf16_16x16x32_bf16", BF16_BF16X32_WMMA_w32, "_w32">;
+defm V_WMMA_BF16F32_16X16X32_BF16_w32 : WMMAInstGFX12<"v_wmma_bf16f32_16x16x32_bf16", BF16F32_BF16_WMMA_w32, "_w32", 1>;
+defm V_WMMA_F32_16X16X64_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x64_fp8_fp8", F32_FP8BF8X64_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X64_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x64_fp8_bf8", F32_FP8BF8X64_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X64_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x64_bf8_fp8", F32_FP8BF8X64_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X64_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x64_bf8_bf8", F32_FP8BF8X64_WMMA_w32, "_w32">;
+defm V_WMMA_F16_16X16X64_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x64_fp8_fp8", F16_FP8BF8X64_WMMA_w32, "_w32">;
+defm V_WMMA_F16_16X16X64_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x64_fp8_bf8", F16_FP8BF8X64_WMMA_w32, "_w32">;
+defm V_WMMA_F16_16X16X64_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x64_bf8_fp8", F16_FP8BF8X64_WMMA_w32, "_w32">;
+defm V_WMMA_F16_16X16X64_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x64_bf8_bf8", F16_FP8BF8X64_WMMA_w32, "_w32">;
+defm V_WMMA_I32_16X16X64_IU8_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x64_iu8", I32_IU8X64_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X32_F16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x32_f16", F32_F16X32_WMMA_w32, "_w32">;
+defm V_WMMA_F16_16X16X32_F16_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x32_f16", F16_F16X32_WMMA_w32, "_w32">;
+defm V_WMMA_F16_16X16X128_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x128_fp8_fp8", F16_FP8BF8X128_WMMA_w32, "_w32">;
+defm V_WMMA_F16_16X16X128_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x128_fp8_bf8", F16_FP8BF8X128_WMMA_w32, "_w32">;
+defm V_WMMA_F16_16X16X128_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x128_bf8_fp8", F16_FP8BF8X128_WMMA_w32, "_w32">;
+defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x128_bf8_bf8", F16_FP8BF8X128_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X128_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x128_fp8_fp8", F32_FP8BF8X128_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X128_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x128_fp8_bf8", F32_FP8BF8X128_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X128_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x128_bf8_fp8", F32_FP8BF8X128_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X128_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x128_bf8_bf8", F32_FP8BF8X128_WMMA_w32, "_w32">;
+defm V_WMMA_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_f32_32x16x128_f4", F32_32X16X128_F4_WMMA_w32, "_w32">;
+
+defm V_SWMMAC_F32_16X16X64_BF16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x64_bf16", F32_BF16X64_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_BF16_16X16X64_BF16_w32 : SWMMACInstGFX12<"v_swmmac_bf16_16x16x64_bf16", BF16_BF16X64_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_BF16F32_16X16X64_BF16_w32 : SWMMACInstGFX12<"v_swmmac_bf16f32_16x16x64_bf16", F32_BF16X64_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F32_16X16X128_FP8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x128_fp8_fp8", F32_FP8BF8X128_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F32_16X16X128_FP8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x128_fp8_bf8", F32_FP8BF8X128_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F32_16X16X128_BF8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x128_bf8_fp8", F32_FP8BF8X128_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F32_16X16X128_BF8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x128_bf8_bf8", F32_FP8BF8X128_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F16_16X16X128_FP8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x128_fp8_fp8", F16_FP8BF8X128_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F16_16X16X128_FP8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x128_fp8_bf8", F16_FP8BF8X128_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F16_16X16X128_BF8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x128_bf8_fp8", F16_FP8BF8X128_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F16_16X16X128_BF8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x128_bf8_bf8", F16_FP8BF8X128_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_I32_16X16X128_IU8_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x128_iu8", I32_IU8X128_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F32_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x64_f16", F32_F16X64_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F16_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x64_f16", F16_F16X64_SWMMAC_w32, "_w32">;
+
+} // End is_wmma_xdl = 1.
+
+} // End SubtargetPredicate = isGFX125xOnly
+} // End WaveSizePredicate = isWave32
+
let WaveSizePredicate = isWave32 in {
defm V_WMMA_F32_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w32, "_w32">;
defm V_WMMA_F32_16X16X16_BF16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w32, "_w32">;
@@ -1628,7 +1774,7 @@ class SWMMACPat_w64<Instruction Inst, SDPatternOperator node, VOP3PWMMA_Profile
let WaveSizePredicate = isWave64;
}
-let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12Plus in {
+let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12PlusNot12_50 in {
defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w32", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w32", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w32", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w32,1>;
@@ -1655,7 +1801,7 @@ let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12Plus in {
def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w32>;
}
-let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in {
+let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in {
defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w64", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w64>;
defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w64", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w64>;
defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w64", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w64,1>;
@@ -1681,6 +1827,49 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in {
def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w64>;
}
+let WaveSizePredicate = isWave32 in {
+let SubtargetPredicate = isGFX125xOnly in {
+ defm : WMMAPat<"V_WMMA_F32_16X16X4_F32_w32", int_amdgcn_wmma_f32_16x16x4_f32, F32_F32_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X32_BF16_w32", int_amdgcn_wmma_f32_16x16x32_bf16, F32_BF16X32_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_BF16_16X16X32_BF16_w32", int_amdgcn_wmma_bf16_16x16x32_bf16, BF16_BF16X32_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_BF16F32_16X16X32_BF16_w32", int_amdgcn_wmma_bf16f32_16x16x32_bf16, BF16F32_BF16_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X64_FP8_FP8_w32", int_amdgcn_wmma_f32_16x16x64_fp8_fp8, F32_FP8BF8X64_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X64_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x64_fp8_bf8, F32_FP8BF8X64_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X64_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x64_bf8_fp8, F32_FP8BF8X64_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X64_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x64_bf8_bf8, F32_FP8BF8X64_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X64_FP8_FP8_w32", int_amdgcn_wmma_f16_16x16x64_fp8_fp8, F16_FP8BF8X64_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X64_FP8_BF8_w32", int_amdgcn_wmma_f16_16x16x64_fp8_bf8, F16_FP8BF8X64_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X64_BF8_FP8_w32", int_amdgcn_wmma_f16_16x16x64_bf8_fp8, F16_FP8BF8X64_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X64_BF8_BF8_w32", int_amdgcn_wmma_f16_16x16x64_bf8_bf8, F16_FP8BF8X64_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_I32_16X16X64_IU8_w32", int_amdgcn_wmma_i32_16x16x64_iu8, I32_IU8X64_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X32_F16_w32", int_amdgcn_wmma_f32_16x16x32_f16, F32_F16X32_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X32_F16_w32", int_amdgcn_wmma_f16_16x16x32_f16, F16_F16X32_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X128_FP8_FP8_w32", int_amdgcn_wmma_f16_16x16x128_fp8_fp8, F16_FP8BF8X128_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X128_FP8_BF8_w32", int_amdgcn_wmma_f16_16x16x128_fp8_bf8, F16_FP8BF8X128_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X128_BF8_FP8_w32", int_amdgcn_wmma_f16_16x16x128_bf8_fp8, F16_FP8BF8X128_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X128_BF8_BF8_w32", int_amdgcn_wmma_f16_16x16x128_bf8_bf8, F16_FP8BF8X128_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X128_FP8_FP8_w32", int_amdgcn_wmma_f32_16x16x128_fp8_fp8, F32_FP8BF8X128_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X128_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_fp8_bf8, F32_FP8BF8X128_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_fp8, F32_FP8BF8X128_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_bf8, F32_FP8BF8X128_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_32X16X128_F4_w32", int_amdgcn_wmma_f32_32x16x128_f4, F32_32X16X128_F4_WMMA_w32>;
+
+ def : SWMMACPat<V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16_16x16x64_bf16, BF16_BF16X64_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x128_fp8_fp8, F32_FP8BF8X128_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x128_fp8_bf8, F32_FP8BF8X128_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x128_bf8_fp8, F32_FP8BF8X128_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x128_bf8_bf8, F32_FP8BF8X128_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x128_fp8_fp8, F16_FP8BF8X128_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x128_fp8_bf8, F16_FP8BF8X128_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x128_bf8_fp8, F16_FP8BF8X128_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x128_bf8_bf8, F16_FP8BF8X128_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr, int_amdgcn_swmmac_i32_16x16x128_iu8, I32_IU8X128_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X64_F16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x64_f16, F32_F16X64_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F16_16X16X64_F16_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x64_f16, F16_F16X64_SWMMAC_w32>;
+} // End SubtargetPredicate = isGFX125xOnly
+} // End WaveSizePredicate = isWave32
//===----------------------------------------------------------------------===//
// Begin Real Encodings
@@ -1726,13 +1915,14 @@ class VOP3PeWmma<bits<8> op, VOPProfile P, VOP3PWMMA_Profile WMMAP>
// opsel
let Inst{11} = !cond(!eq(WMMAP.IndexType, 0) : 0,
!eq(WMMAP.IndexType, 8) : index_key_8bit{0},
- !eq(WMMAP.IndexType, 16) : index_key_16bit{0});
+ !eq(WMMAP.IndexType, 16) : index_key_16bit{0},
+ !eq(WMMAP.IndexType, 32) : index_key_32bit{0});
let Inst{12} = !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0);
- let Inst{13} = 0;
+ let Inst{13} = !if(WMMAP.HasMatrixReuse, matrix_a_reuse, 0);
// opsel_hi
let Inst{59} = 1;
let Inst{60} = 1;
- let Inst{14} = 1;
+ let Inst{14} = !if(WMMAP.HasMatrixReuse, matrix_b_reuse, 1);
// neg_lo
let Inst{61} = !if(WMMAP.NegLo01, src0_modifiers{0}, 0);
let Inst{62} = !if(WMMAP.NegLo01, src1_modifiers{0}, 0);
@@ -1742,7 +1932,7 @@ class VOP3PeWmma<bits<8> op, VOPProfile P, VOP3PWMMA_Profile WMMAP>
let Inst{9} = !if(WMMAP.NegHi01, src1_modifiers{1}, 0);
let Inst{10} = !if(WMMAP.NegHi2, src2_modifiers{1}, 0);
// clamp
- let Inst{15} = !if(WMMAP.IsIU, clamp{0}, 0);
+ let Inst{15} = !if(WMMAP.HasClamp, clamp{0}, 0);
}
multiclass VOP3P_WMMA_Real_Base<GFXGen Gen, bits<8> op, VOP3PWMMA_Profile WMMAP,
@@ -1765,6 +1955,12 @@ multiclass VOP3P_Real_WMMA_gfx12w64 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
}
}
+multiclass VOP3P_Real_WMMA_gfx1250 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
+ let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in {
+ defm _twoaddr : VOP3P_WMMA_Real_Base <GFX1250Gen, op, WMMAP>;
+ }
+}
+
defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>;
defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>;
defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>;
@@ -1814,6 +2010,46 @@ defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x058, F32_FP
defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>;
defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>;
+defm V_WMMA_F32_16X16X4_F32_w32 : VOP3P_Real_WMMA_gfx1250 <0x05d, F32_F32_WMMA_w32>;
+defm V_WMMA_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x062, F32_BF16X32_WMMA_w32>;
+defm V_WMMA_F32_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x060, F32_F16X32_WMMA_w32>;
+defm V_WMMA_F16_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x061, F16_F16X32_WMMA_w32>;
+defm V_WMMA_BF16_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x063, BF16_BF16X32_WMMA_w32>;
+defm V_WMMA_BF16F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x064, BF16F32_BF16_WMMA_w32>;
+defm V_WMMA_F32_16X16X64_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06a, F32_FP8BF8X64_WMMA_w32>;
+defm V_WMMA_F32_16X16X64_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06b, F32_FP8BF8X64_WMMA_w32>;
+defm V_WMMA_F32_16X16X64_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06c, F32_FP8BF8X64_WMMA_w32>;
+defm V_WMMA_F32_16X16X64_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06d, F32_FP8BF8X64_WMMA_w32>;
+defm V_WMMA_F16_16X16X64_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06e, F16_FP8BF8X64_WMMA_w32>;
+defm V_WMMA_F16_16X16X64_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06f, F16_FP8BF8X64_WMMA_w32>;
+defm V_WMMA_F16_16X16X64_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x070, F16_FP8BF8X64_WMMA_w32>;
+defm V_WMMA_F16_16X16X64_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x071, F16_FP8BF8X64_WMMA_w32>;
+defm V_WMMA_I32_16X16X64_IU8_w32 : VOP3P_Real_WMMA_gfx1250 <0x072, I32_IU8X64_WMMA_w32>;
+defm V_WMMA_F32_16X16X128_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x080, F32_FP8BF8X128_WMMA_w32>;
+defm V_WMMA_F32_16X16X128_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x081, F32_FP8BF8X128_WMMA_w32>;
+defm V_WMMA_F32_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x082, F32_FP8BF8X128_WMMA_w32>;
+defm V_WMMA_F32_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x083, F32_FP8BF8X128_WMMA_w32>;
+defm V_WMMA_F16_16X16X128_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x084, F16_FP8BF8X128_WMMA_w32>;
+defm V_WMMA_F16_16X16X128_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x085, F16_FP8BF8X128_WMMA_w32>;
+defm V_WMMA_F16_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x086, F16_FP8BF8X128_WMMA_w32>;
+defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x087, F16_FP8BF8X128_WMMA_w32>;
+defm V_WMMA_F32_32X16X128_F4_w32 : VOP3P_Real_WMMA_gfx1250 <0x088, F32_32X16X128_F4_WMMA_w32>;
+
+defm V_SWMMAC_F32_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>;
+defm V_SWMMAC_F16_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x067, F16_F16X64_SWMMAC_w32>;
+defm V_SWMMAC_BF16_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x068, BF16_BF16X64_SWMMAC_w32>;
+defm V_SWMMAC_BF16F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x069, F32_BF16X64_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X128_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x073, F32_FP8BF8X128_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X128_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x074, F32_FP8BF8X128_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x075, F32_FP8BF8X128_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x076, F32_FP8BF8X128_SWMMAC_w32>;
+defm V_SWMMAC_F16_16X16X128_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x077, F16_FP8BF8X128_SWMMAC_w32>;
+defm V_SWMMAC_F16_16X16X128_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x078, F16_FP8BF8X128_SWMMAC_w32>;
+defm V_SWMMAC_F16_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x079, F16_FP8BF8X128_SWMMAC_w32>;
+defm V_SWMMAC_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x07a, F16_FP8BF8X128_SWMMAC_w32>;
+defm V_SWMMAC_I32_16X16X128_IU8_w32 : VOP3P_Real_WMMA_gfx1250 <0x07b, I32_IU8X128_SWMMAC_w32>;
+
multiclass VOP3P_Real_with_name<GFXGen Gen, bits<8> op,
string backing_ps_name = NAME,
string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index df215d23f7f4..a25ebdf3e5f6 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -331,10 +331,19 @@ class VOP3OpSel_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
// Special case for v_permlane16_swap_b32/v_permlane32_swap_b32
// op_sel[0]/op_sel[1] are treated as bound_ctrl and fi dpp operands.
-class VOP3OpSelIsDPP_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
+class VOP3OpSelIsDPP_base {
bits<1> fi;
bits<1> bound_ctrl;
+}
+
+class VOP3OpSelIsDPP_gfx9 <bits<10> op, VOPProfile P> : VOP3OpSelIsDPP_base, VOP3e_vi <op, P> {
+ // OPSEL[0] specifies FI
+ let Inst{11} = fi;
+ // OPSEL[1] specifies BOUND_CTRL
+ let Inst{12} = bound_ctrl;
+}
+class VOP3OpSelIsDPP_gfx12 <bits<10> op, VOPProfile P> : VOP3OpSelIsDPP_base, VOP3e_gfx11_gfx12 <op, P> {
// OPSEL[0] specifies FI
let Inst{11} = fi;
// OPSEL[1] specifies BOUND_CTRL
@@ -432,7 +441,7 @@ class VOP3be <VOPProfile P> : Enc64 {
let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0);
}
-class VOP3Pe <VOPProfile P> : Enc64 {
+class VOP3Pe_Base {
bits<8> vdst;
bits<4> src0_modifiers;
bits<9> src0;
@@ -443,7 +452,12 @@ class VOP3Pe <VOPProfile P> : Enc64 {
bits<1> clamp;
bits<2> index_key_8bit;
bits<1> index_key_16bit;
+ bits<1> index_key_32bit;
+ bits<1> matrix_a_reuse;
+ bits<1> matrix_b_reuse;
+}
+class VOP3Pe <VOPProfile P> : Enc64, VOP3Pe_Base {
let Inst{7-0} = !if(P.HasDst, vdst, 0);
let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0
let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1
@@ -451,9 +465,13 @@ class VOP3Pe <VOPProfile P> : Enc64 {
let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0)
let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1)
- let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2)
+ let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2},
+ !if(P.HasMatrixReuse, matrix_a_reuse, 0)); // op_sel(2)
- let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(2)
+ let Inst{14} = !cond(!and(P.HasSrc2, P.HasOpSel) : src2_modifiers{3},
+ P.IsDOT : 1,
+ P.HasMatrixReuse : matrix_b_reuse,
+ 1: ?); // op_sel_hi(2)
let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index 850b00406f09..1c42f44765ab 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -2041,12 +2041,6 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
}
break;
}
- case ARM::TRAPNaCl: {
- uint32_t Val = 0xe7fedef0UL;
- OutStreamer->AddComment("trap");
- ATS.emitInst(Val);
- return;
- }
case ARM::tTRAP: {
// Non-Darwin binutils don't yet support the "trap" mnemonic.
// FIXME: Remove this special case when they do.
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 52302241fe36..57141ab69223 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -2542,9 +2542,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
}
case ARM::Int_eh_sjlj_dispatchsetup: {
MachineFunction &MF = *MI.getParent()->getParent();
- const ARMBaseInstrInfo *AII =
- static_cast<const ARMBaseInstrInfo*>(TII);
- const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
+ const ARMBaseRegisterInfo &RI = TII->getRegisterInfo();
// For functions using a base pointer, we rematerialize it (via the frame
// pointer) here since eh.sjlj.setjmp and eh.sjlj.longjmp don't do it
// for us. Otherwise, expand to nothing.
diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp
index 06499a3945ee..7ba2487d2390 100644
--- a/llvm/lib/Target/ARM/ARMFastISel.cpp
+++ b/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -2562,8 +2562,7 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
const TargetRegisterClass *RC = isThumb2 ? &ARM::tGPRRegClass
: &ARM::GPRRegClass;
- const ARMBaseRegisterInfo *RegInfo =
- static_cast<const ARMBaseRegisterInfo *>(Subtarget->getRegisterInfo());
+ const ARMBaseRegisterInfo *RegInfo = Subtarget->getRegisterInfo();
Register FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
Register SrcReg = FramePtr;
@@ -2636,12 +2635,8 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
return SelectCall(&I, "memset");
}
case Intrinsic::trap: {
- unsigned Opcode;
- if (Subtarget->isThumb())
- Opcode = ARM::tTRAP;
- else
- Opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opcode));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
+ TII.get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP));
return true;
}
}
diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td
index bb437698296c..9b1fa5d7b99d 100644
--- a/llvm/lib/Target/ARM/ARMFeatures.td
+++ b/llvm/lib/Target/ARM/ARMFeatures.td
@@ -451,12 +451,6 @@ def FeatureVirtualization : SubtargetFeature<"virtualization",
"Supports Virtualization extension",
[FeatureHWDivThumb, FeatureHWDivARM]>;
-// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too.
-// See ARMInstrInfo.td for details.
-// True if NaCl TRAP instruction is generated instead of the regular TRAP.
-def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true",
- "NaCl trap">;
-
// True if the subtarget disallows unaligned memory
// accesses for some types. For details, see
// ARMTargetLowering::allowsMisalignedMemoryAccesses().
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index 50d8eee8644c..a8da70eadea5 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -1747,9 +1747,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
RetOpcode == ARM::TCRETURNrinotr12);
isInterrupt =
RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR;
- isTrap =
- RetOpcode == ARM::TRAP || RetOpcode == ARM::TRAPNaCl ||
- RetOpcode == ARM::tTRAP;
+ isTrap = RetOpcode == ARM::TRAP || RetOpcode == ARM::tTRAP;
isCmseEntry = (RetOpcode == ARM::tBXNS || RetOpcode == ARM::tBXNS_RET);
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index fb72bab03e75..fd3b0525c105 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -3545,8 +3545,7 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
auto T = const_cast<Type*>(CP->getType());
auto C = const_cast<Constant*>(CP->getConstVal());
- auto M = const_cast<Module*>(DAG.getMachineFunction().
- getFunction().getParent());
+ auto M = DAG.getMachineFunction().getFunction().getParent();
auto GV = new GlobalVariable(
*M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
@@ -11040,13 +11039,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
DispatchBB->setIsEHPad();
MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
- unsigned trap_opcode;
- if (Subtarget->isThumb())
- trap_opcode = ARM::tTRAP;
- else
- trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
- BuildMI(TrapBB, dl, TII->get(trap_opcode));
+ BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP));
DispatchBB->addSuccessor(TrapBB);
MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
@@ -21590,7 +21584,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
@@ -21598,6 +21592,11 @@ bool ARMTargetLowering::lowerInterleavedLoad(
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
+ auto *LI = dyn_cast<LoadInst>(Load);
+ if (!LI)
+ return false;
+ assert(!Mask && "Unexpected mask on a load");
+
auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
Type *EltTy = VecTy->getElementType();
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 5f4aef55b22c..9159f3d2c3ed 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -681,7 +681,7 @@ class VectorType;
unsigned getMaxSupportedInterleaveFactor() const override;
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 1f5ba998970f..934ec52c6f1e 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -486,11 +486,6 @@ def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
return hasNoVMLxHazardUse(N);
}]>;
-// An 'fadd' node which can be contracted into a fma
-def fadd_contract : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{
- return N->getFlags().hasAllowContract();
-}]>;
-
def imm_even : ImmLeaf<i32, [{ return (Imm & 1) == 0; }]>;
def imm_odd : ImmLeaf<i32, [{ return (Imm & 1) == 1; }]>;
@@ -2387,29 +2382,13 @@ def UDF : AInoP<(outs), (ins imm0_65535:$imm16), MiscFrm, NoItinerary,
/*
* A5.4 Permanently UNDEFINED instructions.
*
- * For most targets use UDF #65006, for which the OS will generate SIGTRAP.
- * Other UDF encodings generate SIGILL.
+ * Targets use UDF #65006, for which the OS will generate SIGTRAP.
*
- * NaCl's OS instead chooses an ARM UDF encoding that's also a UDF in Thumb.
- * Encoding A1:
- * 1110 0111 1111 iiii iiii iiii 1111 iiii
- * Encoding T1:
- * 1101 1110 iiii iiii
- * It uses the following encoding:
- * 1110 0111 1111 1110 1101 1110 1111 0000
- * - In ARM: UDF #60896;
- * - In Thumb: UDF #254 followed by a branch-to-self.
*/
let isTrap = 1 in
-def TRAPNaCl : AXI<(outs), (ins), MiscFrm, NoItinerary,
- "trap", [(trap)]>,
- Requires<[IsARM,UseNaClTrap]> {
- let Inst = 0xe7fedef0;
-}
-let isTrap = 1 in
def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary,
"trap", [(trap)]>,
- Requires<[IsARM,DontUseNaClTrap]> {
+ Requires<[IsARM]> {
let Inst = 0xe7ffdefe;
}
diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index 06f362b26744..b84f685f214c 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -1293,7 +1293,7 @@ bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
RDA = &getAnalysis<ReachingDefAnalysis>();
MF->getProperties().setTracksLiveness();
MRI = &MF->getRegInfo();
- TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
+ TII = ST.getInstrInfo();
TRI = ST.getRegisterInfo();
BBUtils = std::make_unique<ARMBasicBlockUtils>(*MF);
BBUtils->computeAllBlockSizes();
diff --git a/llvm/lib/Target/ARM/ARMPredicates.td b/llvm/lib/Target/ARM/ARMPredicates.td
index ddc5ad8754ee..c638e96a355d 100644
--- a/llvm/lib/Target/ARM/ARMPredicates.td
+++ b/llvm/lib/Target/ARM/ARMPredicates.td
@@ -167,16 +167,12 @@ def IsARM : Predicate<"!Subtarget->isThumb()">,
AssemblerPredicate<(all_of (not ModeThumb)), "arm-mode">;
def IsMachO : Predicate<"Subtarget->isTargetMachO()">;
def IsNotMachO : Predicate<"!Subtarget->isTargetMachO()">;
-def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
def IsWindows : Predicate<"Subtarget->isTargetWindows()">;
def IsNotWindows : Predicate<"!Subtarget->isTargetWindows()">;
def IsReadTPTPIDRURW : Predicate<"Subtarget->isReadTPTPIDRURW()">;
def IsReadTPTPIDRURO : Predicate<"Subtarget->isReadTPTPIDRURO()">;
def IsReadTPTPIDRPRW : Predicate<"Subtarget->isReadTPTPIDRPRW()">;
def IsReadTPSoft : Predicate<"Subtarget->isReadTPSoft()">;
-def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">,
- AssemblerPredicate<(all_of FeatureNaClTrap), "NaCl">;
-def DontUseNaClTrap : Predicate<"!Subtarget->useNaClTrap()">;
def UseNegativeImmediates :
Predicate<"false">,
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
index 13185a7d797a..9f600e0c685a 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -189,7 +189,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
if (TM.isAAPCS_ABI())
stackAlignment = Align(8);
- if (isTargetNaCl() || TM.isAAPCS16_ABI())
+ if (TM.isAAPCS16_ABI())
stackAlignment = Align(16);
// FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo::
@@ -407,10 +407,9 @@ bool ARMSubtarget::useFastISel() const {
if (!hasV6Ops())
return false;
- // Thumb2 support on iOS; ARM support on iOS, Linux and NaCl.
- return TM.Options.EnableFastISel &&
- ((isTargetMachO() && !isThumb1Only()) ||
- (isTargetLinux() && !isThumb()) || (isTargetNaCl() && !isThumb()));
+ // Thumb2 support on iOS; ARM support on iOS and Linux.
+ return TM.Options.EnableFastISel && ((isTargetMachO() && !isThumb1Only()) ||
+ (isTargetLinux() && !isThumb()));
}
unsigned ARMSubtarget::getGPRAllocationOrder(const MachineFunction &MF) const {
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index beb1ff644714..637eb4560e0f 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -338,7 +338,6 @@ public:
bool isTargetWatchABI() const { return TargetTriple.isWatchABI(); }
bool isTargetDriverKit() const { return TargetTriple.isDriverKit(); }
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
- bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
bool isTargetNetBSD() const { return TargetTriple.isOSNetBSD(); }
bool isTargetWindows() const { return TargetTriple.isOSWindows(); }
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index c66232ef4dc7..e8d0d3508077 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -166,9 +166,8 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU,
// Integer registers are 32 bits.
Ret += "-n32";
- // The stack is 128 bit aligned on NaCl, 64 bit aligned on AAPCS and 32 bit
- // aligned everywhere else.
- if (TT.isOSNaCl() || ABI == ARM::ARM_ABI_AAPCS16)
+ // The stack is 128 bit aligned on AAPCS16, 64 bit on AAPCS and 32 bit elsewhere.
+ if (ABI == ARM::ARM_ABI_AAPCS16)
Ret += "-S128";
else if (ABI == ARM::ARM_ABI_AAPCS)
Ret += "-S64";
diff --git a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
index cf84f1043cc6..3692eeeaaa64 100644
--- a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -16,7 +16,6 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSectionELF.h"
-#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/SectionKind.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 05d4069a686a..6f37eca2b00a 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1330,8 +1330,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
LT.second))
- return LT.first * Entry->Cost *
- ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
+ return LT.first * Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
}
if (!Mask.empty()) {
@@ -1340,7 +1339,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
Mask.size() <= LT.second.getVectorNumElements() &&
(isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
isVREVMask(Mask, LT.second, 64)))
- return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
+ return ST->getMVEVectorCostFactor(CostKind) * LT.first;
}
}
@@ -1348,7 +1347,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
if (IsExtractSubvector)
Kind = TTI::SK_ExtractSubvector;
int BaseCost = ST->hasMVEIntegerOps() && SrcTy->isVectorTy()
- ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
+ ? ST->getMVEVectorCostFactor(CostKind)
: 1;
return BaseCost * BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind,
Index, SubTp);
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index ca06b9e3cb66..522c235a90a8 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -91,9 +91,9 @@ class ARMTTIImpl final : public BasicTTIImplBase<ARMTTIImpl> {
ARM::FeatureAvoidMOVsShOp, ARM::FeatureHasRetAddrStack,
ARM::FeatureHasNoBranchPredictor, ARM::FeatureDSP, ARM::FeatureMP,
ARM::FeatureVirtualization, ARM::FeatureMClass, ARM::FeatureRClass,
- ARM::FeatureAClass, ARM::FeatureNaClTrap, ARM::FeatureStrictAlign,
- ARM::FeatureLongCalls, ARM::FeatureExecuteOnly, ARM::FeatureReserveR9,
- ARM::FeatureNoMovt, ARM::FeatureNoNegativeImmediates
+ ARM::FeatureAClass, ARM::FeatureStrictAlign, ARM::FeatureLongCalls,
+ ARM::FeatureExecuteOnly, ARM::FeatureReserveR9, ARM::FeatureNoMovt,
+ ARM::FeatureNoNegativeImmediates
};
const ARMSubtarget *getST() const { return ST; }
diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 5f930fb0c807..2e47ceeca96b 100644
--- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -877,8 +877,7 @@ static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value,
/// is an address into a section with 'C' string literals.
static void tryAddingPcLoadReferenceComment(uint64_t Address, int Value,
const MCDisassembler *Decoder) {
- const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
- Dis->tryAddingPcLoadReferenceComment(Value, Address);
+ Decoder->tryAddingPcLoadReferenceComment(Value, Address);
}
// Thumb1 instructions don't have explicit S bits. Rather, they
@@ -1482,7 +1481,7 @@ static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
DecodeStatus S = MCDisassembler::Success;
const FeatureBitset &featureBits =
- ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+ Decoder->getSubtargetInfo().getFeatureBits();
if ((RegNo == 13 && !featureBits[ARM::HasV8Ops]) || RegNo == 15)
S = MCDisassembler::SoftFail;
@@ -1535,7 +1534,7 @@ static bool PermitsD32(const MCInst &Inst, const MCDisassembler *Decoder) {
if (Inst.getOpcode() == ARM::VSCCLRMD || Inst.getOpcode() == ARM::VSCCLRMS)
return true;
const FeatureBitset &featureBits =
- ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+ Decoder->getSubtargetInfo().getFeatureBits();
return featureBits[ARM::FeatureD32];
}
@@ -1879,7 +1878,7 @@ static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn,
unsigned Rn = fieldFromInstruction(Insn, 16, 4);
unsigned U = fieldFromInstruction(Insn, 23, 1);
const FeatureBitset &featureBits =
- ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+ Decoder->getSubtargetInfo().getFeatureBits();
switch (Inst.getOpcode()) {
case ARM::LDC_OFFSET:
@@ -2553,8 +2552,8 @@ static DecodeStatus DecodeHINTInstruction(MCInst &Inst, unsigned Insn,
const MCDisassembler *Decoder) {
unsigned pred = fieldFromInstruction(Insn, 28, 4);
unsigned imm8 = fieldFromInstruction(Insn, 0, 8);
- const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
- const FeatureBitset &FeatureBits = Dis->getSubtargetInfo().getFeatureBits();
+ const FeatureBitset &FeatureBits =
+ Decoder->getSubtargetInfo().getFeatureBits();
DecodeStatus S = MCDisassembler::Success;
@@ -2798,8 +2797,8 @@ static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn,
unsigned Imm = fieldFromInstruction(Insn, 9, 1);
- const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
- const FeatureBitset &FeatureBits = Dis->getSubtargetInfo().getFeatureBits();
+ const FeatureBitset &FeatureBits =
+ Decoder->getSubtargetInfo().getFeatureBits();
if (!FeatureBits[ARM::HasV8_1aOps] ||
!FeatureBits[ARM::HasV8Ops])
@@ -4081,7 +4080,7 @@ static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn,
unsigned Rn = fieldFromInstruction(Insn, 16, 4);
const FeatureBitset &featureBits =
- ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+ Decoder->getSubtargetInfo().getFeatureBits();
bool hasMP = featureBits[ARM::FeatureMP];
bool hasV7Ops = featureBits[ARM::HasV7Ops];
@@ -4170,7 +4169,7 @@ static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn,
unsigned add = fieldFromInstruction(Insn, 9, 1);
const FeatureBitset &featureBits =
- ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+ Decoder->getSubtargetInfo().getFeatureBits();
bool hasMP = featureBits[ARM::FeatureMP];
bool hasV7Ops = featureBits[ARM::HasV7Ops];
@@ -4252,7 +4251,7 @@ static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn,
imm |= (Rn << 13);
const FeatureBitset &featureBits =
- ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+ Decoder->getSubtargetInfo().getFeatureBits();
bool hasMP = featureBits[ARM::FeatureMP];
bool hasV7Ops = featureBits[ARM::HasV7Ops];
@@ -4371,7 +4370,7 @@ static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn,
int imm = fieldFromInstruction(Insn, 0, 12);
const FeatureBitset &featureBits =
- ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+ Decoder->getSubtargetInfo().getFeatureBits();
bool hasV7Ops = featureBits[ARM::HasV7Ops];
@@ -4826,7 +4825,7 @@ static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Val,
return MCDisassembler::Fail;
const FeatureBitset &featureBits =
- ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+ Decoder->getSubtargetInfo().getFeatureBits();
if (!isValidCoprocessorNumber(Val, featureBits))
return MCDisassembler::Fail;
@@ -4839,7 +4838,7 @@ static DecodeStatus DecodeThumbTableBranch(MCInst &Inst, unsigned Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
const FeatureBitset &FeatureBits =
- ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+ Decoder->getSubtargetInfo().getFeatureBits();
DecodeStatus S = MCDisassembler::Success;
unsigned Rn = fieldFromInstruction(Insn, 16, 4);
@@ -4984,7 +4983,7 @@ static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val, uint64_t Address,
const MCDisassembler *Decoder) {
DecodeStatus S = MCDisassembler::Success;
const FeatureBitset &FeatureBits =
- ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+ Decoder->getSubtargetInfo().getFeatureBits();
if (FeatureBits[ARM::FeatureMClass]) {
unsigned ValLow = Val & 0xff;
@@ -6019,7 +6018,7 @@ static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn, uint64_t Address,
static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, uint64_t Address,
const MCDisassembler *Decoder) {
const FeatureBitset &featureBits =
- ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits();
+ Decoder->getSubtargetInfo().getFeatureBits();
bool hasFullFP16 = featureBits[ARM::FeatureFullFP16];
unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0);
@@ -6078,7 +6077,7 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, uint64_t Address,
static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, uint64_t Address,
const MCDisassembler *Decoder) {
const FeatureBitset &featureBits =
- ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits();
+ Decoder->getSubtargetInfo().getFeatureBits();
bool hasFullFP16 = featureBits[ARM::FeatureFullFP16];
unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0);
@@ -6244,7 +6243,7 @@ static DecodeStatus DecodeForVMRSandVMSR(MCInst &Inst, unsigned Val,
uint64_t Address,
const MCDisassembler *Decoder) {
const FeatureBitset &featureBits =
- ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits();
+ Decoder->getSubtargetInfo().getFeatureBits();
DecodeStatus S = MCDisassembler::Success;
// Add explicit operand for the destination sysreg, for cases where
@@ -6717,7 +6716,7 @@ static DecodeStatus DecodeVSTRVLDR_SYSREG(MCInst &Inst, unsigned Val,
case ARM::VLDR_FPSCR_post:
case ARM::VLDR_FPSCR_NZCVQC_post:
const FeatureBitset &featureBits =
- ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits();
+ Decoder->getSubtargetInfo().getFeatureBits();
if (!featureBits[ARM::HasMVEIntegerOps] && !featureBits[ARM::FeatureVFP2])
return MCDisassembler::Fail;
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 376bddb120d5..146fc6704c6d 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -215,7 +215,7 @@ static const char *checkPCRelOffset(uint64_t Value, int64_t Min, int64_t Max) {
const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup,
uint64_t Value) const {
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
case ARM::fixup_arm_thumb_br: {
// Relaxing tB to t2B. tB has a signed 12-bit displacement with the
// low bit being an implied zero. There's an implied +4 offset for the
@@ -311,12 +311,13 @@ static bool needsInterworking(const MCAssembler &Asm, const MCSymbol *Sym,
return false;
}
-bool ARMAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup,
+bool ARMAsmBackend::fixupNeedsRelaxationAdvanced(const MCFragment &,
+ const MCFixup &Fixup,
const MCValue &Target,
uint64_t Value,
bool Resolved) const {
const MCSymbol *Sym = Target.getAddSym();
- if (needsInterworking(*Asm, Sym, Fixup.getTargetKind()))
+ if (needsInterworking(*Asm, Sym, Fixup.getKind()))
return true;
if (!Resolved)
@@ -947,7 +948,7 @@ bool ARMAsmBackend::shouldForceRelocation(const MCFixup &Fixup,
}
// Create relocations for unconditional branches to function symbols with
// different execution mode in ELF binaries.
- if (needsInterworking(*Asm, Sym, Fixup.getTargetKind()))
+ if (needsInterworking(*Asm, Sym, Fixup.getKind()))
return true;
// We must always generate a relocation for BL/BLX instructions if we have
// a symbol to reference, as the linker relies on knowing the destination
@@ -1093,7 +1094,7 @@ std::optional<bool> ARMAsmBackend::evaluateFixup(const MCFragment &F,
// For a few PC-relative fixups in Thumb mode, offsets need to be aligned
// down. We compensate here because the default handler's `Value` decrement
// doesn't account for this alignment.
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
case ARM::fixup_t2_ldst_pcrel_12:
case ARM::fixup_t2_pcrel_10:
case ARM::fixup_t2_pcrel_9:
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
index 877e3afdb1d5..07d2cf784c44 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
@@ -51,7 +51,8 @@ public:
const char *reasonForFixupRelaxation(const MCFixup &Fixup,
uint64_t Value) const;
- bool fixupNeedsRelaxationAdvanced(const MCFixup &, const MCValue &, uint64_t,
+ bool fixupNeedsRelaxationAdvanced(const MCFragment &, const MCFixup &,
+ const MCValue &, uint64_t,
bool) const override;
void relaxInstruction(MCInst &Inst,
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index b0ebb74424c7..50e9ca1d3759 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -75,7 +75,7 @@ bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCValue &V,
unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
const MCValue &Target,
bool IsPCRel) const {
- unsigned Kind = Fixup.getTargetKind();
+ auto Kind = Fixup.getKind();
uint8_t Specifier = Target.getSpecifier();
auto CheckFDPIC = [&](uint32_t Type) {
if (getOSABI() != ELF::ELFOSABI_ARM_FDPIC)
@@ -105,7 +105,7 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
}
if (IsPCRel) {
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
default:
reportError(Fixup.getLoc(), "unsupported relocation type");
return ELF::R_ARM_NONE;
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index c61e405bd3a0..eaba6fe5bfcb 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -638,7 +638,7 @@ private:
Offset = 0;
}
bool hasInfo() { return F != nullptr; }
- MCDataFragment *F = nullptr;
+ MCFragment *F = nullptr;
uint64_t Offset = 0;
ElfMappingSymbol State = EMS_None;
};
@@ -650,11 +650,11 @@ private:
// This is a tentative symbol, it won't really be emitted until it's
// actually needed.
ElfMappingSymbolInfo *EMS = LastEMSInfo.get();
- auto *DF = dyn_cast_or_null<MCDataFragment>(getCurrentFragment());
- if (!DF)
+ auto *DF = getCurrentFragment();
+ if (DF->getKind() != MCFragment::FT_Data)
return;
EMS->F = DF;
- EMS->Offset = DF->getContents().size();
+ EMS->Offset = DF->getFixedSize();
LastEMSInfo->State = EMS_Data;
return;
}
@@ -686,7 +686,7 @@ private:
Symbol->setBinding(ELF::STB_LOCAL);
}
- void emitMappingSymbol(StringRef Name, MCDataFragment &F, uint64_t Offset) {
+ void emitMappingSymbol(StringRef Name, MCFragment &F, uint64_t Offset) {
auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name));
emitLabelAtPos(Symbol, SMLoc(), F, Offset);
Symbol->setType(ELF::STT_NOTYPE);
@@ -1145,9 +1145,8 @@ void ARMTargetELFStreamer::finish() {
auto *Text =
static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection());
for (auto &F : *Text)
- if (auto *DF = dyn_cast<MCDataFragment>(&F))
- if (!DF->getContents().empty())
- return;
+ if (F.getSize())
+ return;
Text->setFlags(Text->getFlags() | ELF::SHF_ARM_PURECODE);
}
}
@@ -1208,7 +1207,7 @@ inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) {
}
void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) {
- MCDataFragment *Frag = getOrCreateDataFragment();
+ MCFragment *Frag = getOrCreateDataFragment();
Frag->addFixup(MCFixup::create(Frag->getContents().size(), Expr, Kind));
}
@@ -1296,7 +1295,7 @@ void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
MCSymbolRefExpr::create(PersonalitySym, ARM::S_ARM_NONE, getContext());
visitUsedExpr(*PersonalityRef);
- MCDataFragment *DF = getOrCreateDataFragment();
+ MCFragment *DF = getOrCreateDataFragment();
DF->addFixup(
MCFixup::create(DF->getContents().size(), PersonalityRef, FK_Data_4));
}
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 2d22b27ceb13..e84aaaad3750 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -152,12 +152,6 @@ std::string ARM_MC::ParseARMTriple(const Triple &TT, StringRef CPU) {
ARMArchFeature += "+thumb-mode,+v4t";
}
- if (TT.isOSNaCl()) {
- if (!ARMArchFeature.empty())
- ARMArchFeature += ",";
- ARMArchFeature += "+nacl-trap";
- }
-
if (TT.isOSWindows()) {
if (!ARMArchFeature.empty())
ARMArchFeature += ",";
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index c0c40ade5810..354de8fd7b4b 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -192,7 +192,7 @@ void ARMMachObjectWriter::recordARMScatteredHalfRelocation(
// relocation entry in the low 16 bits of r_address field.
unsigned ThumbBit = 0;
unsigned MovtBit = 0;
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
default: break;
case ARM::fixup_arm_movt_hi16:
MovtBit = 1;
@@ -465,7 +465,7 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer,
// PAIR. I.e. it's correct that we insert the high bits of the addend in the
// MOVW case here. relocation entries.
uint32_t Value = 0;
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
default: break;
case ARM::fixup_arm_movw_lo16:
case ARM::fixup_t2_movw_lo16:
diff --git a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
index 440d852fa4bc..90505aa82aa4 100644
--- a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
@@ -2531,27 +2531,47 @@ bool AVRExpandPseudo::expand<AVR::SPWRITE>(Block &MBB, BlockIt MBBI) {
unsigned Flags = MI.getFlags();
TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
- buildMI(MBB, MBBI, AVR::INRdA)
- .addReg(STI.getTmpRegister(), RegState::Define)
- .addImm(STI.getIORegSREG())
- .setMIFlags(Flags);
-
- buildMI(MBB, MBBI, AVR::BCLRs).addImm(0x07).setMIFlags(Flags);
-
- buildMI(MBB, MBBI, AVR::OUTARr)
- .addImm(0x3e)
- .addReg(SrcHiReg, getKillRegState(SrcIsKill))
- .setMIFlags(Flags);
+ // From the XMEGA series manual:
+ // To prevent corruption when updating the stack pointer from software,
+ // a write to SPL will automatically disable interrupts
+ // for up to four instructions or until the next I/O memory write.
+ if (STI.getELFArch() >= 102) { // An XMEGA device
+
+ buildMI(MBB, MBBI, AVR::OUTARr)
+ .addImm(STI.getIORegSPL())
+ .addReg(SrcLoReg, getKillRegState(SrcIsKill))
+ .setMIFlags(Flags);
+
+ buildMI(MBB, MBBI, AVR::OUTARr)
+ .addImm(STI.getIORegSPH())
+ .addReg(SrcHiReg, getKillRegState(SrcIsKill))
+ .setMIFlags(Flags);
+
+ } else { // Disable interrupts for older devices (3 extra instructions)
+
+ buildMI(MBB, MBBI, AVR::INRdA)
+ .addReg(STI.getTmpRegister(), RegState::Define)
+ .addImm(STI.getIORegSREG())
+ .setMIFlags(Flags);
+
+ buildMI(MBB, MBBI, AVR::BCLRs).addImm(0x07).setMIFlags(Flags);
+
+ if (STI.getIORegSPH() != -1)
+ buildMI(MBB, MBBI, AVR::OUTARr)
+ .addImm(STI.getIORegSPH())
+ .addReg(SrcHiReg, getKillRegState(SrcIsKill))
+ .setMIFlags(Flags);
- buildMI(MBB, MBBI, AVR::OUTARr)
- .addImm(STI.getIORegSREG())
- .addReg(STI.getTmpRegister(), RegState::Kill)
- .setMIFlags(Flags);
+ buildMI(MBB, MBBI, AVR::OUTARr)
+ .addImm(STI.getIORegSREG())
+ .addReg(STI.getTmpRegister(), RegState::Kill)
+ .setMIFlags(Flags);
- buildMI(MBB, MBBI, AVR::OUTARr)
- .addImm(0x3d)
- .addReg(SrcLoReg, getKillRegState(SrcIsKill))
- .setMIFlags(Flags);
+ buildMI(MBB, MBBI, AVR::OUTARr)
+ .addImm(STI.getIORegSPL())
+ .addReg(SrcLoReg, getKillRegState(SrcIsKill))
+ .setMIFlags(Flags);
+ }
MI.eraseFromParent();
return true;
diff --git a/llvm/lib/Target/AVR/README.md b/llvm/lib/Target/AVR/README.md
index bd8b453aa81e..2bcf63cf7581 100644
--- a/llvm/lib/Target/AVR/README.md
+++ b/llvm/lib/Target/AVR/README.md
@@ -4,5 +4,5 @@ This experimental backend is for the 8-bit Atmel [AVR](https://en.wikipedia.org/
## Useful links
-* [Unresolved bugs](https://llvm.org/bugs/buglist.cgi?product=libraries&component=Backend%3A%20AVR&resolution=---&list_id=109466)
+* [Unresolved bugs](https://github.com/llvm/llvm-project/labels/backend%3AAVR)
* [Architecture notes](https://github.com/avr-llvm/architecture)
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index 958790d49d08..dda87537809c 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -90,7 +90,7 @@ void BPFAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
Data[Fixup.getOffset() + 1] = 0x1;
support::endian::write32be(&Data[Fixup.getOffset() + 4], Value);
}
- } else if (Fixup.getTargetKind() == BPF::FK_BPF_PCRel_4) {
+ } else if (Fixup.getKind() == BPF::FK_BPF_PCRel_4) {
// The input Value represents the number of bytes.
Value = (uint32_t)((Value - 8) / 8);
support::endian::write<uint32_t>(&Data[Fixup.getOffset() + 4], Value,
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
index ce1da6e58b9c..694d9eab9694 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
@@ -71,7 +71,7 @@ MCFixupKindInfo CSKYAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
MCContext &Ctx) {
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
default:
llvm_unreachable("Unknown fixup kind!");
case CSKY::fixup_csky_got32:
@@ -157,7 +157,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
}
-bool CSKYAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup,
+bool CSKYAsmBackend::fixupNeedsRelaxationAdvanced(const MCFragment &,
+ const MCFixup &Fixup,
const MCValue &,
uint64_t Value,
bool Resolved) const {
@@ -166,7 +167,7 @@ bool CSKYAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup,
return true;
int64_t Offset = int64_t(Value);
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
default:
return false;
case CSKY::fixup_csky_pcrel_imm10_scale2:
@@ -186,7 +187,7 @@ std::optional<bool> CSKYAsmBackend::evaluateFixup(const MCFragment &F,
// For a few PC-relative fixups, offsets need to be aligned down. We
// compensate here because the default handler's `Value` decrement doesn't
// account for this alignment.
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
case CSKY::fixup_csky_pcrel_uimm16_scale4:
case CSKY::fixup_csky_pcrel_uimm8_scale4:
case CSKY::fixup_csky_pcrel_uimm7_scale4:
@@ -264,7 +265,7 @@ bool CSKYAsmBackend::shouldForceRelocation(const MCFixup &Fixup,
const MCValue &Target /*STI*/) {
if (Target.getSpecifier())
return true;
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
default:
break;
case CSKY::fixup_csky_doffset_imm18:
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
index 1d3a22c2bbbb..1c8516fbf53a 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
@@ -38,7 +38,8 @@ public:
void relaxInstruction(MCInst &Inst,
const MCSubtargetInfo &STI) const override;
- bool fixupNeedsRelaxationAdvanced(const MCFixup &, const MCValue &, uint64_t,
+ bool fixupNeedsRelaxationAdvanced(const MCFragment &, const MCFixup &,
+ const MCValue &, uint64_t,
bool) const override;
bool writeNopData(raw_ostream &OS, uint64_t Count,
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp
index 1de82e6cc6ce..d042d26e6ef2 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp
@@ -39,7 +39,7 @@ unsigned CSKYELFObjectWriter::getRelocType(const MCFixup &Fixup,
bool IsPCRel) const {
const MCExpr *Expr = Fixup.getValue();
// Determine the type of the relocation
- unsigned Kind = Fixup.getTargetKind();
+ auto Kind = Fixup.getKind();
uint8_t Modifier = Target.getSpecifier();
switch (Target.getSpecifier()) {
diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
index c97c604fdbf7..d9d9b36d0b73 100644
--- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
+++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
@@ -202,7 +202,7 @@ DataScalarizerVisitor::createArrayFromVector(IRBuilder<> &Builder, Value *Vec,
// original vector's defining instruction if available, else immediately after
// the alloca
if (auto *Instr = dyn_cast<Instruction>(Vec))
- Builder.SetInsertPoint(Instr->getNextNonDebugInstruction());
+ Builder.SetInsertPoint(Instr->getNextNode());
SmallVector<Value *, 4> GEPs(ArrNumElems);
for (unsigned I = 0; I < ArrNumElems; ++I) {
Value *EE = Builder.CreateExtractElement(Vec, I, Name + ".extract");
@@ -302,7 +302,7 @@ bool DataScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
Value *PtrOperand = GEPI.getPointerOperand();
- Type *OrigGEPType = GEPI.getPointerOperandType();
+ Type *OrigGEPType = GEPI.getSourceElementType();
Type *NewGEPType = OrigGEPType;
bool NeedsTransform = false;
@@ -319,6 +319,11 @@ bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
}
}
+ // Scalar geps should remain scalar geps. The dxil-flatten-arrays pass will
+ // convert these scalar geps into flattened array geps
+ if (!isa<ArrayType>(OrigGEPType))
+ NewGEPType = OrigGEPType;
+
// Note: We bail if this isn't a gep touched via alloca or global
// transformations
if (!NeedsTransform)
diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
index 0b7cf2f97017..f0e2e786dfaf 100644
--- a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
+++ b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
@@ -20,6 +20,7 @@
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstddef>
@@ -40,18 +41,19 @@ public:
static char ID; // Pass identification.
};
-struct GEPData {
- ArrayType *ParentArrayType;
- Value *ParentOperand;
- SmallVector<Value *> Indices;
- SmallVector<uint64_t> Dims;
- bool AllIndicesAreConstInt;
+struct GEPInfo {
+ ArrayType *RootFlattenedArrayType;
+ Value *RootPointerOperand;
+ SmallMapVector<Value *, APInt, 4> VariableOffsets;
+ APInt ConstantOffset;
};
class DXILFlattenArraysVisitor
: public InstVisitor<DXILFlattenArraysVisitor, bool> {
public:
- DXILFlattenArraysVisitor() {}
+ DXILFlattenArraysVisitor(
+ SmallDenseMap<GlobalVariable *, GlobalVariable *> &GlobalMap)
+ : GlobalMap(GlobalMap) {}
bool visit(Function &F);
// InstVisitor methods. They return true if the instruction was scalarized,
// false if nothing changed.
@@ -78,7 +80,8 @@ public:
private:
SmallVector<WeakTrackingVH> PotentiallyDeadInstrs;
- DenseMap<GetElementPtrInst *, GEPData> GEPChainMap;
+ SmallDenseMap<GEPOperator *, GEPInfo> GEPChainInfoMap;
+ SmallDenseMap<GlobalVariable *, GlobalVariable *> &GlobalMap;
bool finish();
ConstantInt *genConstFlattenIndices(ArrayRef<Value *> Indices,
ArrayRef<uint64_t> Dims,
@@ -86,27 +89,11 @@ private:
Value *genInstructionFlattenIndices(ArrayRef<Value *> Indices,
ArrayRef<uint64_t> Dims,
IRBuilder<> &Builder);
-
- // Helper function to collect indices and dimensions from a GEP instruction
- void collectIndicesAndDimsFromGEP(GetElementPtrInst &GEP,
- SmallVectorImpl<Value *> &Indices,
- SmallVectorImpl<uint64_t> &Dims,
- bool &AllIndicesAreConstInt);
-
- void
- recursivelyCollectGEPs(GetElementPtrInst &CurrGEP,
- ArrayType *FlattenedArrayType, Value *PtrOperand,
- unsigned &GEPChainUseCount,
- SmallVector<Value *> Indices = SmallVector<Value *>(),
- SmallVector<uint64_t> Dims = SmallVector<uint64_t>(),
- bool AllIndicesAreConstInt = true);
- bool visitGetElementPtrInstInGEPChain(GetElementPtrInst &GEP);
- bool visitGetElementPtrInstInGEPChainBase(GEPData &GEPInfo,
- GetElementPtrInst &GEP);
};
} // namespace
bool DXILFlattenArraysVisitor::finish() {
+ GEPChainInfoMap.clear();
RecursivelyDeleteTriviallyDeadInstructionsPermissive(PotentiallyDeadInstrs);
return true;
}
@@ -225,131 +212,159 @@ bool DXILFlattenArraysVisitor::visitAllocaInst(AllocaInst &AI) {
return true;
}
-void DXILFlattenArraysVisitor::collectIndicesAndDimsFromGEP(
- GetElementPtrInst &GEP, SmallVectorImpl<Value *> &Indices,
- SmallVectorImpl<uint64_t> &Dims, bool &AllIndicesAreConstInt) {
-
- Type *CurrentType = GEP.getSourceElementType();
-
- // Note index 0 is the ptr index.
- for (Value *Index : llvm::drop_begin(GEP.indices(), 1)) {
- Indices.push_back(Index);
- AllIndicesAreConstInt &= isa<ConstantInt>(Index);
+bool DXILFlattenArraysVisitor::visitGetElementPtrInst(GetElementPtrInst &GEP) {
+ // Do not visit GEPs more than once
+ if (GEPChainInfoMap.contains(cast<GEPOperator>(&GEP)))
+ return false;
- if (auto *ArrayTy = dyn_cast<ArrayType>(CurrentType)) {
- Dims.push_back(ArrayTy->getNumElements());
- CurrentType = ArrayTy->getElementType();
- } else {
- assert(false && "Expected array type in GEP chain");
- }
+ Value *PtrOperand = GEP.getPointerOperand();
+ // It shouldn't(?) be possible for the pointer operand of a GEP to be a PHI
+ // node unless HLSL has pointers. If this assumption is incorrect or HLSL gets
+ // pointer types, then the handling of this case can be implemented later.
+ assert(!isa<PHINode>(PtrOperand) &&
+ "Pointer operand of GEP should not be a PHI Node");
+
+ // Replace a GEP ConstantExpr pointer operand with a GEP instruction so that
+ // it can be visited
+ if (auto *PtrOpGEPCE = dyn_cast<ConstantExpr>(PtrOperand);
+ PtrOpGEPCE && PtrOpGEPCE->getOpcode() == Instruction::GetElementPtr) {
+ GetElementPtrInst *OldGEPI =
+ cast<GetElementPtrInst>(PtrOpGEPCE->getAsInstruction());
+ OldGEPI->insertBefore(GEP.getIterator());
+
+ IRBuilder<> Builder(&GEP);
+ SmallVector<Value *> Indices(GEP.indices());
+ Value *NewGEP =
+ Builder.CreateGEP(GEP.getSourceElementType(), OldGEPI, Indices,
+ GEP.getName(), GEP.getNoWrapFlags());
+ assert(isa<GetElementPtrInst>(NewGEP) &&
+ "Expected newly-created GEP to be an instruction");
+ GetElementPtrInst *NewGEPI = cast<GetElementPtrInst>(NewGEP);
+
+ GEP.replaceAllUsesWith(NewGEPI);
+ GEP.eraseFromParent();
+ visitGetElementPtrInst(*OldGEPI);
+ visitGetElementPtrInst(*NewGEPI);
+ return true;
}
-}
-
-void DXILFlattenArraysVisitor::recursivelyCollectGEPs(
- GetElementPtrInst &CurrGEP, ArrayType *FlattenedArrayType,
- Value *PtrOperand, unsigned &GEPChainUseCount, SmallVector<Value *> Indices,
- SmallVector<uint64_t> Dims, bool AllIndicesAreConstInt) {
- // Check if this GEP is already in the map to avoid circular references
- if (GEPChainMap.count(&CurrGEP) > 0)
- return;
- // Collect indices and dimensions from the current GEP
- collectIndicesAndDimsFromGEP(CurrGEP, Indices, Dims, AllIndicesAreConstInt);
- bool IsMultiDimArr = isMultiDimensionalArray(CurrGEP.getSourceElementType());
- if (!IsMultiDimArr) {
- assert(GEPChainUseCount < FlattenedArrayType->getNumElements());
- GEPChainMap.insert(
- {&CurrGEP,
- {std::move(FlattenedArrayType), PtrOperand, std::move(Indices),
- std::move(Dims), AllIndicesAreConstInt}});
- return;
- }
- bool GepUses = false;
- for (auto *User : CurrGEP.users()) {
- if (GetElementPtrInst *NestedGEP = dyn_cast<GetElementPtrInst>(User)) {
- recursivelyCollectGEPs(*NestedGEP, FlattenedArrayType, PtrOperand,
- ++GEPChainUseCount, Indices, Dims,
- AllIndicesAreConstInt);
- GepUses = true;
- }
- }
- // This case is just incase the gep chain doesn't end with a 1d array.
- if (IsMultiDimArr && GEPChainUseCount > 0 && !GepUses) {
- GEPChainMap.insert(
- {&CurrGEP,
- {std::move(FlattenedArrayType), PtrOperand, std::move(Indices),
- std::move(Dims), AllIndicesAreConstInt}});
+ // Construct GEPInfo for this GEP
+ GEPInfo Info;
+
+ // Obtain the variable and constant byte offsets computed by this GEP
+ const DataLayout &DL = GEP.getDataLayout();
+ unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP.getType());
+ Info.ConstantOffset = {BitWidth, 0};
+ [[maybe_unused]] bool Success = GEP.collectOffset(
+ DL, BitWidth, Info.VariableOffsets, Info.ConstantOffset);
+ assert(Success && "Failed to collect offsets for GEP");
+
+ // If there is a parent GEP, inherit the root array type and pointer, and
+ // merge the byte offsets. Otherwise, this GEP is itself the root of a GEP
+ // chain and we need to determine the root array type
+ if (auto *PtrOpGEP = dyn_cast<GEPOperator>(PtrOperand)) {
+ assert(GEPChainInfoMap.contains(PtrOpGEP) &&
+ "Expected parent GEP to be visited before this GEP");
+ GEPInfo &PGEPInfo = GEPChainInfoMap[PtrOpGEP];
+ Info.RootFlattenedArrayType = PGEPInfo.RootFlattenedArrayType;
+ Info.RootPointerOperand = PGEPInfo.RootPointerOperand;
+ for (auto &VariableOffset : PGEPInfo.VariableOffsets)
+ Info.VariableOffsets.insert(VariableOffset);
+ Info.ConstantOffset += PGEPInfo.ConstantOffset;
+ } else {
+ Info.RootPointerOperand = PtrOperand;
+
+ // We should try to determine the type of the root from the pointer rather
+ // than the GEP's source element type because this could be a scalar GEP
+ // into an array-typed pointer from an Alloca or Global Variable.
+ Type *RootTy = GEP.getSourceElementType();
+ if (auto *GlobalVar = dyn_cast<GlobalVariable>(PtrOperand)) {
+ if (GlobalMap.contains(GlobalVar))
+ GlobalVar = GlobalMap[GlobalVar];
+ Info.RootPointerOperand = GlobalVar;
+ RootTy = GlobalVar->getValueType();
+ } else if (auto *Alloca = dyn_cast<AllocaInst>(PtrOperand))
+ RootTy = Alloca->getAllocatedType();
+ assert(!isMultiDimensionalArray(RootTy) &&
+ "Expected root array type to be flattened");
+
+ // If the root type is not an array, we don't need to do any flattening
+ if (!isa<ArrayType>(RootTy))
+ return false;
+
+ Info.RootFlattenedArrayType = cast<ArrayType>(RootTy);
}
-}
-bool DXILFlattenArraysVisitor::visitGetElementPtrInstInGEPChain(
- GetElementPtrInst &GEP) {
- GEPData GEPInfo = GEPChainMap.at(&GEP);
- return visitGetElementPtrInstInGEPChainBase(GEPInfo, GEP);
-}
-bool DXILFlattenArraysVisitor::visitGetElementPtrInstInGEPChainBase(
- GEPData &GEPInfo, GetElementPtrInst &GEP) {
- IRBuilder<> Builder(&GEP);
- Value *FlatIndex;
- if (GEPInfo.AllIndicesAreConstInt)
- FlatIndex = genConstFlattenIndices(GEPInfo.Indices, GEPInfo.Dims, Builder);
- else
- FlatIndex =
- genInstructionFlattenIndices(GEPInfo.Indices, GEPInfo.Dims, Builder);
-
- ArrayType *FlattenedArrayType = GEPInfo.ParentArrayType;
-
- // Don't append '.flat' to an empty string. If the SSA name isn't available
- // it could conflict with the ParentOperand's name.
- std::string FlatName = GEP.hasName() ? GEP.getName().str() + ".flat" : "";
-
- Value *FlatGEP = Builder.CreateGEP(FlattenedArrayType, GEPInfo.ParentOperand,
- {Builder.getInt32(0), FlatIndex}, FlatName,
- GEP.getNoWrapFlags());
-
- // Note: Old gep will become an invalid instruction after replaceAllUsesWith.
- // Erase the old GEP in the map before to avoid invalid instructions
- // and circular references.
- GEPChainMap.erase(&GEP);
-
- GEP.replaceAllUsesWith(FlatGEP);
- GEP.eraseFromParent();
- return true;
-}
-
-bool DXILFlattenArraysVisitor::visitGetElementPtrInst(GetElementPtrInst &GEP) {
- auto It = GEPChainMap.find(&GEP);
- if (It != GEPChainMap.end())
- return visitGetElementPtrInstInGEPChain(GEP);
- if (!isMultiDimensionalArray(GEP.getSourceElementType()))
- return false;
-
- ArrayType *ArrType = cast<ArrayType>(GEP.getSourceElementType());
- IRBuilder<> Builder(&GEP);
- auto [TotalElements, BaseType] = getElementCountAndType(ArrType);
- ArrayType *FlattenedArrayType = ArrayType::get(BaseType, TotalElements);
-
- Value *PtrOperand = GEP.getPointerOperand();
+ // GEPs without users or GEPs with non-GEP users should be replaced such that
+ // the chain of GEPs they are a part of is collapsed to a single GEP into a
+ // flattened array.
+ bool ReplaceThisGEP = GEP.users().empty();
+ for (Value *User : GEP.users())
+ if (!isa<GetElementPtrInst>(User))
+ ReplaceThisGEP = true;
+
+ if (ReplaceThisGEP) {
+ unsigned BytesPerElem =
+ DL.getTypeAllocSize(Info.RootFlattenedArrayType->getArrayElementType());
+ assert(isPowerOf2_32(BytesPerElem) &&
+ "Bytes per element should be a power of 2");
+
+ // Compute the 32-bit index for this flattened GEP from the constant and
+ // variable byte offsets in the GEPInfo
+ IRBuilder<> Builder(&GEP);
+ Value *ZeroIndex = Builder.getInt32(0);
+ uint64_t ConstantOffset =
+ Info.ConstantOffset.udiv(BytesPerElem).getZExtValue();
+ assert(ConstantOffset < UINT32_MAX &&
+ "Constant byte offset for flat GEP index must fit within 32 bits");
+ Value *FlattenedIndex = Builder.getInt32(ConstantOffset);
+ for (auto [VarIndex, Multiplier] : Info.VariableOffsets) {
+ assert(Multiplier.getActiveBits() <= 32 &&
+ "The multiplier for a flat GEP index must fit within 32 bits");
+ assert(VarIndex->getType()->isIntegerTy(32) &&
+ "Expected i32-typed GEP indices");
+ Value *VI;
+ if (Multiplier.getZExtValue() % BytesPerElem != 0) {
+ // This can happen, e.g., with i8 GEPs. To handle this we just divide
+ // by BytesPerElem using an instruction after multiplying VarIndex by
+ // Multiplier.
+ VI = Builder.CreateMul(VarIndex,
+ Builder.getInt32(Multiplier.getZExtValue()));
+ VI = Builder.CreateLShr(VI, Builder.getInt32(Log2_32(BytesPerElem)));
+ } else
+ VI = Builder.CreateMul(
+ VarIndex,
+ Builder.getInt32(Multiplier.getZExtValue() / BytesPerElem));
+ FlattenedIndex = Builder.CreateAdd(FlattenedIndex, VI);
+ }
- unsigned GEPChainUseCount = 0;
- recursivelyCollectGEPs(GEP, FlattenedArrayType, PtrOperand, GEPChainUseCount);
-
- // NOTE: hasNUses(0) is not the same as GEPChainUseCount == 0.
- // Here recursion is used to get the length of the GEP chain.
- // Handle zero uses here because there won't be an update via
- // a child in the chain later.
- if (GEPChainUseCount == 0) {
- SmallVector<Value *> Indices;
- SmallVector<uint64_t> Dims;
- bool AllIndicesAreConstInt = true;
-
- // Collect indices and dimensions from the GEP
- collectIndicesAndDimsFromGEP(GEP, Indices, Dims, AllIndicesAreConstInt);
- GEPData GEPInfo{std::move(FlattenedArrayType), PtrOperand,
- std::move(Indices), std::move(Dims), AllIndicesAreConstInt};
- return visitGetElementPtrInstInGEPChainBase(GEPInfo, GEP);
+ // Construct a new GEP for the flattened array to replace the current GEP
+ Value *NewGEP = Builder.CreateGEP(
+ Info.RootFlattenedArrayType, Info.RootPointerOperand,
+ {ZeroIndex, FlattenedIndex}, GEP.getName(), GEP.getNoWrapFlags());
+
+ // If the pointer operand is a global variable and all indices are 0,
+ // IRBuilder::CreateGEP will return the global variable instead of creating
+ // a GEP instruction or GEP ConstantExpr. In this case we have to create and
+ // insert our own GEP instruction.
+ if (!isa<GEPOperator>(NewGEP))
+ NewGEP = GetElementPtrInst::Create(
+ Info.RootFlattenedArrayType, Info.RootPointerOperand,
+ {ZeroIndex, FlattenedIndex}, GEP.getNoWrapFlags(), GEP.getName(),
+ Builder.GetInsertPoint());
+
+ // Replace the current GEP with the new GEP. Store GEPInfo into the map
+ // for later use in case this GEP was not the end of the chain
+ GEPChainInfoMap.insert({cast<GEPOperator>(NewGEP), std::move(Info)});
+ GEP.replaceAllUsesWith(NewGEP);
+ GEP.eraseFromParent();
+ return true;
}
+ // This GEP is potentially dead at the end of the pass since it may not have
+ // any users anymore after GEP chains have been collapsed. We still store
+ // GEPInfo for GEPs down the chain to use to compute their indices.
+ GEPChainInfoMap.insert({cast<GEPOperator>(&GEP), std::move(Info)});
PotentiallyDeadInstrs.emplace_back(&GEP);
return false;
}
@@ -416,9 +431,8 @@ static Constant *transformInitializer(Constant *Init, Type *OrigType,
return ConstantArray::get(FlattenedType, FlattenedElements);
}
-static void
-flattenGlobalArrays(Module &M,
- DenseMap<GlobalVariable *, GlobalVariable *> &GlobalMap) {
+static void flattenGlobalArrays(
+ Module &M, SmallDenseMap<GlobalVariable *, GlobalVariable *> &GlobalMap) {
LLVMContext &Ctx = M.getContext();
for (GlobalVariable &G : M.globals()) {
Type *OrigType = G.getValueType();
@@ -456,9 +470,9 @@ flattenGlobalArrays(Module &M,
static bool flattenArrays(Module &M) {
bool MadeChange = false;
- DXILFlattenArraysVisitor Impl;
- DenseMap<GlobalVariable *, GlobalVariable *> GlobalMap;
+ SmallDenseMap<GlobalVariable *, GlobalVariable *> GlobalMap;
flattenGlobalArrays(M, GlobalMap);
+ DXILFlattenArraysVisitor Impl(GlobalMap);
for (auto &F : make_early_inc_range(M.functions())) {
if (F.isDeclaration())
continue;
diff --git a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
index 76a46c7a2b76..c73648f21e8d 100644
--- a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
+++ b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
@@ -98,9 +98,9 @@ static void fixI8UseChain(Instruction &I,
ElementType = AI->getAllocatedType();
if (auto *GEP = dyn_cast<GetElementPtrInst>(NewOperands[0])) {
ElementType = GEP->getSourceElementType();
- if (ElementType->isArrayTy())
- ElementType = ElementType->getArrayElementType();
}
+ if (ElementType->isArrayTy())
+ ElementType = ElementType->getArrayElementType();
LoadInst *NewLoad = Builder.CreateLoad(ElementType, NewOperands[0]);
ReplacedValues[Load] = NewLoad;
ToRemove.push_back(Load);
@@ -347,7 +347,6 @@ static void emitMemcpyExpansion(IRBuilder<> &Builder, Value *Dst, Value *Src,
if (ByteLength == 0)
return;
- LLVMContext &Ctx = Builder.getContext();
const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout();
auto GetArrTyFromVal = [](Value *Val) -> ArrayType * {
@@ -392,10 +391,11 @@ static void emitMemcpyExpansion(IRBuilder<> &Builder, Value *Dst, Value *Src,
assert(ByteLength % DstElemByteSize == 0 &&
"memcpy length must be divisible by array element type");
for (uint64_t I = 0; I < NumElemsToCopy; ++I) {
- Value *Offset = ConstantInt::get(Type::getInt32Ty(Ctx), I);
- Value *SrcPtr = Builder.CreateInBoundsGEP(SrcElemTy, Src, Offset, "gep");
+ SmallVector<Value *, 2> Indices = {Builder.getInt32(0),
+ Builder.getInt32(I)};
+ Value *SrcPtr = Builder.CreateInBoundsGEP(SrcArrTy, Src, Indices, "gep");
Value *SrcVal = Builder.CreateLoad(SrcElemTy, SrcPtr);
- Value *DstPtr = Builder.CreateInBoundsGEP(DstElemTy, Dst, Offset, "gep");
+ Value *DstPtr = Builder.CreateInBoundsGEP(DstArrTy, Dst, Indices, "gep");
Builder.CreateStore(SrcVal, DstPtr);
}
}
@@ -403,7 +403,6 @@ static void emitMemcpyExpansion(IRBuilder<> &Builder, Value *Dst, Value *Src,
static void emitMemsetExpansion(IRBuilder<> &Builder, Value *Dst, Value *Val,
ConstantInt *SizeCI,
DenseMap<Value *, Value *> &ReplacedValues) {
- LLVMContext &Ctx = Builder.getContext();
[[maybe_unused]] const DataLayout &DL =
Builder.GetInsertBlock()->getModule()->getDataLayout();
[[maybe_unused]] uint64_t OrigSize = SizeCI->getZExtValue();
@@ -444,8 +443,9 @@ static void emitMemsetExpansion(IRBuilder<> &Builder, Value *Dst, Value *Val,
}
for (uint64_t I = 0; I < Size; ++I) {
- Value *Offset = ConstantInt::get(Type::getInt32Ty(Ctx), I);
- Value *Ptr = Builder.CreateGEP(ElemTy, Dst, Offset, "gep");
+ Value *Zero = Builder.getInt32(0);
+ Value *Offset = Builder.getInt32(I);
+ Value *Ptr = Builder.CreateGEP(ArrTy, Dst, {Zero, Offset}, "gep");
Builder.CreateStore(TypedVal, Ptr);
}
}
@@ -478,9 +478,9 @@ static void legalizeMemCpy(Instruction &I,
ToRemove.push_back(CI);
}
-static void removeMemSet(Instruction &I,
- SmallVectorImpl<Instruction *> &ToRemove,
- DenseMap<Value *, Value *> &ReplacedValues) {
+static void legalizeMemSet(Instruction &I,
+ SmallVectorImpl<Instruction *> &ToRemove,
+ DenseMap<Value *, Value *> &ReplacedValues) {
CallInst *CI = dyn_cast<CallInst>(&I);
if (!CI)
@@ -562,6 +562,53 @@ legalizeGetHighLowi64Bytes(Instruction &I,
}
}
+static void
+legalizeScalarLoadStoreOnArrays(Instruction &I,
+ SmallVectorImpl<Instruction *> &ToRemove,
+ DenseMap<Value *, Value *> &) {
+
+ Value *PtrOp;
+ unsigned PtrOpIndex;
+ [[maybe_unused]] Type *LoadStoreTy;
+ if (auto *LI = dyn_cast<LoadInst>(&I)) {
+ PtrOp = LI->getPointerOperand();
+ PtrOpIndex = LI->getPointerOperandIndex();
+ LoadStoreTy = LI->getType();
+ } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ PtrOp = SI->getPointerOperand();
+ PtrOpIndex = SI->getPointerOperandIndex();
+ LoadStoreTy = SI->getValueOperand()->getType();
+ } else
+ return;
+
+ // If the load/store is not of a single-value type (i.e., scalar or vector)
+ // then we do not modify it. It shouldn't be a vector either because the
+ // dxil-data-scalarization pass is expected to run before this, but it's not
+ // incorrect to apply this transformation to vector load/stores.
+ if (!LoadStoreTy->isSingleValueType())
+ return;
+
+ Type *ArrayTy;
+ if (auto *GlobalVarPtrOp = dyn_cast<GlobalVariable>(PtrOp))
+ ArrayTy = GlobalVarPtrOp->getValueType();
+ else if (auto *AllocaPtrOp = dyn_cast<AllocaInst>(PtrOp))
+ ArrayTy = AllocaPtrOp->getAllocatedType();
+ else
+ return;
+
+ if (!isa<ArrayType>(ArrayTy))
+ return;
+
+ assert(ArrayTy->getArrayElementType() == LoadStoreTy &&
+ "Expected array element type to be the same as to the scalar load or "
+ "store type");
+
+ Value *Zero = ConstantInt::get(Type::getInt32Ty(I.getContext()), 0);
+ Value *GEP = GetElementPtrInst::Create(
+ ArrayTy, PtrOp, {Zero, Zero}, GEPNoWrapFlags::all(), "", I.getIterator());
+ I.setOperand(PtrOpIndex, GEP);
+}
+
namespace {
class DXILLegalizationPipeline {
@@ -603,7 +650,7 @@ private:
LegalizationPipeline[Stage1].push_back(legalizeGetHighLowi64Bytes);
LegalizationPipeline[Stage1].push_back(legalizeFreeze);
LegalizationPipeline[Stage1].push_back(legalizeMemCpy);
- LegalizationPipeline[Stage1].push_back(removeMemSet);
+ LegalizationPipeline[Stage1].push_back(legalizeMemSet);
LegalizationPipeline[Stage1].push_back(updateFnegToFsub);
// Note: legalizeGetHighLowi64Bytes and
// downcastI64toI32InsertExtractElements both modify extractelement, so they
@@ -612,6 +659,7 @@ private:
// downcastI64toI32InsertExtractElements needs to handle.
LegalizationPipeline[Stage2].push_back(
downcastI64toI32InsertExtractElements);
+ LegalizationPipeline[Stage2].push_back(legalizeScalarLoadStoreOnArrays);
}
};
diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
index 40fe6c6e639e..84751d2db226 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
@@ -107,10 +107,10 @@ public:
addPass(createDXILIntrinsicExpansionLegacyPass());
addPass(createDXILCBufferAccessLegacyPass());
addPass(createDXILDataScalarizationLegacyPass());
- addPass(createDXILFlattenArraysLegacyPass());
ScalarizerPassOptions DxilScalarOptions;
DxilScalarOptions.ScalarizeLoadStore = true;
addPass(createScalarizerPass(DxilScalarOptions));
+ addPass(createDXILFlattenArraysLegacyPass());
addPass(createDXILForwardHandleAccessesLegacyPass());
addPass(createDXILLegalizeLegacyPass());
addPass(createDXILResourceImplicitBindingLegacyPass());
diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index 5bd31707acb6..22cff7c80fa0 100644
--- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -43,12 +43,12 @@ namespace {
class HexagonDisassembler : public MCDisassembler {
public:
std::unique_ptr<MCInstrInfo const> const MCII;
- std::unique_ptr<MCInst *> CurrentBundle;
+ mutable std::unique_ptr<MCInst> CurrentBundle;
mutable MCInst const *CurrentExtender;
HexagonDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
MCInstrInfo const *MCII)
- : MCDisassembler(STI, Ctx), MCII(MCII), CurrentBundle(new MCInst *),
+ : MCDisassembler(STI, Ctx), MCII(MCII), CurrentBundle(nullptr),
CurrentExtender(nullptr) {}
DecodeStatus getSingleInstruction(MCInst &Instr, MCInst &MCB,
@@ -57,7 +57,23 @@ public:
DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
raw_ostream &CStream) const override;
+
+ DecodeStatus getInstructionBundle(MCInst &Instr, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &CStream) const override;
+
void remapInstruction(MCInst &Instr) const;
+
+private:
+ bool makeBundle(ArrayRef<uint8_t> Bytes, uint64_t Address,
+ uint64_t &BytesToSkip, raw_ostream &CS) const;
+
+ void resetBundle() const {
+ CurrentBundle.reset();
+ CurrentInstruction = nullptr;
+ }
+
+ mutable MCOperand *CurrentInstruction = nullptr;
};
static uint64_t fullValue(HexagonDisassembler const &Disassembler, MCInst &MI,
@@ -171,43 +187,88 @@ LLVMInitializeHexagonDisassembler() {
createHexagonDisassembler);
}
-DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
- ArrayRef<uint8_t> Bytes,
- uint64_t Address,
- raw_ostream &CS) const {
- CommentStream = &CS;
-
- DecodeStatus Result = DecodeStatus::Success;
+bool HexagonDisassembler::makeBundle(ArrayRef<uint8_t> Bytes, uint64_t Address,
+ uint64_t &BytesToSkip,
+ raw_ostream &CS) const {
bool Complete = false;
- Size = 0;
+ DecodeStatus Result = DecodeStatus::Success;
- *CurrentBundle = &MI;
- MI.setOpcode(Hexagon::BUNDLE);
- MI.addOperand(MCOperand::createImm(0));
+ CurrentBundle.reset(new MCInst);
+ CurrentBundle->setOpcode(Hexagon::BUNDLE);
+ CurrentBundle->addOperand(MCOperand::createImm(0));
while (Result == Success && !Complete) {
if (Bytes.size() < HEXAGON_INSTR_SIZE)
- return MCDisassembler::Fail;
+ return false;
MCInst *Inst = getContext().createMCInst();
- Result = getSingleInstruction(*Inst, MI, Bytes, Address, CS, Complete);
- MI.addOperand(MCOperand::createInst(Inst));
- Size += HEXAGON_INSTR_SIZE;
+ Result = getSingleInstruction(*Inst, *CurrentBundle, Bytes, Address, CS,
+ Complete);
+ CurrentBundle->addOperand(MCOperand::createInst(Inst));
+ BytesToSkip += HEXAGON_INSTR_SIZE;
Bytes = Bytes.slice(HEXAGON_INSTR_SIZE);
}
if (Result == MCDisassembler::Fail)
- return Result;
- if (Size > HEXAGON_MAX_PACKET_SIZE)
- return MCDisassembler::Fail;
+ return false;
+ if (BytesToSkip > HEXAGON_MAX_PACKET_SIZE)
+ return false;
const auto ArchSTI = Hexagon_MC::getArchSubtarget(&STI);
const auto STI_ = (ArchSTI != nullptr) ? *ArchSTI : STI;
- HexagonMCChecker Checker(getContext(), *MCII, STI_, MI,
+ HexagonMCChecker Checker(getContext(), *MCII, STI_, *CurrentBundle,
*getContext().getRegisterInfo(), false);
if (!Checker.check())
- return MCDisassembler::Fail;
- remapInstruction(MI);
+ return false;
+ remapInstruction(*CurrentBundle);
+ return true;
+}
+
+DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &CS) const {
+ CommentStream = &CS;
+
+ Size = 0;
+ uint64_t BytesToSkip = 0;
+
+ if (!CurrentBundle) {
+ if (!makeBundle(Bytes, Address, BytesToSkip, CS)) {
+ Size = BytesToSkip;
+ resetBundle();
+ return MCDisassembler::Fail;
+ }
+ CurrentInstruction = (CurrentBundle->begin() + 1);
+ }
+
+ MI = *(CurrentInstruction->getInst());
+ Size = HEXAGON_INSTR_SIZE;
+ if (++CurrentInstruction == CurrentBundle->end())
+ resetBundle();
return MCDisassembler::Success;
}
+DecodeStatus HexagonDisassembler::getInstructionBundle(MCInst &MI,
+ uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &CS) const {
+ CommentStream = &CS;
+ Size = 0;
+ uint64_t BytesToSkip = 0;
+ assert(!CurrentBundle);
+
+ if (!makeBundle(Bytes, Address, BytesToSkip, CS)) {
+ Size = BytesToSkip;
+ resetBundle();
+ return MCDisassembler::Fail;
+ }
+
+ MI = *CurrentBundle;
+ Size = HEXAGON_INSTR_SIZE * HexagonMCInstrInfo::bundleSize(MI);
+ resetBundle();
+
+ return Success;
+}
+
void HexagonDisassembler::remapInstruction(MCInst &Instr) const {
for (auto I: HexagonMCInstrInfo::bundleInstructions(Instr)) {
auto &MI = const_cast<MCInst &>(*I.getInst());
@@ -482,7 +543,7 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB,
unsigned Offset = 1;
bool Vector = HexagonMCInstrInfo::isVector(*MCII, MI);
bool PrevVector = false;
- auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
+ auto Instructions = HexagonMCInstrInfo::bundleInstructions(*CurrentBundle);
auto i = Instructions.end() - 1;
for (auto n = Instructions.begin() - 1;; --i, ++Offset) {
if (i == n)
diff --git a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
index bb7814c5226f..35da34ed0a89 100644
--- a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
@@ -1005,7 +1005,7 @@ bool MachineConstPropagator::rewrite(MachineFunction &MF) {
SmallVector<MachineBasicBlock*,2> ToRemove;
for (MachineBasicBlock *SB : B->successors()) {
if (!Targets.count(SB))
- ToRemove.push_back(const_cast<MachineBasicBlock*>(SB));
+ ToRemove.push_back(SB);
Targets.remove(SB);
}
for (MachineBasicBlock *MBB : ToRemove)
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 53943de3bc59..e285e0454369 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -1640,6 +1640,15 @@ bool HexagonDAGToDAGISel::DetectUseSxtw(SDValue &N, SDValue &R) {
R = N;
break;
}
+ case ISD::AssertSext: {
+ EVT T = cast<VTSDNode>(N.getOperand(1))->getVT();
+ if (T.getSizeInBits() == 32)
+ R = N.getOperand(0);
+ else
+ return false;
+ break;
+ }
+
default:
return false;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index ec73e58ce5d4..facea646d4b6 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -236,7 +236,16 @@ MVT HexagonTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
SDValue
HexagonTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG)
const {
- return SDValue();
+ unsigned IntNo = Op.getConstantOperandVal(0);
+ SDLoc dl(Op);
+ switch (IntNo) {
+ default:
+ return SDValue(); // Don't custom lower most intrinsics.
+ case Intrinsic::thread_pointer: {
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ return DAG.getNode(HexagonISD::THREAD_POINTER, dl, PtrVT);
+ }
+ }
}
/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
@@ -1588,6 +1597,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
@@ -1963,6 +1973,8 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
case HexagonISD::VROR: return "HexagonISD::VROR";
case HexagonISD::READCYCLE: return "HexagonISD::READCYCLE";
case HexagonISD::READTIMER: return "HexagonISD::READTIMER";
+ case HexagonISD::THREAD_POINTER:
+ return "HexagonISD::THREAD_POINTER";
case HexagonISD::PTRUE: return "HexagonISD::PTRUE";
case HexagonISD::PFALSE: return "HexagonISD::PFALSE";
case HexagonISD::D2P: return "HexagonISD::D2P";
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
index f9e5478f457f..9ebbbc6399b4 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -31,6 +31,7 @@ namespace llvm {
namespace HexagonISD {
+// clang-format off
enum NodeType : unsigned {
OP_BEGIN = ISD::BUILTIN_OP_END,
@@ -78,6 +79,7 @@ enum NodeType : unsigned {
DCFETCH,
READCYCLE,
READTIMER,
+ THREAD_POINTER,
PTRUE,
PFALSE,
D2P, // Convert 8-byte value to 8-bit predicate register. [*]
@@ -121,6 +123,7 @@ enum NodeType : unsigned {
};
} // end namespace HexagonISD
+// clang-format on
class HexagonSubtarget;
diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td
index 82d999ad820e..4b236708ca6d 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatterns.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -3432,6 +3432,11 @@ def HexagonREADTIMER: SDNode<"HexagonISD::READTIMER", SDTInt64Leaf,
def: Pat<(HexagonREADTIMER), (A4_tfrcpp UTIMER)>;
+def SDTInt32Leaf : SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>;
+def HexagonTHREADPOINTER : SDNode<"HexagonISD::THREAD_POINTER", SDTPtrLeaf>;
+
+def : Pat<(HexagonTHREADPOINTER), (i32(COPY UGP))>;
+
// The declared return value of the store-locked intrinsics is i32, but
// the instructions actually define i1. To avoid register copies from
// IntRegs to PredRegs and back, fold the entire pattern checking the
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index de7bd5d4b2c6..7d3074ba6b5d 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -46,16 +46,15 @@ class HexagonAsmBackend : public MCAsmBackend {
MCInst * Extender;
unsigned MaxPacketSize;
- void ReplaceInstruction(MCCodeEmitter &E, MCRelaxableFragment &RF,
- MCInst &HMB) const {
+ void ReplaceInstruction(MCCodeEmitter &E, MCFragment &RF, MCInst &HMB) const {
SmallVector<MCFixup, 4> Fixups;
SmallString<256> Code;
E.encodeInstruction(HMB, Code, Fixups, *RF.getSubtargetInfo());
// Update the fragment.
RF.setInst(HMB);
- RF.setContents(Code);
- RF.getFixups() = Fixups;
+ RF.setVarContents(Code);
+ RF.setVarFixups(Fixups);
}
public:
@@ -200,7 +199,7 @@ public:
}
bool shouldForceRelocation(const MCFixup &Fixup) {
- switch(Fixup.getTargetKind()) {
+ switch(Fixup.getKind()) {
default:
llvm_unreachable("Unknown Fixup Kind!");
@@ -438,21 +437,21 @@ public:
/// fixupNeedsRelaxation - Target specific predicate for whether a given
/// fixup requires the associated instruction to be relaxed.
- bool fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, const MCValue &,
- uint64_t Value,
+ bool fixupNeedsRelaxationAdvanced(const MCFragment &F, const MCFixup &Fixup,
+ const MCValue &, uint64_t Value,
bool Resolved) const override {
MCInst const &MCB = RelaxedMCB;
assert(HexagonMCInstrInfo::isBundle(MCB));
*RelaxTarget = nullptr;
MCInst &MCI = const_cast<MCInst &>(HexagonMCInstrInfo::instruction(
- MCB, Fixup.getOffset() / HEXAGON_INSTR_SIZE));
+ MCB, (Fixup.getOffset() - F.getFixedSize()) / HEXAGON_INSTR_SIZE));
bool Relaxable = isInstRelaxable(MCI);
if (Relaxable == false)
return false;
// If we cannot resolve the fixup value, it requires relaxation.
if (!Resolved) {
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
case fixup_Hexagon_B22_PCREL:
// GetFixupCount assumes B22 won't relax
[[fallthrough]];
@@ -595,7 +594,7 @@ public:
}
case MCFragment::FT_Relaxable: {
MCContext &Context = getContext();
- auto &RF = cast<MCRelaxableFragment>(*Frags[K]);
+ auto &RF = *Frags[K];
MCInst Inst = RF.getInst();
const bool WouldTraverseLabel = llvm::any_of(
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
index ed381c33225d..9752f3a13120 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
@@ -56,7 +56,7 @@ unsigned HexagonELFObjectWriter::getRelocType(const MCFixup &Fixup,
default:
break;
}
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
default:
report_fatal_error("Unrecognized relocation type");
break;
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
index 9030e43b7149..f83e06cd3d93 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
@@ -33,30 +33,18 @@ void HexagonInstPrinter::printRegName(raw_ostream &O, MCRegister Reg) {
void HexagonInstPrinter::printInst(const MCInst *MI, uint64_t Address,
StringRef Annot, const MCSubtargetInfo &STI,
raw_ostream &OS) {
- assert(HexagonMCInstrInfo::isBundle(*MI));
- assert(HexagonMCInstrInfo::bundleSize(*MI) <= HEXAGON_PACKET_SIZE);
- assert(HexagonMCInstrInfo::bundleSize(*MI) > 0);
- HasExtender = false;
- for (auto const &I : HexagonMCInstrInfo::bundleInstructions(*MI)) {
- MCInst const &MCI = *I.getInst();
- if (HexagonMCInstrInfo::isDuplex(MII, MCI)) {
- printInstruction(MCI.getOperand(1).getInst(), Address, OS);
- OS << '\v';
- HasExtender = false;
- printInstruction(MCI.getOperand(0).getInst(), Address, OS);
- } else
- printInstruction(&MCI, Address, OS);
- HasExtender = HexagonMCInstrInfo::isImmext(MCI);
- OS << "\n";
- }
-
- bool IsLoop0 = HexagonMCInstrInfo::isInnerLoop(*MI);
- bool IsLoop1 = HexagonMCInstrInfo::isOuterLoop(*MI);
- if (IsLoop0) {
- OS << (IsLoop1 ? " :endloop01" : " :endloop0");
- } else if (IsLoop1) {
- OS << " :endloop1";
+ if (HexagonMCInstrInfo::isDuplex(MII, *MI)) {
+ printInstruction(MI->getOperand(1).getInst(), Address, OS);
+ OS << '\v';
+ HasExtender = false;
+ printInstruction(MI->getOperand(0).getInst(), Address, OS);
+ } else {
+ printInstruction(MI, Address, OS);
}
+ HasExtender = HexagonMCInstrInfo::isImmext(*MI);
+ if ((MI->getOpcode() & HexagonII::INST_PARSE_MASK) ==
+ HexagonII::INST_PARSE_PACKET_END)
+ HasExtender = false;
}
void HexagonInstPrinter::printOperand(MCInst const *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 980df819b2c2..bfea50e2d6dc 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -252,8 +252,21 @@ public:
std::string Buffer;
{
raw_string_ostream TempStream(Buffer);
- InstPrinter.printInst(&Inst, Address, "", STI, TempStream);
+ for (auto &I : HexagonMCInstrInfo::bundleInstructions(Inst)) {
+ InstPrinter.printInst(I.getInst(), Address, "", STI, TempStream);
+ TempStream << "\n";
+ }
+ }
+
+ std::string LoopString = "";
+ bool IsLoop0 = HexagonMCInstrInfo::isInnerLoop(Inst);
+ bool IsLoop1 = HexagonMCInstrInfo::isOuterLoop(Inst);
+ if (IsLoop0) {
+ LoopString += (IsLoop1 ? " :endloop01" : " :endloop0");
+ } else if (IsLoop1) {
+ LoopString += " :endloop1";
}
+
StringRef Contents(Buffer);
auto PacketBundle = Contents.rsplit('\n');
auto HeadTail = PacketBundle.first.split('\n');
@@ -275,9 +288,9 @@ public:
}
if (HexagonMCInstrInfo::isMemReorderDisabled(Inst))
- OS << "\n\t} :mem_noshuf" << PacketBundle.second;
+ OS << "\n\t} :mem_noshuf" << LoopString;
else
- OS << "\t}" << PacketBundle.second;
+ OS << "\t}" << LoopString;
}
void finish() override { finishAttributeSection(); }
diff --git a/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp b/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp
index 93beaec7eeff..3c3924bd5018 100644
--- a/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp
+++ b/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp
@@ -59,8 +59,7 @@ void LanaiFrameLowering::determineFrameLayout(MachineFunction &MF) const {
// ADJDYNALLOC pseudo instructions with a Lanai:ADDI with the
// maximum call frame size as the immediate.
void LanaiFrameLowering::replaceAdjDynAllocPseudo(MachineFunction &MF) const {
- const LanaiInstrInfo &LII =
- *static_cast<const LanaiInstrInfo *>(STI.getInstrInfo());
+ const LanaiInstrInfo &LII = *STI.getInstrInfo();
unsigned MaxCallFrameSize = MF.getFrameInfo().getMaxCallFrameSize();
for (MachineBasicBlock &MBB : MF) {
@@ -88,8 +87,7 @@ void LanaiFrameLowering::emitPrologue(MachineFunction &MF,
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
MachineFrameInfo &MFI = MF.getFrameInfo();
- const LanaiInstrInfo &LII =
- *static_cast<const LanaiInstrInfo *>(STI.getInstrInfo());
+ const LanaiInstrInfo &LII = *STI.getInstrInfo();
MachineBasicBlock::iterator MBBI = MBB.begin();
// Debug location must be unknown since the first debug location is used
@@ -173,8 +171,7 @@ MachineBasicBlock::iterator LanaiFrameLowering::eliminateCallFramePseudoInstr(
void LanaiFrameLowering::emitEpilogue(MachineFunction & /*MF*/,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
- const LanaiInstrInfo &LII =
- *static_cast<const LanaiInstrInfo *>(STI.getInstrInfo());
+ const LanaiInstrInfo &LII = *STI.getInstrInfo();
DebugLoc DL = MBBI->getDebugLoc();
// Restore the stack pointer using the callee's frame pointer value.
@@ -195,8 +192,7 @@ void LanaiFrameLowering::determineCalleeSaves(MachineFunction &MF,
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
MachineFrameInfo &MFI = MF.getFrameInfo();
- const LanaiRegisterInfo *LRI =
- static_cast<const LanaiRegisterInfo *>(STI.getRegisterInfo());
+ const LanaiRegisterInfo *LRI = STI.getRegisterInfo();
int Offset = -4;
// Reserve 4 bytes for the saved RCA
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
index d5a5f17348e4..36c3011be2b9 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file describes the baisc single-precision floating-point instructions.
+// This file describes the basic single-precision floating-point instructions.
//
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
index ac5e7f3891c7..1493bf4cba69 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
@@ -158,7 +158,12 @@ void LoongArchFrameLowering::processFunctionBeforeFrameFinalized(
// estimateStackSize has been observed to under-estimate the final stack
// size, so give ourselves wiggle-room by checking for stack size
// representable an 11-bit signed field rather than 12-bits.
- if (!isInt<11>(MFI.estimateStackSize(MF)))
+ // The [x]vstelm.{b/h/w/d} memory instructions only take an 8-bit immediate
+ // offset, so with LSX also check for a 7-bit signed field.
+ unsigned EstimateStackSize = MFI.estimateStackSize(MF);
+ if (!isInt<11>(EstimateStackSize) ||
+ (MF.getSubtarget<LoongArchSubtarget>().hasExtLSX() &&
+ !isInt<7>(EstimateStackSize)))
ScavSlotsNum = std::max(ScavSlotsNum, 1u);
// For CFR spill.
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index c47987fbf683..2378664ca815 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2597,12 +2597,9 @@ LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
EVT VecTy = Op->getOperand(0)->getValueType(0);
SDValue Idx = Op->getOperand(1);
- EVT EltTy = VecTy.getVectorElementType();
unsigned NumElts = VecTy.getVectorNumElements();
- if (isa<ConstantSDNode>(Idx) &&
- (EltTy == MVT::i32 || EltTy == MVT::i64 || EltTy == MVT::f32 ||
- EltTy == MVT::f64 || Idx->getAsZExtVal() < NumElts / 2))
+ if (isa<ConstantSDNode>(Idx) && Idx->getAsZExtVal() < NumElts)
return Op;
return SDValue();
@@ -6003,10 +6000,9 @@ emitPseudoXVINSGR2VR(MachineInstr &MI, MachineBasicBlock *BB,
Register ScratchReg1 = XSrc;
if (Idx >= HalfSize) {
ScratchReg1 = MRI.createVirtualRegister(RC);
- BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_Q), ScratchReg1)
+ BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_D), ScratchReg1)
.addReg(XSrc)
- .addReg(XSrc)
- .addImm(1);
+ .addImm(14);
}
Register ScratchSubReg1 = MRI.createVirtualRegister(SubRC);
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 95e9fd49d1c0..a0107e44b421 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1282,6 +1282,32 @@ multiclass PatCCXrXrF<CondCode CC, string Inst> {
(!cast<LAInst>(Inst#"_D") LASX256:$xj, LASX256:$xk)>;
}
+multiclass PairInsertExtractPatV8<ValueType vecty, ValueType elemty> {
+ foreach imm1 = 0...3 in {
+ foreach imm2 = 0...3 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert (vector_insert vecty:$xd,
+ (elemty (vector_extract vecty:$xj, imm1)), imm2),
+ (elemty (vector_extract vecty:$xj, !add(imm1, 4))),
+ !add(imm2, 4)),
+ (XVEXTRINS_W $xd, $xj, Imm)>;
+ }
+ }
+}
+
+multiclass PairInsertExtractPatV4<ValueType vecty, ValueType elemty> {
+ foreach imm1 = 0...1 in {
+ foreach imm2 = 0...1 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert (vector_insert vecty:$xd,
+ (elemty (vector_extract vecty:$xj, imm1)), imm2),
+ (elemty (vector_extract vecty:$xj, !add(imm1, 2))),
+ !add(imm2, 2)),
+ (XVEXTRINS_D $xd, $xj, Imm)>;
+ }
+ }
+}
+
let Predicates = [HasExtLASX] in {
// XVADD_{B/H/W/D}
@@ -1582,6 +1608,38 @@ defm : PatCCXrXrF<SETUNE, "XVFCMP_CUNE">;
defm : PatCCXrXrF<SETO, "XVFCMP_COR">;
defm : PatCCXrXrF<SETUO, "XVFCMP_CUN">;
+// Insert two elements extracted from a vector into a vector. (Within both the
+// source and the destination vector, the two elements must occupy the same
+// position in the low and high 128-bit halves.)
+// 2*XVPICKVE2GR_{W/D} + 2*XVINSGR2VR_{W/D} -> XVEXTRINS_{W/D}
+// XVPERMI_D + 2*XVPICKVE2GR_{B/H} + 2*PseudoXVINSGR2VR_{B/H} -> XVEXTRINS_{B/H}
+foreach imm1 = 0...15 in {
+ foreach imm2 = 0...15 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert (vector_insert v32i8:$xd,
+ (GRLenVT (vector_extract v32i8:$xj, imm1)), imm2),
+ (GRLenVT (vector_extract v32i8:$xj, !add(imm1, 16))),
+ !add(imm2, 16)),
+ (XVEXTRINS_B $xd, $xj, Imm)>;
+ }
+}
+
+foreach imm1 = 0...7 in {
+ foreach imm2 = 0...7 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert (vector_insert v16i16:$xd,
+ (GRLenVT (vector_extract v16i16:$xj, imm1)), imm2),
+ (GRLenVT (vector_extract v16i16:$xj, !add(imm1, 8))),
+ !add(imm2, 8)),
+ (XVEXTRINS_H $xd, $xj, Imm)>;
+ }
+}
+
+defm : PairInsertExtractPatV8<v8i32, GRLenVT>;
+defm : PairInsertExtractPatV8<v8f32, f32>;
+defm : PairInsertExtractPatV4<v4i64, GRLenVT>;
+defm : PairInsertExtractPatV4<v4f64, f64>;
+
// PseudoXVINSGR2VR_{B/H}
def : Pat<(vector_insert v32i8:$xd, GRLenVT:$rj, uimm5:$imm),
(PseudoXVINSGR2VR_B v32i8:$xd, GRLenVT:$rj, uimm5:$imm)>;
@@ -1593,11 +1651,18 @@ def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
(XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
(XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;
-
-def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm),
- (XVINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
-def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm),
- (XVINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;
+def : Pat<(vector_insert v8f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm3:$imm),
+ (XVINSGR2VR_W $vd, $rj, uimm3:$imm)>;
+def : Pat<(vector_insert v4f64:$vd, (f64 (bitconvert i64:$rj)), uimm2:$imm),
+ (XVINSGR2VR_D $vd, $rj, uimm2:$imm)>;
+def : Pat<(vector_insert v8f32:$xd, (f32 (vector_extract v8f32:$xj, uimm3:$imm1)), uimm3:$imm2),
+ (XVINSGR2VR_W $xd, (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm1), uimm3:$imm2)>;
+def : Pat<(vector_insert v4f64:$xd, (f64 (vector_extract v4f64:$xj, uimm2:$imm1)), uimm2:$imm2),
+ (XVINSGR2VR_D $xd, (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm1), uimm2:$imm2)>;
+def : Pat<(vector_insert v8f32:$xd, FPR32:$fj, uimm3:$imm),
+ (XVINSGR2VR_W $xd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
+def : Pat<(vector_insert v4f64:$xd, FPR64:$fj, uimm2:$imm),
+ (XVINSGR2VR_D $xd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;
// scalar_to_vector
def : Pat<(v8f32 (scalar_to_vector FPR32:$fj)),
@@ -1790,7 +1855,25 @@ foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in {
def : RegRegStPat<store, XVSTX, LASX256, vt>;
}
+// Bitcast float/double element extracted from vector to integer.
+def : Pat<(loongarch_movfr2gr_s_la64 (f32 (vector_extract v8f32:$xj, uimm3:$imm))),
+ (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm)>;
+def : Pat<(i64 (bitconvert (f64 (vector_extract v4f64:$xj, uimm2:$imm)))),
+ (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm)>;
+
// Vector extraction with constant index.
+foreach imm = 16...31 in {
+ defvar Imm = !and(imm, 15);
+ def : Pat<(i64 (vector_extract v32i8:$xj, imm)),
+ (VPICKVE2GR_B (EXTRACT_SUBREG (XVPERMI_D v32i8:$xj, 14), sub_128),
+ Imm)>;
+}
+foreach imm = 8...15 in {
+ defvar Imm = !and(imm, 7);
+ def : Pat<(i64 (vector_extract v16i16:$xj, imm)),
+ (VPICKVE2GR_H (EXTRACT_SUBREG (XVPERMI_D v16i16:$xj, 14), sub_128),
+ Imm)>;
+}
def : Pat<(i64 (vector_extract v32i8:$xj, uimm4:$imm)),
(VPICKVE2GR_B (EXTRACT_SUBREG v32i8:$xj, sub_128), uimm4:$imm)>;
def : Pat<(i64 (vector_extract v16i16:$xj, uimm3:$imm)),
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index d73d78083ddc..962e7c21431b 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -1482,6 +1482,28 @@ multiclass VstelmPat<PatFrag StoreOp, ValueType vt, LAInst Inst,
(Inst vt:$vd, BaseAddr:$rj, ImmOpnd:$imm, IdxOpnd:$idx)>;
}
+multiclass InsertExtractPatV4<ValueType vecty, ValueType elemty> {
+ foreach imm1 = 0...3 in {
+ foreach imm2 = 0...3 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert vecty:$vd,
+ (elemty (vector_extract vecty:$vj, imm1)), imm2),
+ (VEXTRINS_W $vd, $vj, Imm)>;
+ }
+ }
+}
+
+multiclass InsertExtractPatV2<ValueType vecty, ValueType elemty> {
+ foreach imm1 = 0...1 in {
+ foreach imm2 = 0...1 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert vecty:$vd,
+ (elemty (vector_extract vecty:$vj, imm1)), imm2),
+ (VEXTRINS_D $vd, $vj, Imm)>;
+ }
+ }
+}
+
let Predicates = [HasExtLSX] in {
// VADD_{B/H/W/D}
@@ -1782,6 +1804,31 @@ defm : PatCCVrVrF<SETUNE, "VFCMP_CUNE">;
defm : PatCCVrVrF<SETO, "VFCMP_COR">;
defm : PatCCVrVrF<SETUO, "VFCMP_CUN">;
+// Insert element extracted from vector into vector.
+// VPICKVE2GR_{B/H/W/D} + VINSGR2VR_{B/H/W/D} -> VEXTRINS_{B/H/W/D}
+foreach imm1 = 0...15 in {
+ foreach imm2 = 0...15 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert v16i8:$vd,
+ (GRLenVT (vector_extract v16i8:$vj, imm1)), imm2),
+ (VEXTRINS_B $vd, $vj, Imm)>;
+ }
+}
+
+foreach imm1 = 0...7 in {
+ foreach imm2 = 0...7 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert v8i16:$vd,
+ (GRLenVT (vector_extract v8i16:$vj, imm1)), imm2),
+ (VEXTRINS_H $vd, $vj, Imm)>;
+ }
+}
+
+defm : InsertExtractPatV4<v4i32, GRLenVT>;
+defm : InsertExtractPatV4<v4f32, f32>;
+defm : InsertExtractPatV2<v2i64, GRLenVT>;
+defm : InsertExtractPatV2<v2f64, f64>;
+
// VINSGR2VR_{B/H/W/D}
def : Pat<(vector_insert v16i8:$vd, GRLenVT:$rj, uimm4:$imm),
(VINSGR2VR_B v16i8:$vd, GRLenVT:$rj, uimm4:$imm)>;
@@ -1791,7 +1838,10 @@ def : Pat<(vector_insert v4i32:$vd, GRLenVT:$rj, uimm2:$imm),
(VINSGR2VR_W v4i32:$vd, GRLenVT:$rj, uimm2:$imm)>;
def : Pat<(vector_insert v2i64:$vd, GRLenVT:$rj, uimm1:$imm),
(VINSGR2VR_D v2i64:$vd, GRLenVT:$rj, uimm1:$imm)>;
-
+def : Pat<(vector_insert v4f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm2:$imm),
+ (VINSGR2VR_W $vd, $rj, uimm2:$imm)>;
+def : Pat<(vector_insert v2f64:$vd, (f64 (bitconvert i64:$rj)), uimm1:$imm),
+ (VINSGR2VR_D $vd, $rj, uimm1:$imm)>;
def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, uimm2:$imm),
(VINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm2:$imm)>;
def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, uimm1:$imm),
@@ -1990,6 +2040,12 @@ foreach vt = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in {
def : RegRegStPat<store, VSTX, LSX128, vt>;
}
+// Bitcast float/double element extracted from vector to integer.
+def : Pat<(loongarch_movfr2gr_s_la64 (f32 (vector_extract v4f32:$vj, uimm2:$imm))),
+ (VPICKVE2GR_W v4f32:$vj, uimm2:$imm)>;
+def : Pat<(i64 (bitconvert (f64 (vector_extract v2f64:$vj, uimm1:$imm)))),
+ (VPICKVE2GR_D v2f64:$vj, uimm1:$imm)>;
+
// Vector extraction with constant index.
def : Pat<(i64 (vector_extract v16i8:$vj, uimm4:$imm)),
(VPICKVE2GR_B v16i8:$vj, uimm4:$imm)>;
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
index 1b8893029bb3..7b9f1156f910 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
@@ -90,7 +90,7 @@ static void reportOutOfRangeError(MCContext &Ctx, SMLoc Loc, unsigned N) {
static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
MCContext &Ctx) {
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
default:
llvm_unreachable("Unknown fixup kind");
case FK_Data_1:
@@ -157,7 +157,7 @@ void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
MCContext &Ctx = getContext();
// Fixup leb128 separately.
- if (Fixup.getTargetKind() == FK_Data_leb128)
+ if (Fixup.getKind() == FK_Data_leb128)
return fixupLeb128(Ctx, Fixup, Data, Value);
// Apply any target-specific value adjustments.
@@ -247,7 +247,7 @@ bool LoongArchAsmBackend::shouldInsertFixupForCodeAlign(MCAssembler &Asm,
bool LoongArchAsmBackend::shouldForceRelocation(const MCFixup &Fixup,
const MCValue &Target) {
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
default:
return STI.hasFeature(LoongArch::FeatureRelax);
case FK_Data_1:
@@ -279,23 +279,23 @@ getRelocPairForSize(unsigned Size) {
}
}
-std::pair<bool, bool> LoongArchAsmBackend::relaxLEB128(MCLEBFragment &LF,
+std::pair<bool, bool> LoongArchAsmBackend::relaxLEB128(MCFragment &F,
int64_t &Value) const {
- const MCExpr &Expr = LF.getValue();
- if (LF.isSigned() || !Expr.evaluateKnownAbsolute(Value, *Asm))
+ const MCExpr &Expr = F.getLEBValue();
+ if (F.isLEBSigned() || !Expr.evaluateKnownAbsolute(Value, *Asm))
return std::make_pair(false, false);
- LF.addFixup(MCFixup::create(0, &Expr, FK_Data_leb128));
+ F.setVarFixups({MCFixup::create(0, &Expr, FK_Data_leb128)});
return std::make_pair(true, true);
}
-bool LoongArchAsmBackend::relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF,
+bool LoongArchAsmBackend::relaxDwarfLineAddr(MCFragment &F,
bool &WasRelaxed) const {
MCContext &C = getContext();
- int64_t LineDelta = DF.getLineDelta();
- const MCExpr &AddrDelta = DF.getAddrDelta();
+ int64_t LineDelta = F.getDwarfLineDelta();
+ const MCExpr &AddrDelta = F.getDwarfAddrDelta();
SmallVector<MCFixup, 1> Fixups;
- size_t OldSize = DF.getContents().size();
+ size_t OldSize = F.getVarSize();
int64_t Value;
if (AddrDelta.evaluateAsAbsolute(Value, *Asm))
@@ -349,17 +349,16 @@ bool LoongArchAsmBackend::relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF,
OS << uint8_t(dwarf::DW_LNS_copy);
}
- DF.setContents(Data);
- DF.setFixups(Fixups);
+ F.setVarContents(Data);
+ F.setVarFixups(Fixups);
WasRelaxed = OldSize != Data.size();
return true;
}
-bool LoongArchAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF,
- bool &WasRelaxed) const {
- const MCExpr &AddrDelta = DF.getAddrDelta();
+bool LoongArchAsmBackend::relaxDwarfCFA(MCFragment &F, bool &WasRelaxed) const {
+ const MCExpr &AddrDelta = F.getDwarfAddrDelta();
SmallVector<MCFixup, 2> Fixups;
- size_t OldSize = DF.getContents().size();
+ size_t OldSize = F.getVarContents().size();
int64_t Value;
if (AddrDelta.evaluateAsAbsolute(Value, *Asm))
@@ -371,9 +370,9 @@ bool LoongArchAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF,
assert(getContext().getAsmInfo()->getMinInstAlignment() == 1 &&
"expected 1-byte alignment");
if (Value == 0) {
- DF.clearContents();
- DF.clearFixups();
- WasRelaxed = OldSize != DF.getContents().size();
+ F.clearVarContents();
+ F.clearVarFixups();
+ WasRelaxed = OldSize != 0;
return true;
}
@@ -405,8 +404,8 @@ bool LoongArchAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF,
} else {
llvm_unreachable("unsupported CFA encoding");
}
- DF.setContents(Data);
- DF.setFixups(Fixups);
+ F.setVarContents(Data);
+ F.setVarFixups(Fixups);
WasRelaxed = OldSize != Data.size();
return true;
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
index 4446cadf11e2..b32ba067810c 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
@@ -59,11 +59,9 @@ public:
MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override;
- bool relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF,
- bool &WasRelaxed) const override;
- bool relaxDwarfCFA(MCDwarfCallFrameFragment &DF,
- bool &WasRelaxed) const override;
- std::pair<bool, bool> relaxLEB128(MCLEBFragment &LF,
+ bool relaxDwarfLineAddr(MCFragment &F, bool &WasRelaxed) const override;
+ bool relaxDwarfCFA(MCFragment &F, bool &WasRelaxed) const override;
+ std::pair<bool, bool> relaxLEB128(MCFragment &F,
int64_t &Value) const override;
bool writeNopData(raw_ostream &OS, uint64_t Count,
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
index faf3cba59a53..fb741afa77e5 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
@@ -68,7 +68,7 @@ unsigned LoongArchELFObjectWriter::getRelocType(const MCFixup &Fixup,
break;
}
- unsigned Kind = Fixup.getTargetKind();
+ auto Kind = Fixup.getKind();
if (mc::isRelocation(Fixup.getKind()))
return Kind;
switch (Kind) {
diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp
index 1fdc1f799fe5..117dd31e7f05 100644
--- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp
+++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp
@@ -30,7 +30,7 @@ protected:
unsigned getRelocType(const MCFixup &Fixup, const MCValue &,
bool IsPCRel) const override {
// Translate fixup kind to ELF relocation type.
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
case FK_Data_1: return ELF::R_MSP430_8;
case FK_Data_2: return ELF::R_MSP430_16_BYTE;
case FK_Data_4: return ELF::R_MSP430_32;
diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 01e4d17f6236..259b71b37d9a 100644
--- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -2101,7 +2101,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
TOut.getStreamer().emitRelocDirective(
*TmpExpr, inMicroMipsMode() ? "R_MICROMIPS_JALR" : "R_MIPS_JALR",
- RelocJalrExpr, IDLoc, *STI);
+ RelocJalrExpr);
TOut.getStreamer().emitLabel(TmpLabel);
}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
index 8b73a7bdd4bc..8ccd42ea0abf 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
+++ b/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
@@ -8,7 +8,6 @@ add_llvm_component_library(LLVMMipsDesc
MipsMCAsmInfo.cpp
MipsMCCodeEmitter.cpp
MipsMCTargetDesc.cpp
- MipsNaClELFStreamer.cpp
MipsOptionRecord.cpp
MipsTargetStreamer.cpp
MipsWinCOFFObjectWriter.cpp
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 25e31941bbb4..ad8f5f0a0974 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -156,7 +156,7 @@ unsigned MipsELFObjectWriter::getRelocType(const MCFixup &Fixup,
const MCValue &Target,
bool IsPCRel) const {
// Determine the type of the relocation.
- unsigned Kind = Fixup.getTargetKind();
+ auto Kind = Fixup.getKind();
switch (Target.getSpecifier()) {
case Mips::S_DTPREL:
case Mips::S_DTPREL_HI:
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
deleted file mode 100644
index 94b2f412c8cd..000000000000
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
+++ /dev/null
@@ -1,31 +0,0 @@
-//===-- MipsMCNaCl.h - NaCl-related declarations --------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCNACL_H
-#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCNACL_H
-
-#include "llvm/MC/MCELFStreamer.h"
-#include "llvm/Support/Alignment.h"
-
-namespace llvm {
-
-// NaCl MIPS sandbox's instruction bundle size.
-static const Align MIPS_NACL_BUNDLE_ALIGN = Align(16);
-
-bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx,
- bool *IsStore = nullptr);
-bool baseRegNeedsLoadStoreMask(MCRegister Reg);
-
-// This function creates an MCELFStreamer for Mips NaCl.
-MCELFStreamer *
-createMipsNaClELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- std::unique_ptr<MCObjectWriter> OW,
- std::unique_ptr<MCCodeEmitter> Emitter);
-}
-
-#endif
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index ab1eda0f48e1..2cc634154bff 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -16,7 +16,6 @@
#include "MipsELFStreamer.h"
#include "MipsInstPrinter.h"
#include "MipsMCAsmInfo.h"
-#include "MipsMCNaCl.h"
#include "MipsTargetStreamer.h"
#include "TargetInfo/MipsTargetInfo.h"
#include "llvm/DebugInfo/CodeView/CodeView.h"
@@ -199,12 +198,8 @@ static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
std::unique_ptr<MCObjectWriter> &&OW,
std::unique_ptr<MCCodeEmitter> &&Emitter) {
MCStreamer *S;
- if (!T.isOSNaCl())
- S = createMipsELFStreamer(Context, std::move(MAB), std::move(OW),
- std::move(Emitter));
- else
- S = createMipsNaClELFStreamer(Context, std::move(MAB), std::move(OW),
- std::move(Emitter));
+ S = createMipsELFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(Emitter));
return S;
}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
deleted file mode 100644
index 3410726c8e55..000000000000
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
+++ /dev/null
@@ -1,274 +0,0 @@
-//===-- MipsNaClELFStreamer.cpp - ELF Object Output for Mips NaCl ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements MCELFStreamer for Mips NaCl. It emits .o object files
-// as required by NaCl's SFI sandbox. It inserts address-masking instructions
-// before dangerous control-flow and memory access instructions. It inserts
-// address-masking instructions after instructions that change the stack
-// pointer. It ensures that the mask and the dangerous instruction are always
-// emitted in the same bundle. It aligns call + branch delay to the bundle end,
-// so that return address is always aligned to the start of next bundle.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MipsELFStreamer.h"
-#include "MipsMCNaCl.h"
-#include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCELFStreamer.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCObjectWriter.h"
-#include "llvm/Support/ErrorHandling.h"
-#include <cassert>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "mips-mc-nacl"
-
-namespace {
-
-const unsigned IndirectBranchMaskReg = Mips::T6;
-const unsigned LoadStoreStackMaskReg = Mips::T7;
-
-/// Extend the generic MCELFStreamer class so that it can mask dangerous
-/// instructions.
-
-class MipsNaClELFStreamer : public MipsELFStreamer {
-public:
- MipsNaClELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- std::unique_ptr<MCObjectWriter> OW,
- std::unique_ptr<MCCodeEmitter> Emitter)
- : MipsELFStreamer(Context, std::move(TAB), std::move(OW),
- std::move(Emitter)) {}
-
- ~MipsNaClELFStreamer() override = default;
-
-private:
- // Whether we started the sandboxing sequence for calls. Calls are bundled
- // with branch delays and aligned to the bundle end.
- bool PendingCall = false;
-
- bool isIndirectJump(const MCInst &MI) {
- if (MI.getOpcode() == Mips::JALR) {
- // MIPS32r6/MIPS64r6 doesn't have a JR instruction and uses JALR instead.
- // JALR is an indirect branch if the link register is $0.
- assert(MI.getOperand(0).isReg());
- return MI.getOperand(0).getReg() == Mips::ZERO;
- }
- return MI.getOpcode() == Mips::JR;
- }
-
- bool isStackPointerFirstOperand(const MCInst &MI) {
- return (MI.getNumOperands() > 0 && MI.getOperand(0).isReg()
- && MI.getOperand(0).getReg() == Mips::SP);
- }
-
- bool isCall(const MCInst &MI, bool *IsIndirectCall) {
- unsigned Opcode = MI.getOpcode();
-
- *IsIndirectCall = false;
-
- switch (Opcode) {
- default:
- return false;
-
- case Mips::JAL:
- case Mips::BAL:
- case Mips::BAL_BR:
- case Mips::BLTZAL:
- case Mips::BGEZAL:
- return true;
-
- case Mips::JALR:
- // JALR is only a call if the link register is not $0. Otherwise it's an
- // indirect branch.
- assert(MI.getOperand(0).isReg());
- if (MI.getOperand(0).getReg() == Mips::ZERO)
- return false;
-
- *IsIndirectCall = true;
- return true;
- }
- }
-
- void emitMask(MCRegister AddrReg, unsigned MaskReg,
- const MCSubtargetInfo &STI) {
- MCInst MaskInst;
- MaskInst.setOpcode(Mips::AND);
- MaskInst.addOperand(MCOperand::createReg(AddrReg));
- MaskInst.addOperand(MCOperand::createReg(AddrReg));
- MaskInst.addOperand(MCOperand::createReg(MaskReg));
- MipsELFStreamer::emitInstruction(MaskInst, STI);
- }
-
- // Sandbox indirect branch or return instruction by inserting mask operation
- // before it.
- void sandboxIndirectJump(const MCInst &MI, const MCSubtargetInfo &STI) {
- MCRegister AddrReg = MI.getOperand(0).getReg();
-
- emitBundleLock(false);
- emitMask(AddrReg, IndirectBranchMaskReg, STI);
- MipsELFStreamer::emitInstruction(MI, STI);
- emitBundleUnlock();
- }
-
- // Sandbox memory access or SP change. Insert mask operation before and/or
- // after the instruction.
- void sandboxLoadStoreStackChange(const MCInst &MI, unsigned AddrIdx,
- const MCSubtargetInfo &STI, bool MaskBefore,
- bool MaskAfter) {
- emitBundleLock(false);
- if (MaskBefore) {
- // Sandbox memory access.
- MCRegister BaseReg = MI.getOperand(AddrIdx).getReg();
- emitMask(BaseReg, LoadStoreStackMaskReg, STI);
- }
- MipsELFStreamer::emitInstruction(MI, STI);
- if (MaskAfter) {
- // Sandbox SP change.
- MCRegister SPReg = MI.getOperand(0).getReg();
- assert((Mips::SP == SPReg) && "Unexpected stack-pointer register.");
- emitMask(SPReg, LoadStoreStackMaskReg, STI);
- }
- emitBundleUnlock();
- }
-
-public:
- /// This function is the one used to emit instruction data into the ELF
- /// streamer. We override it to mask dangerous instructions.
- void emitInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI) override {
- // Sandbox indirect jumps.
- if (isIndirectJump(Inst)) {
- if (PendingCall)
- report_fatal_error("Dangerous instruction in branch delay slot!");
- sandboxIndirectJump(Inst, STI);
- return;
- }
-
- // Sandbox loads, stores and SP changes.
- unsigned AddrIdx = 0;
- bool IsStore = false;
- bool IsMemAccess = isBasePlusOffsetMemoryAccess(Inst.getOpcode(), &AddrIdx,
- &IsStore);
- bool IsSPFirstOperand = isStackPointerFirstOperand(Inst);
- if (IsMemAccess || IsSPFirstOperand) {
- bool MaskBefore = (IsMemAccess
- && baseRegNeedsLoadStoreMask(Inst.getOperand(AddrIdx)
- .getReg()));
- bool MaskAfter = IsSPFirstOperand && !IsStore;
- if (MaskBefore || MaskAfter) {
- if (PendingCall)
- report_fatal_error("Dangerous instruction in branch delay slot!");
- sandboxLoadStoreStackChange(Inst, AddrIdx, STI, MaskBefore, MaskAfter);
- return;
- }
- // fallthrough
- }
-
- // Sandbox calls by aligning call and branch delay to the bundle end.
- // For indirect calls, emit the mask before the call.
- bool IsIndirectCall;
- if (isCall(Inst, &IsIndirectCall)) {
- if (PendingCall)
- report_fatal_error("Dangerous instruction in branch delay slot!");
-
- // Start the sandboxing sequence by emitting call.
- emitBundleLock(true);
- if (IsIndirectCall) {
- MCRegister TargetReg = Inst.getOperand(1).getReg();
- emitMask(TargetReg, IndirectBranchMaskReg, STI);
- }
- MipsELFStreamer::emitInstruction(Inst, STI);
- PendingCall = true;
- return;
- }
- if (PendingCall) {
- // Finish the sandboxing sequence by emitting branch delay.
- MipsELFStreamer::emitInstruction(Inst, STI);
- emitBundleUnlock();
- PendingCall = false;
- return;
- }
-
- // None of the sandboxing applies, just emit the instruction.
- MipsELFStreamer::emitInstruction(Inst, STI);
- }
-};
-
-} // end anonymous namespace
-
-namespace llvm {
-
-bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx,
- bool *IsStore) {
- if (IsStore)
- *IsStore = false;
-
- switch (Opcode) {
- default:
- return false;
-
- // Load instructions with base address register in position 1.
- case Mips::LB:
- case Mips::LBu:
- case Mips::LH:
- case Mips::LHu:
- case Mips::LW:
- case Mips::LWC1:
- case Mips::LDC1:
- case Mips::LL:
- case Mips::LL_R6:
- case Mips::LWL:
- case Mips::LWR:
- *AddrIdx = 1;
- return true;
-
- // Store instructions with base address register in position 1.
- case Mips::SB:
- case Mips::SH:
- case Mips::SW:
- case Mips::SWC1:
- case Mips::SDC1:
- case Mips::SWL:
- case Mips::SWR:
- *AddrIdx = 1;
- if (IsStore)
- *IsStore = true;
- return true;
-
- // Store instructions with base address register in position 2.
- case Mips::SC:
- case Mips::SC_R6:
- *AddrIdx = 2;
- if (IsStore)
- *IsStore = true;
- return true;
- }
-}
-
-bool baseRegNeedsLoadStoreMask(MCRegister Reg) {
- // The contents of SP and thread pointer register do not require masking.
- return Reg != Mips::SP && Reg != Mips::T8;
-}
-
-MCELFStreamer *
-createMipsNaClELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- std::unique_ptr<MCObjectWriter> OW,
- std::unique_ptr<MCCodeEmitter> Emitter) {
- MipsNaClELFStreamer *S = new MipsNaClELFStreamer(
- Context, std::move(TAB), std::move(OW), std::move(Emitter));
-
- // Set bundle-alignment as required by the NaCl ABI for the target.
- S->emitBundleAlignMode(MIPS_NACL_BUNDLE_ALIGN);
-
- return S;
-}
-
-} // end namespace llvm
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index c69fc68ab5af..b89d6890903d 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -1033,42 +1033,42 @@ MCELFStreamer &MipsTargetELFStreamer::getStreamer() {
}
void MipsTargetELFStreamer::emitGPRel32Value(const MCExpr *Value) {
- MCDataFragment *DF = getStreamer().getOrCreateDataFragment();
+ MCFragment *DF = getStreamer().getOrCreateDataFragment();
DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
Mips::fixup_Mips_GPREL32));
DF->appendContents(4, 0);
}
void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) {
- MCDataFragment *DF = getStreamer().getOrCreateDataFragment();
+ MCFragment *DF = getStreamer().getOrCreateDataFragment();
DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
Mips::fixup_Mips_GPREL32));
DF->appendContents(8, 0);
}
void MipsTargetELFStreamer::emitDTPRel32Value(const MCExpr *Value) {
- MCDataFragment *DF = getStreamer().getOrCreateDataFragment();
+ MCFragment *DF = getStreamer().getOrCreateDataFragment();
DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
Mips::fixup_Mips_DTPREL32));
DF->appendContents(4, 0);
}
void MipsTargetELFStreamer::emitDTPRel64Value(const MCExpr *Value) {
- MCDataFragment *DF = getStreamer().getOrCreateDataFragment();
+ MCFragment *DF = getStreamer().getOrCreateDataFragment();
DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
Mips::fixup_Mips_DTPREL64));
DF->appendContents(8, 0);
}
void MipsTargetELFStreamer::emitTPRel32Value(const MCExpr *Value) {
- MCDataFragment *DF = getStreamer().getOrCreateDataFragment();
+ MCFragment *DF = getStreamer().getOrCreateDataFragment();
DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
Mips::fixup_Mips_TPREL32));
DF->appendContents(4, 0);
}
void MipsTargetELFStreamer::emitTPRel64Value(const MCExpr *Value) {
- MCDataFragment *DF = getStreamer().getOrCreateDataFragment();
+ MCFragment *DF = getStreamer().getOrCreateDataFragment();
DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
Mips::fixup_Mips_TPREL64));
DF->appendContents(8, 0);
diff --git a/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp b/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp
index b0de8dacf691..4633df5d1b6a 100644
--- a/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp
+++ b/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp
@@ -781,7 +781,7 @@ bool MicroMipsSizeReduce::runOnMachineFunction(MachineFunction &MF) {
Subtarget->hasMips32r6())
return false;
- MipsII = static_cast<const MipsInstrInfo *>(Subtarget->getInstrInfo());
+ MipsII = Subtarget->getInstrInfo();
bool Modified = false;
MachineFunction::iterator I = MF.begin(), E = MF.end();
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index 87e06a6d3c08..ca0331006be7 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -16,7 +16,6 @@
#include "MCTargetDesc/MipsBaseInfo.h"
#include "MCTargetDesc/MipsInstPrinter.h"
#include "MCTargetDesc/MipsMCAsmInfo.h"
-#include "MCTargetDesc/MipsMCNaCl.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "MCTargetDesc/MipsTargetStreamer.h"
#include "Mips.h"
@@ -87,10 +86,6 @@ bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
StubsNeeded.insert(I);
MCP = MF.getConstantPool();
- // In NaCl, all indirect jump targets must be aligned to bundle size.
- if (Subtarget->isTargetNaCl())
- NaClAlignIndirectJumpTargets(MF);
-
AsmPrinter::runOnMachineFunction(MF);
emitXRayTable();
@@ -171,7 +166,7 @@ static void emitDirectiveRelocJalr(const MachineInstr &MI,
OutStreamer.emitRelocDirective(
*OffsetExpr,
Subtarget.inMicroMipsMode() ? "R_MICROMIPS_JALR" : "R_MIPS_JALR",
- CaleeExpr, SMLoc(), *TM.getMCSubtargetInfo());
+ CaleeExpr);
OutStreamer.emitLabel(OffsetLabel);
return;
}
@@ -401,11 +396,6 @@ const char *MipsAsmPrinter::getCurrentABIString() const {
void MipsAsmPrinter::emitFunctionEntryLabel() {
MipsTargetStreamer &TS = getTargetStreamer();
- // NaCl sandboxing requires that indirect call instructions are masked.
- // This means that function entry points should be bundle-aligned.
- if (Subtarget->isTargetNaCl())
- emitAlignment(std::max(MF->getAlignment(), MIPS_NACL_BUNDLE_ALIGN));
-
if (Subtarget->inMicroMipsMode()) {
TS.emitDirectiveSetMicroMips();
TS.setUsesMicroMips();
@@ -1263,27 +1253,6 @@ void MipsAsmPrinter::emitDebugValue(const MCExpr *Value, unsigned Size) const {
AsmPrinter::emitDebugValue(Value, Size);
}
-// Align all targets of indirect branches on bundle size. Used only if target
-// is NaCl.
-void MipsAsmPrinter::NaClAlignIndirectJumpTargets(MachineFunction &MF) {
- // Align all blocks that are jumped to through jump table.
- if (MachineJumpTableInfo *JtInfo = MF.getJumpTableInfo()) {
- const std::vector<MachineJumpTableEntry> &JT = JtInfo->getJumpTables();
- for (const auto &I : JT) {
- const std::vector<MachineBasicBlock *> &MBBs = I.MBBs;
-
- for (MachineBasicBlock *MBB : MBBs)
- MBB->setAlignment(MIPS_NACL_BUNDLE_ALIGN);
- }
- }
-
- // If basic block address is taken, block can be target of indirect branch.
- for (auto &MBB : MF) {
- if (MBB.hasAddressTaken())
- MBB.setAlignment(MIPS_NACL_BUNDLE_ALIGN);
- }
-}
-
bool MipsAsmPrinter::isLongBranchPseudo(int Opcode) const {
return (Opcode == Mips::LONG_BRANCH_LUi
|| Opcode == Mips::LONG_BRANCH_LUi2Op
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.h b/llvm/lib/Target/Mips/MipsAsmPrinter.h
index bbaa3b3cef9d..8b2fb32dc552 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.h
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.h
@@ -112,8 +112,6 @@ private:
void EmitFPCallStub(const char *, const Mips16HardFloatInfo::FuncSignature *);
- void NaClAlignIndirectJumpTargets(MachineFunction &MF);
-
bool isLongBranchPseudo(int Opcode) const;
public:
diff --git a/llvm/lib/Target/Mips/MipsBranchExpansion.cpp b/llvm/lib/Target/Mips/MipsBranchExpansion.cpp
index 6e897fe87668..3720c936643b 100644
--- a/llvm/lib/Target/Mips/MipsBranchExpansion.cpp
+++ b/llvm/lib/Target/Mips/MipsBranchExpansion.cpp
@@ -74,7 +74,6 @@
#include "MCTargetDesc/MipsABIInfo.h"
#include "MCTargetDesc/MipsBaseInfo.h"
-#include "MCTargetDesc/MipsMCNaCl.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "Mips.h"
#include "MipsInstrInfo.h"
@@ -518,27 +517,19 @@ void MipsBranchExpansion::expandToLongBranch(MBBInfo &I) {
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LW), Mips::RA)
.addReg(Mips::SP)
.addImm(0);
- if (STI->isTargetNaCl())
- // Bundle-align the target of indirect branch JR.
- TgtMBB->setAlignment(MIPS_NACL_BUNDLE_ALIGN);
- // In NaCl, modifying the sp is not allowed in branch delay slot.
// For MIPS32R6, we can skip using a delay slot branch.
bool hasDelaySlot = buildProperJumpMI(BalTgtMBB, Pos, DL);
- if (STI->isTargetNaCl() || !hasDelaySlot) {
+ if (!hasDelaySlot) {
BuildMI(*BalTgtMBB, std::prev(Pos), DL, TII->get(Mips::ADDiu), Mips::SP)
.addReg(Mips::SP)
.addImm(8);
}
if (hasDelaySlot) {
- if (STI->isTargetNaCl()) {
- TII->insertNop(*BalTgtMBB, Pos, DL);
- } else {
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
- .addReg(Mips::SP)
- .addImm(8);
- }
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
+ .addReg(Mips::SP)
+ .addImm(8);
BalTgtMBB->rbegin()->bundleWithPred();
}
} else {
@@ -899,14 +890,6 @@ bool MipsBranchExpansion::handlePossibleLongBranch() {
(Br->isUnconditionalBranch() && IsPIC))) {
int64_t Offset = computeOffset(&*Br);
- if (STI->isTargetNaCl()) {
- // The offset calculation does not include sandboxing instructions
- // that will be added later in the MC layer. Since at this point we
- // don't know the exact amount of code that "sandboxing" will add, we
- // conservatively estimate that code will not grow more than 100%.
- Offset *= 2;
- }
-
if (ForceLongBranchFirstPass ||
!TII->isBranchOffsetInRange(Br->getOpcode(), Offset)) {
MBBInfos[I].Offset = Offset;
@@ -941,7 +924,7 @@ bool MipsBranchExpansion::runOnMachineFunction(MachineFunction &MF) {
IsPIC = TM.isPositionIndependent();
ABI = static_cast<const MipsTargetMachine &>(TM).getABI();
STI = &MF.getSubtarget<MipsSubtarget>();
- TII = static_cast<const MipsInstrInfo *>(STI->getInstrInfo());
+ TII = STI->getInstrInfo();
if (IsPIC && ABI.IsO32() &&
MF.getInfo<MipsFunctionInfo>()->globalBaseRegSet())
diff --git a/llvm/lib/Target/Mips/MipsCallingConv.td b/llvm/lib/Target/Mips/MipsCallingConv.td
index 3c60114f507b..39e184a6303a 100644
--- a/llvm/lib/Target/Mips/MipsCallingConv.td
+++ b/llvm/lib/Target/Mips/MipsCallingConv.td
@@ -267,15 +267,8 @@ def CC_Mips_FastCC : CallingConv<[
// Integer arguments are passed in integer registers. All scratch registers,
// except for AT, V0 and T9, are available to be used as argument registers.
- CCIfType<[i32], CCIfSubtargetNot<"isTargetNaCl()",
- CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, T6, T7, T8, V1]>>>,
-
- // In NaCl, T6, T7 and T8 are reserved and not available as argument
- // registers for fastcc. T6 contains the mask for sandboxing control flow
- // (indirect jumps and calls). T7 contains the mask for sandboxing memory
- // accesses (loads and stores). T8 contains the thread pointer.
- CCIfType<[i32], CCIfSubtarget<"isTargetNaCl()",
- CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, V1]>>>,
+ CCIfType<[i32],
+ CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, T6, T7, T8, V1]>>,
// f32 arguments are passed in single-precision floating pointer registers.
CCIfType<[f32], CCIfSubtarget<"useOddSPReg()",
diff --git a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
index b13394a607f6..dfbbcbe60219 100644
--- a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -10,7 +10,6 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/MipsMCNaCl.h"
#include "Mips.h"
#include "MipsInstrInfo.h"
#include "MipsSubtarget.h"
@@ -727,18 +726,6 @@ bool MipsDelaySlotFiller::searchRange(MachineBasicBlock &MBB, IterTy Begin,
continue;
const MipsSubtarget &STI = MBB.getParent()->getSubtarget<MipsSubtarget>();
- if (STI.isTargetNaCl()) {
- // In NaCl, instructions that must be masked are forbidden in delay slots.
- // We only check for loads, stores and SP changes. Calls, returns and
- // branches are not checked because non-NaCl targets never put them in
- // delay slots.
- unsigned AddrIdx;
- if ((isBasePlusOffsetMemoryAccess(CurrI->getOpcode(), &AddrIdx) &&
- baseRegNeedsLoadStoreMask(CurrI->getOperand(AddrIdx).getReg())) ||
- CurrI->modifiesRegister(Mips::SP, STI.getRegisterInfo()))
- continue;
- }
-
bool InMicroMipsMode = STI.inMicroMipsMode();
const MipsInstrInfo *TII = STI.getInstrInfo();
unsigned Opcode = (*Slot).getOpcode();
diff --git a/llvm/lib/Target/Mips/MipsInstrFPU.td b/llvm/lib/Target/Mips/MipsInstrFPU.td
index 14590ddacfcb..4ca329d21498 100644
--- a/llvm/lib/Target/Mips/MipsInstrFPU.td
+++ b/llvm/lib/Target/Mips/MipsInstrFPU.td
@@ -622,15 +622,13 @@ let AdditionalPredicates = [NotInMicroMips] in {
// Indexed loads and stores.
// Base register + offset register addressing mode (indicated by "x" in the
-// instruction mnemonic) is disallowed under NaCl.
-let AdditionalPredicates = [IsNotNaCl] in {
- def LWXC1 : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>, LWXC1_FM<0>,
- INSN_MIPS4_32R2_NOT_32R6_64R6;
- def SWXC1 : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>, SWXC1_FM<8>,
- INSN_MIPS4_32R2_NOT_32R6_64R6;
-}
+// instruction mnemonic).
+def LWXC1 : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>, LWXC1_FM<0>,
+ INSN_MIPS4_32R2_NOT_32R6_64R6;
+def SWXC1 : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>, SWXC1_FM<8>,
+ INSN_MIPS4_32R2_NOT_32R6_64R6;
-let AdditionalPredicates = [NotInMicroMips, IsNotNaCl] in {
+let AdditionalPredicates = [NotInMicroMips] in {
def LDXC1 : LWXC1_FT<"ldxc1", AFGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>,
INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
def SDXC1 : SWXC1_FT<"sdxc1", AFGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>,
@@ -646,14 +644,14 @@ let DecoderNamespace="MipsFP64" in {
// Load/store doubleword indexed unaligned.
// FIXME: This instruction should not be defined for FGR_32.
-let AdditionalPredicates = [IsNotNaCl, NotInMicroMips] in {
+let AdditionalPredicates = [NotInMicroMips] in {
def LUXC1 : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>,
INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32;
def SUXC1 : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>,
INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32;
}
-let AdditionalPredicates = [IsNotNaCl, NotInMicroMips],
+let AdditionalPredicates = [NotInMicroMips],
DecoderNamespace="MipsFP64" in {
def LUXC164 : LWXC1_FT<"luxc1", FGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>,
INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_64;
diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.td b/llvm/lib/Target/Mips/MipsInstrInfo.td
index b6125b972717..a124e84e9ca5 100644
--- a/llvm/lib/Target/Mips/MipsInstrInfo.td
+++ b/llvm/lib/Target/Mips/MipsInstrInfo.td
@@ -236,7 +236,6 @@ def NotInMicroMips : Predicate<"!Subtarget->inMicroMipsMode()">,
AssemblerPredicate<(all_of (not FeatureMicroMips))>;
def IsLE : Predicate<"Subtarget->isLittle()">;
def IsBE : Predicate<"!Subtarget->isLittle()">;
-def IsNotNaCl : Predicate<"!Subtarget->isTargetNaCl()">;
def UseTCCInDIV : AssemblerPredicate<(all_of FeatureUseTCCInDIV)>;
def HasEVA : Predicate<"Subtarget->hasEVA()">,
AssemblerPredicate<(all_of FeatureEVA)>;
diff --git a/llvm/lib/Target/Mips/MipsRegisterInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
index ae4b2377ad21..539288e8da59 100644
--- a/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -162,13 +162,6 @@ getReservedRegs(const MachineFunction &MF) const {
for (MCPhysReg R : ReservedGPR32)
Reserved.set(R);
- // Reserve registers for the NaCl sandbox.
- if (Subtarget.isTargetNaCl()) {
- Reserved.set(Mips::T6); // Reserved for control flow mask.
- Reserved.set(Mips::T7); // Reserved for memory access mask.
- Reserved.set(Mips::T8); // Reserved for thread pointer.
- }
-
for (MCPhysReg R : ReservedGPR64)
Reserved.set(R);
diff --git a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
index d775f5a16bcd..f08704a7e799 100644
--- a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -403,8 +403,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
const MipsSEInstrInfo &TII =
*static_cast<const MipsSEInstrInfo *>(STI.getInstrInfo());
- const MipsRegisterInfo &RegInfo =
- *static_cast<const MipsRegisterInfo *>(STI.getRegisterInfo());
+ const MipsRegisterInfo &RegInfo = *STI.getRegisterInfo();
MachineBasicBlock::iterator MBBI = MBB.begin();
DebugLoc dl;
@@ -658,8 +657,7 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
const MipsSEInstrInfo &TII =
*static_cast<const MipsSEInstrInfo *>(STI.getInstrInfo());
- const MipsRegisterInfo &RegInfo =
- *static_cast<const MipsRegisterInfo *>(STI.getRegisterInfo());
+ const MipsRegisterInfo &RegInfo = *STI.getRegisterInfo();
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
MipsABIInfo ABI = STI.getABI();
diff --git a/llvm/lib/Target/Mips/MipsSubtarget.h b/llvm/lib/Target/Mips/MipsSubtarget.h
index bb026f565512..52f892a160c3 100644
--- a/llvm/lib/Target/Mips/MipsSubtarget.h
+++ b/llvm/lib/Target/Mips/MipsSubtarget.h
@@ -355,7 +355,6 @@ public:
bool os16() const { return Os16; }
- bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
bool isTargetWindows() const { return TargetTriple.isOSWindows(); }
bool isXRaySupported() const override { return true; }
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 443db4391a52..8eec91562ecf 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -268,8 +268,8 @@ void NVPTXInstPrinter::printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O,
llvm_unreachable("Empty Modifier");
}
-void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
- raw_ostream &O, StringRef Modifier) {
+void NVPTXInstPrinter::printAtomicCode(const MCInst *MI, int OpNum,
+ raw_ostream &O, StringRef Modifier) {
const MCOperand &MO = MI->getOperand(OpNum);
int Imm = (int)MO.getImm();
if (Modifier == "sem") {
@@ -286,22 +286,24 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
case NVPTX::Ordering::Release:
O << ".release";
return;
+ case NVPTX::Ordering::AcquireRelease:
+ O << ".acq_rel";
+ return;
+ case NVPTX::Ordering::SequentiallyConsistent:
+ O << ".seq_cst";
+ return;
case NVPTX::Ordering::Volatile:
O << ".volatile";
return;
case NVPTX::Ordering::RelaxedMMIO:
O << ".mmio.relaxed";
return;
- default:
- report_fatal_error(formatv(
- "NVPTX LdStCode Printer does not support \"{}\" sem modifier. "
- "Loads/Stores cannot be AcquireRelease or SequentiallyConsistent.",
- OrderingToString(Ordering)));
}
} else if (Modifier == "scope") {
auto S = NVPTX::Scope(Imm);
switch (S) {
case NVPTX::Scope::Thread:
+ case NVPTX::Scope::DefaultDevice:
return;
case NVPTX::Scope::System:
O << ".sys";
@@ -316,9 +318,9 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
O << ".gpu";
return;
}
- report_fatal_error(
- formatv("NVPTX LdStCode Printer does not support \"{}\" sco modifier.",
- ScopeToString(S)));
+ report_fatal_error(formatv(
+ "NVPTX AtomicCode Printer does not support \"{}\" scope modifier.",
+ ScopeToString(S)));
} else if (Modifier == "addsp") {
auto A = NVPTX::AddressSpace(Imm);
switch (A) {
@@ -334,7 +336,7 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
return;
}
report_fatal_error(formatv(
- "NVPTX LdStCode Printer does not support \"{}\" addsp modifier.",
+ "NVPTX AtomicCode Printer does not support \"{}\" addsp modifier.",
AddressSpaceToString(A)));
} else if (Modifier == "sign") {
switch (Imm) {
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
index 193c436939f6..c3ff3469150e 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
@@ -40,8 +40,8 @@ public:
StringRef Modifier = {});
void printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O,
StringRef Modifier = {});
- void printLdStCode(const MCInst *MI, int OpNum, raw_ostream &O,
- StringRef Modifier = {});
+ void printAtomicCode(const MCInst *MI, int OpNum, raw_ostream &O,
+ StringRef Modifier = {});
void printMmaCode(const MCInst *MI, int OpNum, raw_ostream &O,
StringRef Modifier = {});
void printMemOperand(const MCInst *MI, int OpNum, raw_ostream &O,
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 15997bc3878d..77a0e03d4075 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -164,7 +164,6 @@ enum Ordering : OrderingUnderlyingType {
(OrderingUnderlyingType)AtomicOrdering::SequentiallyConsistent,
Volatile = SequentiallyConsistent + 1,
RelaxedMMIO = Volatile + 1,
- LASTORDERING = RelaxedMMIO
};
using ScopeUnderlyingType = unsigned int;
@@ -174,7 +173,8 @@ enum Scope : ScopeUnderlyingType {
Cluster = 2,
Device = 3,
System = 4,
- LASTSCOPE = System
+ DefaultDevice = 5, // For SM < 70: denotes PTX op implicit/default .gpu scope
+ LASTSCOPE = DefaultDevice
};
using AddressSpaceUnderlyingType = unsigned int;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index ae73d8da79f8..65e7c5677454 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -494,7 +494,7 @@ bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
return true;
}
-static std::optional<unsigned> convertAS(unsigned AS) {
+static std::optional<NVPTX::AddressSpace> convertAS(unsigned AS) {
switch (AS) {
case llvm::ADDRESS_SPACE_LOCAL:
return NVPTX::AddressSpace::Local;
@@ -515,11 +515,42 @@ static std::optional<unsigned> convertAS(unsigned AS) {
}
}
-static unsigned int getCodeAddrSpace(const MemSDNode *N) {
+NVPTX::AddressSpace NVPTXDAGToDAGISel::getAddrSpace(const MemSDNode *N) {
return convertAS(N->getMemOperand()->getAddrSpace())
.value_or(NVPTX::AddressSpace::Generic);
}
+NVPTX::Ordering NVPTXDAGToDAGISel::getMemOrder(const MemSDNode *N) const {
+ // No "sem" orderings for SM/PTX versions which do not support memory ordering
+ if (!Subtarget->hasMemoryOrdering())
+ return NVPTX::Ordering::NotAtomic;
+ auto Ordering = N->getMergedOrdering();
+ switch (Ordering) {
+ case AtomicOrdering::NotAtomic:
+ return NVPTX::Ordering::NotAtomic;
+ case AtomicOrdering::Unordered:
+ case AtomicOrdering::Monotonic:
+ return NVPTX::Ordering::Relaxed;
+ case AtomicOrdering::Acquire:
+ return NVPTX::Ordering::Acquire;
+ case AtomicOrdering::Release:
+ return NVPTX::Ordering::Release;
+ case AtomicOrdering::AcquireRelease:
+ return NVPTX::Ordering::AcquireRelease;
+ case AtomicOrdering::SequentiallyConsistent:
+ return NVPTX::Ordering::SequentiallyConsistent;
+ }
+ llvm_unreachable("Invalid atomic ordering");
+}
+
+NVPTX::Scope NVPTXDAGToDAGISel::getAtomicScope(const MemSDNode *N) const {
+ // No "scope" modifier for SM/PTX versions which do not support scoped atomics
+ // Functionally, these atomics are at device scope
+ if (!Subtarget->hasAtomScope())
+ return NVPTX::Scope::DefaultDevice;
+ return Scopes[N->getSyncScopeID()];
+}
+
namespace {
struct OperationOrderings {
@@ -532,7 +563,7 @@ struct OperationOrderings {
static OperationOrderings
getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
AtomicOrdering Ordering = N->getSuccessOrdering();
- auto CodeAddrSpace = getCodeAddrSpace(N);
+ auto CodeAddrSpace = NVPTXDAGToDAGISel::getAddrSpace(N);
bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
@@ -756,7 +787,7 @@ NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
}
static bool canLowerToLDG(const MemSDNode &N, const NVPTXSubtarget &Subtarget,
- unsigned CodeAddrSpace) {
+ NVPTX::AddressSpace CodeAddrSpace) {
// We use ldg (i.e. ld.global.nc) for invariant loads from the global address
// space.
return Subtarget.hasLDG() && CodeAddrSpace == NVPTX::AddressSpace::Global &&
@@ -788,6 +819,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
: NVPTX::INT_MEMBAR_GL;
case NVPTX::Scope::Thread:
+ case NVPTX::Scope::DefaultDevice:
report_fatal_error(
formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
ScopeToString(S)));
@@ -807,6 +839,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
: NVPTX::INT_MEMBAR_GL;
case NVPTX::Scope::Thread:
+ case NVPTX::Scope::DefaultDevice:
report_fatal_error(
formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
ScopeToString(S)));
@@ -826,6 +859,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
: NVPTX::INT_MEMBAR_GL;
case NVPTX::Scope::Thread:
+ case NVPTX::Scope::DefaultDevice:
report_fatal_error(
formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
ScopeToString(S)));
@@ -846,6 +880,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
: NVPTX::INT_MEMBAR_GL;
case NVPTX::Scope::Thread:
+ case NVPTX::Scope::DefaultDevice:
report_fatal_error(formatv("Unsupported scope \"{}\" for seq_cst fence.",
ScopeToString(S)));
}
@@ -1025,7 +1060,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
const MVT LoadedVT = LoadedEVT.getSimpleVT();
// Address Space Setting
- const unsigned CodeAddrSpace = getCodeAddrSpace(LD);
+ const auto CodeAddrSpace = getAddrSpace(LD);
if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
return tryLDG(LD);
@@ -1097,7 +1132,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
const MVT MemVT = MemEVT.getSimpleVT();
// Address Space Setting
- const unsigned CodeAddrSpace = getCodeAddrSpace(LD);
+ const auto CodeAddrSpace = getAddrSpace(LD);
if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
return tryLDG(LD);
@@ -1313,7 +1348,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
return false;
// Address Space Setting
- const unsigned CodeAddrSpace = getCodeAddrSpace(ST);
+ const auto CodeAddrSpace = getAddrSpace(ST);
SDLoc DL(ST);
SDValue Chain = ST->getChain();
@@ -1363,7 +1398,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
assert(StoreVT.isSimple() && "Store value is not simple");
// Address Space Setting
- const unsigned CodeAddrSpace = getCodeAddrSpace(ST);
+ const auto CodeAddrSpace = getAddrSpace(ST);
if (CodeAddrSpace == NVPTX::AddressSpace::Const) {
report_fatal_error("Cannot store to pointer that points to constant "
"memory space");
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 88e5328ff69c..b99b4ef2d307 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -100,6 +100,8 @@ private:
inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
}
+ NVPTX::Ordering getMemOrder(const MemSDNode *N) const;
+ NVPTX::Scope getAtomicScope(const MemSDNode *N) const;
bool SelectADDR(SDValue Addr, SDValue &Base, SDValue &Offset);
SDValue getPTXCmpMode(const CondCodeSDNode &CondCode);
@@ -114,6 +116,9 @@ private:
std::pair<NVPTX::Ordering, NVPTX::Scope>
insertMemoryInstructionFence(SDLoc DL, SDValue &Chain, MemSDNode *N);
NVPTX::Scope getOperationScope(MemSDNode *N, NVPTX::Ordering O) const;
+
+public:
+ static NVPTX::AddressSpace getAddrSpace(const MemSDNode *N);
};
class NVPTXDAGToDAGISelLegacy : public SelectionDAGISelLegacy {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 3d010e04824c..7aa06f9079b0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -57,6 +57,7 @@
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/NVPTXAddrSpace.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -1047,9 +1048,12 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
MVT::v32i32, MVT::v64i32, MVT::v128i32},
Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
- // Enable custom lowering for the i128 bit operand with clusterlaunchcontrol
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i128, Custom);
+ // Enable custom lowering for the following:
+ // * MVT::i128 - clusterlaunchcontrol
+ // * MVT::i32 - prmt
+ // * MVT::Other - internal.addrspace.wrap
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::i32, MVT::i128, MVT::Other},
+ Custom);
}
const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -1087,7 +1091,6 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(NVPTXISD::StoreV8)
MAKE_CASE(NVPTXISD::FSHL_CLAMP)
MAKE_CASE(NVPTXISD::FSHR_CLAMP)
- MAKE_CASE(NVPTXISD::BFE)
MAKE_CASE(NVPTXISD::BFI)
MAKE_CASE(NVPTXISD::PRMT)
MAKE_CASE(NVPTXISD::FCOPYSIGN)
@@ -2060,6 +2063,19 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
}
+static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL,
+ SelectionDAG &DAG,
+ unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
+ return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32,
+ {A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)});
+}
+
+static SDValue getPRMT(SDValue A, SDValue B, uint64_t Selector, SDLoc DL,
+ SelectionDAG &DAG,
+ unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
+ return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode);
+}
+
SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
// Handle bitcasting from v2i8 without hitting the default promotion
// strategy which goes through stack memory.
@@ -2111,15 +2127,12 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32);
R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32);
}
- return DAG.getNode(
- NVPTXISD::PRMT, DL, MVT::v4i8,
- {L, R, DAG.getConstant(SelectionValue, DL, MVT::i32),
- DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
+ return getPRMT(L, R, SelectionValue, DL, DAG);
};
auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340);
auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340);
auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410);
- return DAG.getNode(ISD::BITCAST, DL, VT, PRMT3210);
+ return DAG.getBitcast(VT, PRMT3210);
}
// Get value or the Nth operand as an APInt(32). Undef values treated as 0.
@@ -2173,14 +2186,17 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
EVT VectorVT = Vector.getValueType();
if (VectorVT == MVT::v4i8) {
- SDValue BFE =
- DAG.getNode(NVPTXISD::BFE, DL, MVT::i32,
- {Vector,
- DAG.getNode(ISD::MUL, DL, MVT::i32,
- DAG.getZExtOrTrunc(Index, DL, MVT::i32),
- DAG.getConstant(8, DL, MVT::i32)),
- DAG.getConstant(8, DL, MVT::i32)});
- return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0));
+ SDValue Selector = DAG.getNode(ISD::OR, DL, MVT::i32,
+ DAG.getZExtOrTrunc(Index, DL, MVT::i32),
+ DAG.getConstant(0x7770, DL, MVT::i32));
+ SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, Vector),
+ DAG.getConstant(0, DL, MVT::i32), Selector, DL, DAG);
+ SDValue Ext = DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0));
+ SDNodeFlags Flags;
+ Flags.setNoSignedWrap(Ext.getScalarValueSizeInBits() > 8);
+ Flags.setNoUnsignedWrap(Ext.getScalarValueSizeInBits() >= 8);
+ Ext->setFlags(Flags);
+ return Ext;
}
// Constant index will be matched by tablegen.
@@ -2242,9 +2258,9 @@ SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
}
SDLoc DL(Op);
- return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2,
- DAG.getConstant(Selector, DL, MVT::i32),
- DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32));
+ SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, V1),
+ DAG.getBitcast(MVT::i32, V2), Selector, DL, DAG);
+ return DAG.getBitcast(Op.getValueType(), PRMT);
}
/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
@@ -2729,10 +2745,46 @@ static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op,
{TryCancelResponse0, TryCancelResponse1});
}
+static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG) {
+ const unsigned Mode = [&]() {
+ switch (Op->getConstantOperandVal(0)) {
+ case Intrinsic::nvvm_prmt:
+ return NVPTX::PTXPrmtMode::NONE;
+ case Intrinsic::nvvm_prmt_b4e:
+ return NVPTX::PTXPrmtMode::B4E;
+ case Intrinsic::nvvm_prmt_ecl:
+ return NVPTX::PTXPrmtMode::ECL;
+ case Intrinsic::nvvm_prmt_ecr:
+ return NVPTX::PTXPrmtMode::ECR;
+ case Intrinsic::nvvm_prmt_f4e:
+ return NVPTX::PTXPrmtMode::F4E;
+ case Intrinsic::nvvm_prmt_rc16:
+ return NVPTX::PTXPrmtMode::RC16;
+ case Intrinsic::nvvm_prmt_rc8:
+ return NVPTX::PTXPrmtMode::RC8;
+ default:
+ llvm_unreachable("unsupported/unhandled intrinsic");
+ }
+ }();
+ SDLoc DL(Op);
+ SDValue A = Op->getOperand(1);
+ SDValue B = Op.getNumOperands() == 4 ? Op.getOperand(2)
+ : DAG.getConstant(0, DL, MVT::i32);
+ SDValue Selector = (Op->op_end() - 1)->get();
+ return getPRMT(A, B, Selector, DL, DAG, Mode);
+}
static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG) {
switch (Op->getConstantOperandVal(0)) {
default:
return Op;
+ case Intrinsic::nvvm_prmt:
+ case Intrinsic::nvvm_prmt_b4e:
+ case Intrinsic::nvvm_prmt_ecl:
+ case Intrinsic::nvvm_prmt_ecr:
+ case Intrinsic::nvvm_prmt_f4e:
+ case Intrinsic::nvvm_prmt_rc16:
+ case Intrinsic::nvvm_prmt_rc8:
+ return lowerPrmtIntrinsic(Op, DAG);
case Intrinsic::nvvm_internal_addrspace_wrap:
return Op.getOperand(1);
case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
@@ -5271,31 +5323,6 @@ static SDValue PerformANDCombine(SDNode *N,
SDValue AExt;
- // Convert BFE-> truncate i16 -> and 255
- // To just BFE-> truncate i16, as the value already has all the bits in the
- // right places.
- if (Val.getOpcode() == ISD::TRUNCATE) {
- SDValue BFE = Val.getOperand(0);
- if (BFE.getOpcode() != NVPTXISD::BFE)
- return SDValue();
-
- ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0));
- if (!BFEBits)
- return SDValue();
- uint64_t BFEBitsVal = BFEBits->getZExtValue();
-
- ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
- if (!MaskCnst) {
- // Not an AND with a constant
- return SDValue();
- }
- uint64_t MaskVal = MaskCnst->getZExtValue();
-
- if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1)
- return SDValue();
- // If we get here, the AND is unnecessary. Just replace it with the trunc
- DCI.CombineTo(N, Val, false);
- }
// Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
if (Val.getOpcode() == ISD::ANY_EXTEND) {
AExt = Val;
@@ -5800,11 +5827,10 @@ PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
SDLoc DL(N);
auto &DAG = DCI.DAG;
- auto PRMT = DAG.getNode(
- NVPTXISD::PRMT, DL, MVT::v4i8,
- {Op0, Op1, DAG.getConstant((Op1Bytes << 8) | Op0Bytes, DL, MVT::i32),
- DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
- return DAG.getNode(ISD::BITCAST, DL, VT, PRMT);
+ auto PRMT =
+ getPRMT(DAG.getBitcast(MVT::i32, Op0), DAG.getBitcast(MVT::i32, Op1),
+ (Op1Bytes << 8) | Op0Bytes, DL, DAG);
+ return DAG.getBitcast(VT, PRMT);
}
static SDValue combineADDRSPACECAST(SDNode *N,
@@ -5822,47 +5848,120 @@ static SDValue combineADDRSPACECAST(SDNode *N,
return SDValue();
}
+// Given a constant selector value and a prmt mode, return the selector value
+// normalized to the generic prmt mode. See the PTX ISA documentation for more
+// details:
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
+//
+// The generic mode (NONE) returns the selector unchanged. Every other mode
+// builds a 32-bit selector whose four low nibbles are derived from the low two
+// bits of the input selector.
+static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) {
+ if (Mode == NVPTX::PTXPrmtMode::NONE)
+ return Selector;
+
+ // Only the low two bits of the selector are consulted below.
+ const unsigned V = Selector.trunc(2).getZExtValue();
+
+ // Pack four per-byte selectors into nibbles; S0 lands in the lowest nibble.
+ const auto GetSelector = [](unsigned S0, unsigned S1, unsigned S2,
+ unsigned S3) {
+ return APInt(32, S0 | (S1 << 4) | (S2 << 8) | (S3 << 12));
+ };
+
+ switch (Mode) {
+ case NVPTX::PTXPrmtMode::F4E:
+ return GetSelector(V, V + 1, V + 2, V + 3);
+ case NVPTX::PTXPrmtMode::B4E:
+ // The "& 7" wraps the decreasing indices into the 0..7 byte range.
+ return GetSelector(V, (V - 1) & 7, (V - 2) & 7, (V - 3) & 7);
+ case NVPTX::PTXPrmtMode::RC8:
+ return GetSelector(V, V, V, V);
+ case NVPTX::PTXPrmtMode::ECL:
+ return GetSelector(V, std::max(V, 1U), std::max(V, 2U), 3U);
+ case NVPTX::PTXPrmtMode::ECR:
+ return GetSelector(0, std::min(V, 1U), std::min(V, 2U), V);
+ case NVPTX::PTXPrmtMode::RC16: {
+ // Only bit 0 of V matters here: V1 is 0 or 2, the first byte index of the
+ // pair (V1, V1 + 1) that is repeated twice.
+ unsigned V1 = (V & 1) << 1;
+ return GetSelector(V1, V1 + 1, V1, V1 + 1);
+ }
+ default:
+ llvm_unreachable("Invalid PRMT mode");
+ }
+}
+
+// Evaluate a PRMT on constant inputs and return the 32-bit result.
+// A and B are the two sources, Selector the raw selector operand and Mode the
+// PTXPrmtMode; the selector is first normalized with getPRMTSelector().
+static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) {
+ // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
+ APInt BitField = B.concat(A);
+ APInt SelectorVal = getPRMTSelector(Selector, Mode);
+ APInt Result(32, 0);
+ // One selector nibble per result byte, lowest nibble/byte first.
+ for (unsigned I : llvm::seq(4U)) {
+ APInt Sel = SelectorVal.extractBits(4, I * 4);
+ // Low three bits: index of the source byte within {b, a}.
+ unsigned Idx = Sel.getLoBits(3).getZExtValue();
+ // Top bit: replace the byte with copies of its sign bit.
+ unsigned Sign = Sel.getHiBits(1).getZExtValue();
+ APInt Byte = BitField.extractBits(8, Idx * 8);
+ if (Sign)
+ // Arithmetic shift of the 8-bit value by its full width.
+ Byte = Byte.ashr(8);
+ Result.insertBits(Byte, I * 8);
+ }
+ return Result;
+}
+
+// DAG combine for NVPTXISD::PRMT nodes. Does nothing at CodeGenOptLevel::None.
+// Returns the folded constant when both sources and the selector are
+// constants, and an empty SDValue otherwise.
+static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ CodeGenOptLevel OptLevel) {
+ if (OptLevel == CodeGenOptLevel::None)
+ return SDValue();
+
+ // Constant fold PRMT
+ // Operands 0-2 are the two sources and the selector; operand 3 is read as
+ // the mode value.
+ if (isa<ConstantSDNode>(N->getOperand(0)) &&
+ isa<ConstantSDNode>(N->getOperand(1)) &&
+ isa<ConstantSDNode>(N->getOperand(2)))
+ return DCI.DAG.getConstant(computePRMT(N->getConstantOperandAPInt(0),
+ N->getConstantOperandAPInt(1),
+ N->getConstantOperandAPInt(2),
+ N->getConstantOperandVal(3)),
+ SDLoc(N), N->getValueType(0));
+
+ return SDValue();
+}
+
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
switch (N->getOpcode()) {
- default: break;
- case ISD::ADD:
- return PerformADDCombine(N, DCI, OptLevel);
- case ISD::FADD:
- return PerformFADDCombine(N, DCI, OptLevel);
- case ISD::MUL:
- return PerformMULCombine(N, DCI, OptLevel);
- case ISD::SHL:
- return PerformSHLCombine(N, DCI, OptLevel);
- case ISD::AND:
- return PerformANDCombine(N, DCI);
- case ISD::UREM:
- case ISD::SREM:
- return PerformREMCombine(N, DCI, OptLevel);
- case ISD::SETCC:
- return PerformSETCCCombine(N, DCI, STI.getSmVersion());
- case ISD::LOAD:
- case NVPTXISD::LoadParamV2:
- case NVPTXISD::LoadV2:
- case NVPTXISD::LoadV4:
- return combineUnpackingMovIntoLoad(N, DCI);
- case NVPTXISD::StoreParam:
- case NVPTXISD::StoreParamV2:
- case NVPTXISD::StoreParamV4:
- return PerformStoreParamCombine(N, DCI);
- case ISD::STORE:
- case NVPTXISD::StoreV2:
- case NVPTXISD::StoreV4:
- return PerformStoreCombine(N, DCI);
- case ISD::EXTRACT_VECTOR_ELT:
- return PerformEXTRACTCombine(N, DCI);
- case ISD::VSELECT:
- return PerformVSELECTCombine(N, DCI);
- case ISD::BUILD_VECTOR:
- return PerformBUILD_VECTORCombine(N, DCI);
- case ISD::ADDRSPACECAST:
- return combineADDRSPACECAST(N, DCI);
+ default:
+ break;
+ case ISD::ADD:
+ return PerformADDCombine(N, DCI, OptLevel);
+ case ISD::ADDRSPACECAST:
+ return combineADDRSPACECAST(N, DCI);
+ case ISD::AND:
+ return PerformANDCombine(N, DCI);
+ case ISD::BUILD_VECTOR:
+ return PerformBUILD_VECTORCombine(N, DCI);
+ case ISD::EXTRACT_VECTOR_ELT:
+ return PerformEXTRACTCombine(N, DCI);
+ case ISD::FADD:
+ return PerformFADDCombine(N, DCI, OptLevel);
+ case ISD::LOAD:
+ case NVPTXISD::LoadParamV2:
+ case NVPTXISD::LoadV2:
+ case NVPTXISD::LoadV4:
+ return combineUnpackingMovIntoLoad(N, DCI);
+ case ISD::MUL:
+ return PerformMULCombine(N, DCI, OptLevel);
+ case NVPTXISD::PRMT:
+ return combinePRMT(N, DCI, OptLevel);
+ case ISD::SETCC:
+ return PerformSETCCCombine(N, DCI, STI.getSmVersion());
+ case ISD::SHL:
+ return PerformSHLCombine(N, DCI, OptLevel);
+ case ISD::SREM:
+ case ISD::UREM:
+ return PerformREMCombine(N, DCI, OptLevel);
+ case NVPTXISD::StoreParam:
+ case NVPTXISD::StoreParamV2:
+ case NVPTXISD::StoreParamV4:
+ return PerformStoreParamCombine(N, DCI);
+ case ISD::STORE:
+ case NVPTXISD::StoreV2:
+ case NVPTXISD::StoreV4:
+ return PerformStoreCombine(N, DCI);
+ case ISD::VSELECT:
+ return PerformVSELECTCombine(N, DCI);
}
return SDValue();
}
@@ -6340,10 +6439,12 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
// Specialize for cmpxchg
// Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
+ SyncScope::ID SSID = cast<AtomicCmpXchgInst>(Inst)->getSyncScopeID();
if (isReleaseOrStronger(Ord))
- return Ord == AtomicOrdering::SequentiallyConsistent
- ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent)
- : Builder.CreateFence(AtomicOrdering::Release);
+ return Builder.CreateFence(Ord == AtomicOrdering::SequentiallyConsistent
+ ? Ord
+ : AtomicOrdering::Release,
+ SSID);
return nullptr;
}
@@ -6355,15 +6456,15 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
+ auto *CI = cast<AtomicCmpXchgInst>(Inst);
auto CASWidth =
- cast<IntegerType>(
- dyn_cast<AtomicCmpXchgInst>(Inst)->getCompareOperand()->getType())
- ->getBitWidth();
+ cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth();
+ SyncScope::ID SSID = CI->getSyncScopeID();
// Do not emit a trailing fence for cmpxchg seq_cst which are not emulated
if (isAcquireOrStronger(Ord) &&
(Ord != AtomicOrdering::SequentiallyConsistent ||
CASWidth < STI.getMinCmpXchgSizeInBits()))
- return Builder.CreateFence(AtomicOrdering::Acquire);
+ return Builder.CreateFence(AtomicOrdering::Acquire, SSID);
return nullptr;
}
@@ -6402,3 +6503,45 @@ MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
return getDataSection();
}
+
+// Compute the known bits of a PRMT node whose selector is a constant: each
+// result byte receives the known bits of the source byte that its selector
+// nibble picks. Known is left untouched when the selector is not a constant.
+static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known,
+ const SelectionDAG &DAG, unsigned Depth) {
+ SDValue A = Op.getOperand(0);
+ SDValue B = Op.getOperand(1);
+ ConstantSDNode *Selector = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ unsigned Mode = Op.getConstantOperandVal(3);
+
+ if (!Selector)
+ return;
+
+ // Recurse with Depth + 1 so that walking through PRMT operands counts
+ // against the DAG's known-bits recursion limit.
+ KnownBits AKnown = DAG.computeKnownBits(A, Depth + 1);
+ KnownBits BKnown = DAG.computeKnownBits(B, Depth + 1);
+
+ // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
+ KnownBits BitField = BKnown.concat(AKnown);
+
+ APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode);
+ // Handle at most four result bytes, fewer if Known is narrower than 32 bits.
+ for (unsigned I : llvm::seq(std::min(4U, Known.getBitWidth() / 8))) {
+ APInt Sel = SelectorVal.extractBits(4, I * 4);
+ // Low three bits pick the source byte; the top bit requests sign fill.
+ unsigned Idx = Sel.getLoBits(3).getZExtValue();
+ unsigned Sign = Sel.getHiBits(1).getZExtValue();
+ KnownBits Byte = BitField.extractBits(8, Idx * 8);
+ if (Sign)
+ Byte = KnownBits::ashr(Byte, 8);
+ Known.insertBits(Byte, I * 8);
+ }
+}
+
+// Target hook for known-bits analysis of NVPTX-specific nodes. Known is reset
+// first; only NVPTXISD::PRMT is analyzed, every other opcode leaves Known in
+// its reset state.
+void NVPTXTargetLowering::computeKnownBitsForTargetNode(
+ const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+ const SelectionDAG &DAG, unsigned Depth) const {
+ Known.resetAll();
+
+ switch (Op.getOpcode()) {
+ case NVPTXISD::PRMT:
+ computeKnownBitsForPRMT(Op, Known, DAG, Depth);
+ break;
+ default:
+ break;
+ }
+} \ No newline at end of file
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 2477e1fb6159..bc3548c0272b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -50,7 +50,6 @@ enum NodeType : unsigned {
MUL_WIDE_UNSIGNED,
SETP_F16X2,
SETP_BF16X2,
- BFE,
BFI,
PRMT,
@@ -272,6 +271,11 @@ public:
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
EVT ToVT) const override;
+ void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
+
private:
const NVPTXSubtarget &STI; // cache the subtarget here
mutable unsigned GlobalUniqueCallSite;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index db6b411509e9..a5bb83dfadb8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1372,11 +1372,6 @@ def BREV64 :
// restriction in PTX?
//
// dest and src may be int32 or int64, but start and end are always int32.
-def SDTBFE :
- SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>,
- SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
-def bfe : SDNode<"NVPTXISD::BFE", SDTBFE>;
-
def SDTBFI :
SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
@@ -1387,22 +1382,13 @@ def SDTPRMT :
SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>;
-multiclass BFE<string Instr, ValueType T, RegisterClass RC> {
+multiclass BFE<string Instr, RegisterClass RC> {
def rrr
- : BasicNVPTXInst<(outs RC:$d),
- (ins RC:$a, B32:$b, B32:$c),
- Instr,
- [(set T:$d, (bfe T:$a, i32:$b, i32:$c))]>;
+ : BasicNVPTXInst<(outs RC:$d), (ins RC:$a, B32:$b, B32:$c), Instr>;
def rri
- : BasicNVPTXInst<(outs RC:$d),
- (ins RC:$a, B32:$b, i32imm:$c),
- Instr,
- [(set T:$d, (bfe T:$a, i32:$b, imm:$c))]>;
+ : BasicNVPTXInst<(outs RC:$d), (ins RC:$a, B32:$b, i32imm:$c), Instr>;
def rii
- : BasicNVPTXInst<(outs RC:$d),
- (ins RC:$a, i32imm:$b, i32imm:$c),
- Instr,
- [(set T:$d, (bfe T:$a, imm:$b, imm:$c))]>;
+ : BasicNVPTXInst<(outs RC:$d), (ins RC:$a, i32imm:$b, i32imm:$c), Instr>;
}
multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> {
@@ -1447,10 +1433,10 @@ let hasSideEffects = false in {
// the same patterns, so the first one wins. Having unsigned byte extraction
// has the benefit of always having zero in unused bits, which makes some
// optimizations easier (e.g. no need to mask them).
- defm BFE_U32 : BFE<"bfe.u32", i32, B32>;
- defm BFE_S32 : BFE<"bfe.s32", i32, B32>;
- defm BFE_U64 : BFE<"bfe.u64", i64, B64>;
- defm BFE_S64 : BFE<"bfe.s64", i64, B64>;
+ defm BFE_U32 : BFE<"bfe.u32", B32>;
+ defm BFE_S32 : BFE<"bfe.s32", B32>;
+ defm BFE_U64 : BFE<"bfe.u64", B64>;
+ defm BFE_S64 : BFE<"bfe.s64", B64>;
defm BFI_B32 : BFI<"bfi.b32", i32, B32, i32imm>;
defm BFI_B64 : BFI<"bfi.b64", i64, B64, i64imm>;
@@ -1467,18 +1453,33 @@ let hasSideEffects = false in {
(ins PrmtMode:$mode),
"prmt.b32$mode",
[(set i32:$d, (prmt i32:$a, i32:$b, imm:$c, imm:$mode))]>;
+ def PRMT_B32rir
+ : BasicFlagsNVPTXInst<(outs B32:$d),
+ (ins B32:$a, i32imm:$b, B32:$c),
+ (ins PrmtMode:$mode),
+ "prmt.b32$mode",
+ [(set i32:$d, (prmt i32:$a, imm:$b, i32:$c, imm:$mode))]>;
def PRMT_B32rii
: BasicFlagsNVPTXInst<(outs B32:$d),
(ins B32:$a, i32imm:$b, Hexu32imm:$c),
(ins PrmtMode:$mode),
"prmt.b32$mode",
[(set i32:$d, (prmt i32:$a, imm:$b, imm:$c, imm:$mode))]>;
- def PRMT_B32rir
+ def PRMT_B32irr
: BasicFlagsNVPTXInst<(outs B32:$d),
- (ins B32:$a, i32imm:$b, B32:$c),
- (ins PrmtMode:$mode),
+ (ins i32imm:$a, B32:$b, B32:$c), (ins PrmtMode:$mode),
+ "prmt.b32$mode",
+ [(set i32:$d, (prmt imm:$a, i32:$b, i32:$c, imm:$mode))]>;
+ def PRMT_B32iri
+ : BasicFlagsNVPTXInst<(outs B32:$d),
+ (ins i32imm:$a, B32:$b, Hexu32imm:$c), (ins PrmtMode:$mode),
"prmt.b32$mode",
- [(set i32:$d, (prmt i32:$a, imm:$b, i32:$c, imm:$mode))]>;
+ [(set i32:$d, (prmt imm:$a, i32:$b, imm:$c, imm:$mode))]>;
+ def PRMT_B32iir
+ : BasicFlagsNVPTXInst<(outs B32:$d),
+ (ins i32imm:$a, i32imm:$b, B32:$c), (ins PrmtMode:$mode),
+ "prmt.b32$mode",
+ [(set i32:$d, (prmt imm:$a, imm:$b, i32:$c, imm:$mode))]>;
}
@@ -1487,19 +1488,26 @@ def : Pat<(fshr i32:$hi, i32:$lo, (shl i32:$amt, (i32 3))),
(PRMT_B32rrr $lo, $hi, $amt, PrmtF4E)>;
+// Matches the four selector immediates 0x7770..0x7773: the low nibble picks
+// source byte 0..3 for result byte 0 and the other three nibbles are all 7.
+def byte_extract_prmt : ImmLeaf<i32, [{
+ return (Imm == 0x7770) || (Imm == 0x7771) || (Imm == 0x7772) || (Imm == 0x7773);
+}]>;
+
+// Rewrites a selector immediate: the low nibble B is kept for result byte 0 and
+// the next three nibbles are each set to B | 8, i.e. the same byte index with
+// the nibble's top bit set. The result is a 32-bit target constant.
+def to_sign_extend_selector : SDNodeXForm<imm, [{
+ const APInt &V = N->getAPIntValue();
+ const APInt B = V.trunc(4);
+ const APInt BSext = B | 8;
+ const APInt R = BSext.concat(BSext).concat(BSext).concat(B).zext(32);
+ return CurDAG->getTargetConstant(R, SDLoc(N), MVT::i32);
+}]>;
+
+
// byte extraction + signed/unsigned extension to i32.
-def : Pat<(i32 (sext_inreg (bfe i32:$s, i32:$o, 8), i8)),
- (BFE_S32rri $s, $o, 8)>;
-def : Pat<(i32 (sext_inreg (bfe i32:$s, imm:$o, 8), i8)),
- (BFE_S32rii $s, imm:$o, 8)>;
-def : Pat<(i32 (and (bfe i32:$s, i32:$o, 8), 255)),
- (BFE_U32rri $s, $o, 8)>;
-def : Pat<(i32 (and (bfe i32:$s, imm:$o, 8), 255)),
- (BFE_U32rii $s, imm:$o, 8)>;
+def : Pat<(i32 (sext_inreg (prmt i32:$s, 0, byte_extract_prmt:$sel, PrmtNONE), i8)),
+ (PRMT_B32rii $s, 0, (to_sign_extend_selector $sel), PrmtNONE)>;
// byte extraction + signed extension to i16
-def : Pat<(i16 (sext_inreg (trunc (bfe i32:$s, imm:$o, 8)), i8)),
- (CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>;
+def : Pat<(i16 (sext_inreg (trunc (prmt i32:$s, 0, byte_extract_prmt:$sel, PrmtNONE)), i8)),
+ (CVT_u16_u32 (PRMT_B32rii $s, 0, (to_sign_extend_selector $sel), PrmtNONE), CvtNONE)>;
// Byte extraction via shift/trunc/sext
@@ -1615,8 +1623,8 @@ def ADDR : Operand<pAny> {
let MIOperandInfo = (ops ADDR_base, i32imm);
}
-def LdStCode : Operand<i32> {
- let PrintMethod = "printLdStCode";
+def AtomicCode : Operand<i32> {
+ let PrintMethod = "printAtomicCode";
}
def MmaCode : Operand<i32> {
@@ -1709,28 +1717,36 @@ def cond_not_signed : PatLeaf<(cond), [{
return !isSignedIntSetCC(N->get());
}]>;
-// comparisons of i8 extracted with BFE as i32
-// It's faster to do comparison directly on i32 extracted by BFE,
+// comparisons of i8 extracted with PRMT as i32
+// It's faster to do comparison directly on i32 extracted by PRMT,
// instead of the long conversion and sign extending.
-def: Pat<(setcc (i16 (sext_inreg (i16 (trunc (bfe B32:$a, B32:$oa, 8))), i8)),
- (i16 (sext_inreg (i16 (trunc (bfe B32:$b, B32:$ob, 8))), i8)),
+// Signed compare of sign-extended bytes: the i32 operands must carry the byte's
+// sign in their upper bits, so the selector is rewritten with
+// to_sign_extend_selector. With the plain byte-extract selector, bytes >= 0x80
+// would compare as large positive values.
+def: Pat<(setcc (i16 (sext_inreg (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), i8)),
+ (i16 (sext_inreg (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), i8)),
cond_signed:$cc),
- (SETP_i32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), (cond2cc $cc))>;
+ (SETP_i32rr (PRMT_B32rii i32:$a, 0, (to_sign_extend_selector $sel_a), PrmtNONE),
+ (PRMT_B32rii i32:$b, 0, (to_sign_extend_selector $sel_b), PrmtNONE),
+ (cond2cc $cc))>;
-def: Pat<(setcc (i16 (sext_inreg (trunc (bfe B32:$a, imm:$oa, 8)), i8)),
- (i16 (sext_inreg (trunc (bfe B32:$b, imm:$ob, 8)), i8)),
+// Signed compare of sign-extended bytes: rewrite the selector with
+// to_sign_extend_selector so each PRMT yields a sign-extended i32; the plain
+// byte-extract selector would zero-extend and mis-order bytes >= 0x80.
+def: Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE)), i8)),
+ (i16 (sext_inreg (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE)), i8)),
cond_signed:$cc),
- (SETP_i32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), (cond2cc $cc))>;
+ (SETP_i32rr (PRMT_B32rii i32:$a, 0, (to_sign_extend_selector $sel_a), PrmtNONE),
+ (PRMT_B32rii i32:$b, 0, (to_sign_extend_selector $sel_b), PrmtNONE),
+ (cond2cc $cc))>;
-def: Pat<(setcc (i16 (and (trunc (bfe B32:$a, B32:$oa, 8)), 255)),
- (i16 (and (trunc (bfe B32:$b, B32:$ob, 8)), 255)),
+def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))),
+ (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))),
cond_signed:$cc),
- (SETP_i32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), (cond2cc $cc))>;
+ (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
+ (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
+ (cond2cc $cc))>;
-def: Pat<(setcc (i16 (and (trunc (bfe B32:$a, imm:$oa, 8)), 255)),
- (i16 (and (trunc (bfe B32:$b, imm:$ob, 8)), 255)),
+def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))),
+ (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))),
cond_not_signed:$cc),
- (SETP_i32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), (cond2cc $cc))>;
+ (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
+ (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
+ (cond2cc $cc))>;
def SDTDeclareArrayParam :
SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>;
@@ -1961,7 +1977,7 @@ defm ProxyRegB64 : ProxyRegInst<"b64", B64>;
class LD<NVPTXRegClass regclass>
: NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Sign,
+ (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, AtomicCode:$Sign,
i32imm:$fromWidth, ADDR:$addr),
"ld${sem:sem}${scope:scope}${addsp:addsp}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
@@ -1977,7 +1993,7 @@ class ST<DAGOperand O>
: NVPTXInst<
(outs),
(ins O:$src,
- LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$toWidth,
+ AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$toWidth,
ADDR:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}.b$toWidth"
" \t[$addr], $src;", []>;
@@ -1995,21 +2011,21 @@ let mayStore=1, hasSideEffects=0 in {
multiclass LD_VEC<NVPTXRegClass regclass, bit support_v8 = false> {
def _v2 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
- LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr),
+ (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
+ AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr),
"ld${sem:sem}${scope:scope}${addsp:addsp}.v2.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v4 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
- LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr),
+ (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
+ AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr),
"ld${sem:sem}${scope:scope}${addsp:addsp}.v4.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
if support_v8 then
def _v8 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4,
regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8),
- (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Sign,
+ (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, AtomicCode:$Sign,
i32imm:$fromWidth, ADDR:$addr),
"ld${sem:sem}${scope:scope}${addsp:addsp}.v8.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, "
@@ -2026,14 +2042,14 @@ multiclass ST_VEC<DAGOperand O, bit support_v8 = false> {
def _v2 : NVPTXInst<
(outs),
(ins O:$src1, O:$src2,
- LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$fromWidth,
+ AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth,
ADDR:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}.v2.b$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v4 : NVPTXInst<
(outs),
(ins O:$src1, O:$src2, O:$src3, O:$src4,
- LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$fromWidth,
+ AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth,
ADDR:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}.v4.b$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
@@ -2042,7 +2058,7 @@ multiclass ST_VEC<DAGOperand O, bit support_v8 = false> {
(outs),
(ins O:$src1, O:$src2, O:$src3, O:$src4,
O:$src5, O:$src6, O:$src7, O:$src8,
- LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$fromWidth,
+ AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth,
ADDR:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}.v8.b$fromWidth "
"\t[$addr], "
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 93827be5c281..70150bdfc8d1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -41,6 +41,46 @@ def AS_match {
}];
}
+
+//===----------------------------------------------------------------------===//
+// NVPTX Scope Constants
+// These map to the Scope enum in NVPTX.h
+//===----------------------------------------------------------------------===//
+
+def Scope_thread : PatLeaf<(i32 0)>; // Thread = 0
+def Scope_cta : PatLeaf<(i32 1)>; // Block = 1
+def Scope_cluster : PatLeaf<(i32 2)>; // Cluster = 2
+def Scope_device : PatLeaf<(i32 3)>; // Device = 3
+def Scope_sys : PatLeaf<(i32 4)>; // System = 4
+
+//===----------------------------------------------------------------------===//
+// NVPTX Address Space Constants
+// These map to the AddressSpace enum in NVPTX.h
+//===----------------------------------------------------------------------===//
+
+def AddrSpace_gen : PatLeaf<(i32 0)>; // Generic = 0
+def AddrSpace_global : PatLeaf<(i32 1)>; // Global = 1
+def AddrSpace_shared : PatLeaf<(i32 3)>; // Shared = 3
+def AddrSpace_const : PatLeaf<(i32 4)>; // Const = 4
+def AddrSpace_local : PatLeaf<(i32 5)>; // Local = 5
+def AddrSpace_shared_cluster : PatLeaf<(i32 7)>; // SharedCluster = 7
+def AddrSpace_param : PatLeaf<(i32 101)>; // Param = 101
+
+//===----------------------------------------------------------------------===//
+// NVPTX Ordering Constants
+// These map to the Ordering enum in NVPTX.h
+//===----------------------------------------------------------------------===//
+
+def Ordering_not_atomic : PatLeaf<(i32 0)>; // NotAtomic = 0
+def Ordering_relaxed : PatLeaf<(i32 2)>; // Relaxed = 2
+def Ordering_acquire : PatLeaf<(i32 4)>; // Acquire = 4
+def Ordering_release : PatLeaf<(i32 5)>; // Release = 5
+def Ordering_acquire_release : PatLeaf<(i32 6)>; // AcquireRelease = 6
+def Ordering_sequentially_consistent : PatLeaf<(i32 7)>; // SequentiallyConsistent = 7
+def Ordering_volatile : PatLeaf<(i32 8)>; // Volatile = 8
+def Ordering_relaxed_mmio : PatLeaf<(i32 9)>; // RelaxedMMIO = 9
+
+
// A node that will be replaced with the current PTX version.
class PTX {
SDNodeXForm PTXVerXform = SDNodeXForm<imm, [{
@@ -1007,24 +1047,6 @@ class F_MATH_3<string OpcStr, NVPTXRegClass t_regclass,
// MISC
//
-class PRMT3Pat<Intrinsic prmt_intrinsic, PatLeaf prmt_mode>
- : Pat<(prmt_intrinsic i32:$a, i32:$b, i32:$c),
- (PRMT_B32rrr $a, $b, $c, prmt_mode)>;
-
-class PRMT2Pat<Intrinsic prmt_intrinsic, PatLeaf prmt_mode>
- : Pat<(prmt_intrinsic i32:$a, i32:$c),
- (PRMT_B32rir $a, (i32 0), $c, prmt_mode)>;
-
-def : PRMT3Pat<int_nvvm_prmt, PrmtNONE>;
-def : PRMT3Pat<int_nvvm_prmt_f4e, PrmtF4E>;
-def : PRMT3Pat<int_nvvm_prmt_b4e, PrmtB4E>;
-
-def : PRMT2Pat<int_nvvm_prmt_rc8, PrmtRC8>;
-def : PRMT2Pat<int_nvvm_prmt_ecl, PrmtECL>;
-def : PRMT2Pat<int_nvvm_prmt_ecr, PrmtECR>;
-def : PRMT2Pat<int_nvvm_prmt_rc16, PrmtRC16>;
-
-
def INT_NVVM_NANOSLEEP_I : BasicNVPTXInst<(outs), (ins i32imm:$i), "nanosleep.u32",
[(int_nvvm_nanosleep imm:$i)]>,
Requires<[hasPTX<63>, hasSM<70>]>;
@@ -1860,35 +1882,50 @@ multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str,
}
}
-// has 3 operands
-multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string as_str, string op_str,
- SDPatternOperator op, list<Predicate> preds> {
- defvar asm_str = "atom" # sem_str # as_str # "." # op_str;
+multiclass F_ATOMIC_3<RegTyInfo t, string op_str, SDPatternOperator op, SDNode atomic> {
+ defvar asm_str = "atom${sem:sem}${scope:scope}${addsp:addsp}" # op_str;
+
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
- def rr : BasicNVPTXInst<(outs t.RC:$dst),
- (ins ADDR:$addr, t.RC:$b, t.RC:$c),
- asm_str,
- [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, t.Ty:$c))]>,
- Requires<preds>;
+ def _rr : BasicFlagsNVPTXInst<(outs t.RC:$dst),
+ (ins ADDR:$addr, t.RC:$b, t.RC:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+ asm_str>;
- def ir : BasicNVPTXInst<(outs t.RC:$dst),
- (ins ADDR:$addr, t.Imm:$b, t.RC:$c),
- asm_str,
- [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c))]>,
- Requires<preds>;
+ def _ir : BasicFlagsNVPTXInst<(outs t.RC:$dst),
+ (ins ADDR:$addr, t.Imm:$b, t.RC:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+ asm_str>;
- def ri : BasicNVPTXInst<(outs t.RC:$dst),
- (ins ADDR:$addr, t.RC:$b, t.Imm:$c),
- asm_str,
- [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c)))]>,
- Requires<preds>;
+ def _ri : BasicFlagsNVPTXInst<(outs t.RC:$dst),
+ (ins ADDR:$addr, t.RC:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+ asm_str>;
- def ii : BasicNVPTXInst<(outs t.RC:$dst),
- (ins ADDR:$addr, t.Imm:$b, t.Imm:$c),
- asm_str,
- [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c)))]>,
- Requires<preds>;
+ def _ii : BasicFlagsNVPTXInst<(outs t.RC:$dst),
+ (ins ADDR:$addr, t.Imm:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+ asm_str>;
}
+
+ defvar GetSem = SDNodeXForm<atomic, [{
+ return getI32Imm(getMemOrder(cast<MemSDNode>(N)), SDLoc(N));
+ }]>;
+
+ defvar GetScope = SDNodeXForm<atomic, [{
+ return getI32Imm(getAtomicScope(cast<MemSDNode>(N)), SDLoc(N));
+ }]>;
+
+ defvar GetAddSp = SDNodeXForm<atomic, [{
+ return getI32Imm(getAddrSpace(cast<MemSDNode>(N)), SDLoc(N));
+ }]>;
+
+ def : Pat<(op:$this addr:$addr, t.Ty:$b, t.Ty:$c),
+ (!cast<Instruction>(NAME # _rr) ADDR:$addr, t.Ty:$b, t.Ty:$c, (GetSem $this), (GetScope $this), (GetAddSp $this))>;
+
+ def : Pat<(op:$this addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c),
+ (!cast<Instruction>(NAME # _ir) ADDR:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c, (GetSem $this), (GetScope $this), (GetAddSp $this))>;
+
+ def : Pat<(op:$this addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c)),
+ (!cast<Instruction>(NAME # _ri) ADDR:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c), (GetSem $this), (GetScope $this), (GetAddSp $this))>;
+
+ def : Pat<(op:$this addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c)),
+ (!cast<Instruction>(NAME # _ii) ADDR:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c), (GetSem $this), (GetScope $this), (GetAddSp $this))>;
}
multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, list<Predicate> preds = []> {
@@ -1899,14 +1936,6 @@ multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, lis
defm _GEN : F_ATOMIC_2<t, "", "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}
-multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string sem_str, string op_str, list<Predicate> preds = []> {
- defvar frag_pat = (frag node:$a, node:$b, node:$c);
- defm _G : F_ATOMIC_3<t, sem_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
- defm _S : F_ATOMIC_3<t, sem_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
- defm _S_C : F_ATOMIC_3<t, sem_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
- defm _GEN : F_ATOMIC_3<t, sem_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
-}
-
// atom_add
defm INT_PTX_ATOM_ADD_32 : F_ATOMIC_2_AS<I32RT, atomic_load_add_i32, "add.u32">;
defm INT_PTX_ATOM_ADD_64 : F_ATOMIC_2_AS<I64RT, atomic_load_add_i64, "add.u64">;
@@ -1951,23 +1980,12 @@ defm INT_PTX_ATOM_XOR_64 : F_ATOMIC_2_AS<I64RT, atomic_load_xor_i64, "xor.b64",
// Define atom.cas for all combinations of size x addrspace x memory order
// supported in PTX *and* on the hardware.
-foreach t = [I32RT, I64RT] in {
- foreach order = ["acquire", "release", "acq_rel", "monotonic"] in {
- defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order);
- defvar atomic_cmp_swap_pat = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order);
- // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it.
- // Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions-
- // for SM70+, and "old" ones which lower to "atom.cas", for earlier archs.
- defm INT_PTX_ATOM_CAS_#t.Size#_#order
- : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
- defm INT_PTX_ATOM_CAS_#t.Size#_#order#_old
- : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "cas.b"#t.Size, []>;
- }
+foreach t = [I16RT, I32RT, I64RT] in {
+ defvar atomic_cmp_swap_pat = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size);
+ defm INT_PTX_ATOM_CAS_#t.Size
+ : F_ATOMIC_3<t, ".cas.b"#t.Size, atomic_cmp_swap_pat, atomic_cmp_swap>;
}
-// Note that 16-bit CAS support in PTX is emulated.
-defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "cas.b16", [hasSM<70>, hasPTX<63>]>;
-
// Support for scoped atomic operations. Matches
// int_nvvm_atomic_{op}_{space}_{type}_{scope}
// and converts it into the appropriate instruction.
@@ -1991,19 +2009,6 @@ multiclass ATOM2N_impl<string OpStr, string IntTypeStr, string TypeStr,
# !if(!empty(ScopeStr), "", "_" # ScopeStr)),
preds = Preds>;
}
-multiclass ATOM3N_impl<string OpStr, string IntTypeStr, string TypeStr,
- string ScopeStr, string SpaceStr,
- RegTyInfo t, list<Predicate> Preds> {
- defm "" : F_ATOMIC_3<t,
- as_str = !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr),
- sem_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
- op_str = OpStr # "." # TypeStr,
- op = !cast<Intrinsic>(
- "int_nvvm_atomic_" # OpStr
- # "_" # SpaceStr # "_" # IntTypeStr
- # !if(!empty(ScopeStr), "", "_" # ScopeStr)),
- preds = Preds>;
-}
// Constructs variants for different scopes of atomic op.
multiclass ATOM2S_impl<string OpStr, string IntTypeStr, string TypeStr,
@@ -2018,15 +2023,22 @@ multiclass ATOM2S_impl<string OpStr, string IntTypeStr, string TypeStr,
}
}
}
-multiclass ATOM3S_impl<string OpStr, string IntTypeStr, string TypeStr,
- RegTyInfo t, list<Predicate> Preds> {
- // No need to define ".gpu"-scoped atomics. They do the same thing
- // as the regular, non-scoped atomics defined elsewhere.
+
+multiclass F_ATOMIC_3_INTRINSIC_PATTERN<RegTyInfo t, string OpStr, string InstructionName> {
foreach scope = ["cta", "sys"] in {
- // For now we only need variants for generic space pointers.
foreach space = ["gen"] in {
- defm _#scope#space : ATOM3N_impl<OpStr, IntTypeStr, TypeStr, scope, space,
- t, !listconcat(Preds, [hasAtomScope])>;
+ defvar intrinsic = !cast<SDPatternOperator>("int_nvvm_atomic_" # OpStr # "_" # space # "_i_" # scope);
+ def : Pat<(t.Ty (intrinsic addr:$addr, t.Ty:$b, t.Ty:$c)),
+ (!cast<Instruction>(InstructionName # "_rr") ADDR:$addr, t.Ty:$b, t.Ty:$c, Ordering_not_atomic, !cast<PatLeaf>("Scope_" # scope), !cast<PatLeaf>("AddrSpace_" # space))>;
+
+ def : Pat<(t.Ty (intrinsic addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c)),
+ (!cast<Instruction>(InstructionName # "_ir") ADDR:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c, Ordering_not_atomic, !cast<PatLeaf>("Scope_" # scope), !cast<PatLeaf>("AddrSpace_" # space))>;
+
+ def : Pat<(t.Ty (intrinsic addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c))),
+ (!cast<Instruction>(InstructionName # "_ri") ADDR:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c), Ordering_not_atomic, !cast<PatLeaf>("Scope_" # scope), !cast<PatLeaf>("AddrSpace_" # space))>;
+
+ def : Pat<(t.Ty (intrinsic addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c))),
+ (!cast<Instruction>(InstructionName # "_ii") ADDR:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c), Ordering_not_atomic, !cast<PatLeaf>("Scope_" # scope), !cast<PatLeaf>("AddrSpace_" # space))>;
}
}
}
@@ -2069,9 +2081,9 @@ multiclass ATOM2_incdec_impl<string OpStr> {
// atom.cas
multiclass ATOM3_cas_impl<string OpStr> {
- defm _b16 : ATOM3S_impl<OpStr, "i", "b16", I16RT, []>;
- defm _b32 : ATOM3S_impl<OpStr, "i", "b32", I32RT, []>;
- defm _b64 : ATOM3S_impl<OpStr, "i", "b64", I64RT, []>;
+ defm _b16 : F_ATOMIC_3_INTRINSIC_PATTERN<I16RT, OpStr, "INT_PTX_ATOM_CAS_16">;
+ defm _b32 : F_ATOMIC_3_INTRINSIC_PATTERN<I32RT, OpStr, "INT_PTX_ATOM_CAS_32">;
+ defm _b64 : F_ATOMIC_3_INTRINSIC_PATTERN<I64RT, OpStr, "INT_PTX_ATOM_CAS_64">;
}
defm INT_PTX_SATOM_ADD : ATOM2_add_impl<"add">;
@@ -2137,7 +2149,7 @@ def LDU_GLOBAL_v4i32 : VLDU_G_ELE_V4<"b32", B32>;
// during the lifetime of the kernel.
class LDG_G<NVPTXRegClass regclass>
- : NVPTXInst<(outs regclass:$result), (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
+ : NVPTXInst<(outs regclass:$result), (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src),
"ld.global.nc.${Sign:sign}$fromWidth \t$result, [$src];", []>;
def LD_GLOBAL_NC_i8 : LDG_G<B16>;
@@ -2150,19 +2162,19 @@ def LD_GLOBAL_NC_i64 : LDG_G<B64>;
// Elementized vector ldg
class VLDG_G_ELE_V2<NVPTXRegClass regclass> :
NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
+ (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src),
"ld.global.nc.v2.${Sign:sign}$fromWidth \t{{$dst1, $dst2}}, [$src];", []>;
class VLDG_G_ELE_V4<NVPTXRegClass regclass> :
NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
+ (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src),
"ld.global.nc.v4.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
class VLDG_G_ELE_V8<NVPTXRegClass regclass> :
NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4,
regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8),
- (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
+ (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src),
"ld.global.nc.v8.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>;
// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index 88d3eefcc521..4eb452f39822 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -165,6 +165,8 @@ inline std::string ScopeToString(Scope S) {
return "Cluster";
case Scope::Device:
return "Device";
+ case Scope::DefaultDevice:
+ return "DefaultDevice";
}
report_fatal_error(formatv("Unknown NVPTX::Scope \"{}\".",
static_cast<ScopeUnderlyingType>(S)));
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index 53312e36fb9d..a5d3be40c5cf 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -96,7 +96,7 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup,
// determine the type of the relocation
unsigned Type = 0;
if (IsPCRel) {
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
default:
llvm_unreachable("Unimplemented");
case PPC::fixup_ppc_br24:
@@ -173,8 +173,9 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup,
break;
}
} else {
- switch (Fixup.getTargetKind()) {
- default: llvm_unreachable("invalid fixup kind!");
+ switch (Fixup.getKind()) {
+ default:
+ llvm_unreachable("invalid fixup kind!");
case PPC::fixup_ppc_br24abs:
Type = ELF::R_PPC_ADDR24;
break;
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
index ee99cfc7d655..2dbc31fce72c 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
@@ -155,11 +155,10 @@ void PPCELFStreamer::emitGOTToPCRelReloc(const MCInst &Inst) {
const MCExpr *SubExpr2 =
MCBinaryExpr::createSub(CurrentLocationExpr, SubExpr, getContext());
- MCDataFragment *DF = static_cast<MCDataFragment *>(LabelSym->getFragment());
- assert(DF && "Expecting a valid data fragment.");
- MCFixupKind FixupKind = static_cast<MCFixupKind>(FirstLiteralRelocationKind +
- ELF::R_PPC64_PCREL_OPT);
- DF->addFixup(MCFixup::create(LabelSym->getOffset() - 8, SubExpr2, FixupKind));
+ MCFragment *F = LabelSym->getFragment();
+ F->addFixup(
+ MCFixup::create(LabelSym->getOffset() - 8, SubExpr2,
+ FirstLiteralRelocationKind + ELF::R_PPC64_PCREL_OPT));
emitLabel(CurrentLocation, Inst.getLoc());
}
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 1521ad5f4502..a091b21f4a79 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -2425,8 +2425,7 @@ void PPCAIXAsmPrinter::emitTracebackTable() {
// Set the 4th byte of the mandatory field.
FirstHalfOfMandatoryField |= TracebackTable::IsFunctionNamePresentMask;
- const PPCRegisterInfo *RegInfo =
- static_cast<const PPCRegisterInfo *>(Subtarget->getRegisterInfo());
+ const PPCRegisterInfo *RegInfo = Subtarget->getRegisterInfo();
Register FrameReg = RegInfo->getFrameRegister(*MF);
if (FrameReg == (Subtarget->isPPC64() ? PPC::X31 : PPC::R31))
FirstHalfOfMandatoryField |= TracebackTable::IsAllocaUsedMask;
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 66f4aade380f..a143d85f61ec 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -1199,6 +1199,14 @@ public:
addExpr(Inst, getImm(), isRV64Imm());
}
+ void addSImm10UnsignedOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ int64_t Imm;
+ [[maybe_unused]] bool IsConstant = evaluateConstantImm(getImm(), Imm);
+ assert(IsConstant);
+ Inst.addOperand(MCOperand::createImm(SignExtend64<10>(Imm)));
+ }
+
void addFPImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
if (isImm()) {
@@ -1650,6 +1658,10 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_InvalidSImm26:
return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 25),
(1 << 25) - 1);
+ // HACK: See comment before `BareSymbolQC_E_LI` in RISCVInstrInfoXqci.td.
+ case Match_InvalidBareSymbolQC_E_LI:
+ LLVM_FALLTHROUGH;
+ // END HACK
case Match_InvalidBareSImm32:
return generateImmOutOfRangeError(Operands, ErrorInfo,
std::numeric_limits<int32_t>::min(),
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index b723958a6ff2..fa7bcfa0e813 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -774,7 +774,8 @@ static constexpr FeatureBitset XTHeadGroup = {
RISCV::FeatureVendorXTHeadVdot};
static constexpr FeatureBitset XAndesGroup = {
- RISCV::FeatureVendorXAndesPerf, RISCV::FeatureVendorXAndesVBFHCvt,
+ RISCV::FeatureVendorXAndesPerf, RISCV::FeatureVendorXAndesBFHCvt,
+ RISCV::FeatureVendorXAndesVBFHCvt,
RISCV::FeatureVendorXAndesVSIntLoad, RISCV::FeatureVendorXAndesVPackFPH,
RISCV::FeatureVendorXAndesVDot};
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index 89a87798d71e..f76f8b3060d2 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -76,12 +76,13 @@ MCFixupKindInfo RISCVAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"fixup_riscv_branch", 0, 32, 0},
{"fixup_riscv_rvc_jump", 2, 11, 0},
{"fixup_riscv_rvc_branch", 0, 16, 0},
+ {"fixup_riscv_rvc_imm", 0, 16, 0},
{"fixup_riscv_call", 0, 64, 0},
{"fixup_riscv_call_plt", 0, 64, 0},
{"fixup_riscv_qc_e_branch", 0, 48, 0},
{"fixup_riscv_qc_e_32", 16, 32, 0},
- {"fixup_riscv_qc_abs20_u", 12, 20, 0},
+ {"fixup_riscv_qc_abs20_u", 0, 32, 0},
{"fixup_riscv_qc_e_call_plt", 0, 48, 0},
// Andes fixups
@@ -103,12 +104,13 @@ MCFixupKindInfo RISCVAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
return Infos[Kind - FirstTargetFixupKind];
}
-bool RISCVAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup,
+bool RISCVAsmBackend::fixupNeedsRelaxationAdvanced(const MCFragment &,
+ const MCFixup &Fixup,
const MCValue &,
uint64_t Value,
bool Resolved) const {
int64_t Offset = int64_t(Value);
- unsigned Kind = Fixup.getTargetKind();
+ auto Kind = Fixup.getKind();
// Return true if the symbol is unresolved.
if (!Resolved)
@@ -134,6 +136,10 @@ bool RISCVAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup,
// For jump instructions the immediate must be in the range
// [-1048576, 1048574]
return Offset > 1048574 || Offset < -1048576;
+ case RISCV::fixup_riscv_rvc_imm:
+ // This fixup can never be emitted as a relocation, so always needs to be
+ // relaxed.
+ return true;
}
}
@@ -152,6 +158,18 @@ static unsigned getRelaxedOpcode(unsigned Opcode, ArrayRef<MCOperand> Operands,
// This only relaxes one "step" - i.e. from C.J to JAL, not from C.J to
// QC.E.J, because we can always relax again if needed.
return RISCV::JAL;
+ case RISCV::C_LI:
+ if (!STI.hasFeature(RISCV::FeatureVendorXqcili))
+ break;
+ // We only need this because `QC.E.LI` can be compressed into a `C.LI`. This
+ // happens because the `simm6` MCOperandPredicate accepts bare symbols, and
+ // `QC.E.LI` is the only instruction that accepts bare symbols at parse-time
+ // and compresses to `C.LI`. `C.LI` does not itself accept bare symbols at
+ // parse time.
+ //
+ // If we have a bare symbol, we need to turn this back to a `QC.E.LI`, as we
+ // have no way to emit a relocation on a `C.LI` instruction.
+ return RISCV::QC_E_LI;
case RISCV::JAL: {
// We can only relax JAL if we have Xqcilb
if (!STI.hasFeature(RISCV::FeatureVendorXqcilb))
@@ -240,6 +258,23 @@ void RISCVAsmBackend::relaxInstruction(MCInst &Inst,
Res.addOperand(Inst.getOperand(1));
break;
}
+ case RISCV::C_LI: {
+ // This should only be hit when trying to relax a `C.LI` into a `QC.E.LI`
+ // because the `C.LI` has a bare symbol. We cannot use
+ // `RISCVRVC::uncompress` because it will use decompression patterns. The
+ // `QC.E.LI` compression pattern to `C.LI` is compression-only (because we
+ // don't want `c.li` ever printed as `qc.e.li`, which might be done if the
+ // pattern applied to decompression), but that doesn't help much because
+ // `C.LI` with a bare symbol will decompress to an `ADDI` anyway (because
+ // `simm12`'s MCOperandPredicate accepts a bare symbol and that pattern
+ // comes first), and we still cannot emit an `ADDI` with a bare symbol.
+ assert(STI.hasFeature(RISCV::FeatureVendorXqcili) &&
+ "C.LI is only relaxable with Xqcili");
+ Res.setOpcode(getRelaxedOpcode(Inst.getOpcode(), Inst.getOperands(), STI));
+ Res.addOperand(Inst.getOperand(0));
+ Res.addOperand(Inst.getOperand(1));
+ break;
+ }
case RISCV::BEQ:
case RISCV::BNE:
case RISCV::BLT:
@@ -267,14 +302,14 @@ void RISCVAsmBackend::relaxInstruction(MCInst &Inst,
Inst = std::move(Res);
}
-bool RISCVAsmBackend::relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF,
+bool RISCVAsmBackend::relaxDwarfLineAddr(MCFragment &F,
bool &WasRelaxed) const {
MCContext &C = getContext();
- int64_t LineDelta = DF.getLineDelta();
- const MCExpr &AddrDelta = DF.getAddrDelta();
+ int64_t LineDelta = F.getDwarfLineDelta();
+ const MCExpr &AddrDelta = F.getDwarfAddrDelta();
SmallVector<MCFixup, 1> Fixups;
- size_t OldSize = DF.getContents().size();
+ size_t OldSize = F.getVarSize();
int64_t Value;
[[maybe_unused]] bool IsAbsolute =
@@ -327,17 +362,16 @@ bool RISCVAsmBackend::relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF,
OS << uint8_t(dwarf::DW_LNS_copy);
}
- DF.setContents(Data);
- DF.setFixups(Fixups);
+ F.setVarContents(Data);
+ F.setVarFixups(Fixups);
WasRelaxed = OldSize != Data.size();
return true;
}
-bool RISCVAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF,
- bool &WasRelaxed) const {
- const MCExpr &AddrDelta = DF.getAddrDelta();
+bool RISCVAsmBackend::relaxDwarfCFA(MCFragment &F, bool &WasRelaxed) const {
+ const MCExpr &AddrDelta = F.getDwarfAddrDelta();
SmallVector<MCFixup, 2> Fixups;
- size_t OldSize = DF.getContents().size();
+ size_t OldSize = F.getVarSize();
int64_t Value;
if (AddrDelta.evaluateAsAbsolute(Value, *Asm))
@@ -349,9 +383,9 @@ bool RISCVAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF,
assert(getContext().getAsmInfo()->getMinInstAlignment() == 1 &&
"expected 1-byte alignment");
if (Value == 0) {
- DF.clearContents();
- DF.clearFixups();
- WasRelaxed = OldSize != DF.getContents().size();
+ F.clearVarContents();
+ F.clearVarFixups();
+ WasRelaxed = OldSize != 0;
return true;
}
@@ -382,20 +416,20 @@ bool RISCVAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF,
} else {
llvm_unreachable("unsupported CFA encoding");
}
- DF.setContents(Data);
- DF.setFixups(Fixups);
+ F.setVarContents(Data);
+ F.setVarFixups(Fixups);
WasRelaxed = OldSize != Data.size();
return true;
}
-std::pair<bool, bool> RISCVAsmBackend::relaxLEB128(MCLEBFragment &LF,
+std::pair<bool, bool> RISCVAsmBackend::relaxLEB128(MCFragment &LF,
int64_t &Value) const {
- if (LF.isSigned())
+ if (LF.isLEBSigned())
return std::make_pair(false, false);
- const MCExpr &Expr = LF.getValue();
+ const MCExpr &Expr = LF.getLEBValue();
if (ULEB128Reloc) {
- LF.addFixup(MCFixup::create(0, &Expr, FK_Data_leb128));
+ LF.setVarFixups({MCFixup::create(0, &Expr, FK_Data_leb128)});
}
return std::make_pair(Expr.evaluateKnownAbsolute(Value, *Asm), false);
}
@@ -440,7 +474,7 @@ bool RISCVAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
MCContext &Ctx) {
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
default:
llvm_unreachable("Unknown fixup kind!");
case FK_Data_1:
@@ -539,10 +573,18 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
(Bit5 << 2);
return Value;
}
+ case RISCV::fixup_riscv_rvc_imm: {
+ if (!isInt<6>(Value))
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+ unsigned Bit5 = (Value >> 5) & 0x1;
+ unsigned Bit4_0 = Value & 0x1f;
+ Value = (Bit5 << 12) | (Bit4_0 << 2);
+ return Value;
+ }
case RISCV::fixup_riscv_qc_e_32: {
if (!isInt<32>(Value))
Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
- return ((Value & 0xffffffff) << 16);
+ return Value & 0xffffffffu;
}
case RISCV::fixup_riscv_qc_abs20_u: {
if (!isInt<20>(Value))
@@ -620,14 +662,13 @@ static const MCFixup *getPCRelHiFixup(const MCSpecifierExpr &Expr,
const MCSymbol *AUIPCSymbol = AUIPCLoc.getAddSym();
if (!AUIPCSymbol)
return nullptr;
- const auto *DF = dyn_cast_or_null<MCDataFragment>(AUIPCSymbol->getFragment());
-
+ const auto *DF = AUIPCSymbol->getFragment();
if (!DF)
return nullptr;
uint64_t Offset = AUIPCSymbol->getOffset();
if (DF->getContents().size() == Offset) {
- DF = dyn_cast_or_null<MCDataFragment>(DF->getNext());
+ DF = DF->getNext();
if (!DF)
return nullptr;
Offset = 0;
@@ -636,7 +677,7 @@ static const MCFixup *getPCRelHiFixup(const MCSpecifierExpr &Expr,
for (const MCFixup &F : DF->getFixups()) {
if (F.getOffset() != Offset)
continue;
- auto Kind = F.getTargetKind();
+ auto Kind = F.getKind();
if (!mc::isRelocation(F.getKind())) {
if (Kind == RISCV::fixup_riscv_pcrel_hi20) {
*DFOut = DF;
@@ -664,7 +705,7 @@ std::optional<bool> RISCVAsmBackend::evaluateFixup(const MCFragment &,
const MCFixup *AUIPCFixup;
const MCFragment *AUIPCDF;
MCValue AUIPCTarget;
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
default:
// Use default handling for `Value` and `IsResolved`.
return {};
@@ -703,14 +744,14 @@ std::optional<bool> RISCVAsmBackend::evaluateFixup(const MCFragment &,
Value = Asm->getSymbolOffset(SA) + AUIPCTarget.getConstant();
Value -= Asm->getFragmentOffset(*AUIPCDF) + AUIPCFixup->getOffset();
- return AUIPCFixup->getTargetKind() == RISCV::fixup_riscv_pcrel_hi20 &&
+ return AUIPCFixup->getKind() == RISCV::fixup_riscv_pcrel_hi20 &&
isPCRelFixupResolved(AUIPCTarget.getAddSym(), *AUIPCDF);
}
void RISCVAsmBackend::maybeAddVendorReloc(const MCFragment &F,
const MCFixup &Fixup) {
StringRef VendorIdentifier;
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
default:
// No Vendor Relocation Required.
return;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
index 1f1a6f5fe31a..8c10fbec3c8f 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
@@ -60,7 +60,8 @@ public:
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override;
- bool fixupNeedsRelaxationAdvanced(const MCFixup &, const MCValue &, uint64_t,
+ bool fixupNeedsRelaxationAdvanced(const MCFragment &, const MCFixup &,
+ const MCValue &, uint64_t,
bool) const override;
std::optional<MCFixupKind> getFixupKind(StringRef Name) const override;
@@ -72,11 +73,9 @@ public:
void relaxInstruction(MCInst &Inst,
const MCSubtargetInfo &STI) const override;
- bool relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF,
- bool &WasRelaxed) const override;
- bool relaxDwarfCFA(MCDwarfCallFrameFragment &DF,
- bool &WasRelaxed) const override;
- std::pair<bool, bool> relaxLEB128(MCLEBFragment &LF,
+ bool relaxDwarfLineAddr(MCFragment &F, bool &WasRelaxed) const override;
+ bool relaxDwarfCFA(MCFragment &F, bool &WasRelaxed) const override;
+ std::pair<bool, bool> relaxLEB128(MCFragment &LF,
int64_t &Value) const override;
bool writeNopData(raw_ostream &OS, uint64_t Count,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index f41ad419db1a..7ad5d5f3118b 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -339,7 +339,6 @@ enum OperandType : unsigned {
OPERAND_SIMM6,
OPERAND_SIMM6_NONZERO,
OPERAND_SIMM10,
- OPERAND_SIMM10_UNSIGNED,
OPERAND_SIMM10_LSB0000_NONZERO,
OPERAND_SIMM11,
OPERAND_SIMM12,
@@ -495,6 +494,17 @@ inline static bool isValidRoundingMode(unsigned Mode) {
}
} // namespace RISCVVXRndMode
+namespace RISCVExceptFlags {
+enum ExceptionFlag {
+ NX = 0x01, // Inexact
+ UF = 0x02, // Underflow
+ OF = 0x04, // Overflow
+ DZ = 0x08, // Divide by zero
+ NV = 0x10, // Invalid operation
+ ALL = 0x1F // Mask for all accrued exception flags
+};
+}
+
//===----------------------------------------------------------------------===//
// Floating-point Immediates
//
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
index 8ab2c56ae317..9bf7896e1f1e 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
@@ -48,7 +48,7 @@ RISCVELFObjectWriter::~RISCVELFObjectWriter() = default;
unsigned RISCVELFObjectWriter::getRelocType(const MCFixup &Fixup,
const MCValue &Target,
bool IsPCRel) const {
- unsigned Kind = Fixup.getTargetKind();
+ auto Kind = Fixup.getKind();
auto Spec = Target.getSpecifier();
switch (Spec) {
case ELF::R_RISCV_TPREL_HI20:
@@ -135,6 +135,9 @@ unsigned RISCVELFObjectWriter::getRelocType(const MCFixup &Fixup,
return ELF::R_RISCV_LO12_I;
case RISCV::fixup_riscv_lo12_s:
return ELF::R_RISCV_LO12_S;
+ case RISCV::fixup_riscv_rvc_imm:
+ reportError(Fixup.getLoc(), "No relocation for CI-type instructions");
+ return ELF::R_RISCV_NONE;
case RISCV::fixup_riscv_qc_e_32:
return ELF::R_RISCV_QC_E_32;
case RISCV::fixup_riscv_qc_abs20_u:
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
index c1cdf511fae5..f816561ccf3f 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
@@ -40,12 +40,16 @@ enum Fixups {
fixup_riscv_rvc_jump,
// 8-bit fixup for symbol references in the compressed branch instruction
fixup_riscv_rvc_branch,
+ // 6-bit fixup for symbol references in instructions like c.li
+ fixup_riscv_rvc_imm,
// Fixup representing a legacy no-pic function call attached to the auipc
// instruction in a pair composed of adjacent auipc+jalr instructions.
fixup_riscv_call,
// Fixup representing a function call attached to the auipc instruction in a
// pair composed of adjacent auipc+jalr instructions.
fixup_riscv_call_plt,
+
+ // Qualcomm specific fixups
// 12-bit fixup for symbol references in the 48-bit Xqcibi branch immediate
// instructions
fixup_riscv_qc_e_branch,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 2ed7cd9f008a..cbeabdddb937 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -650,6 +650,8 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
FixupKind = RISCV::fixup_riscv_rvc_jump;
} else if (MIFrm == RISCVII::InstFormatCB) {
FixupKind = RISCV::fixup_riscv_rvc_branch;
+ } else if (MIFrm == RISCVII::InstFormatCI) {
+ FixupKind = RISCV::fixup_riscv_rvc_imm;
} else if (MIFrm == RISCVII::InstFormatI) {
FixupKind = RISCV::fixup_riscv_12_i;
} else if (MIFrm == RISCVII::InstFormatQC_EB) {
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
index f66c2d5f99cb..61ecfb278a7d 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -30,6 +30,7 @@
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include <bitset>
#define GET_INSTRINFO_MC_DESC
@@ -305,6 +306,47 @@ public:
}
}
+ /// Returns (PLT virtual address, GOT virtual address) pairs for PLT entries.
+ std::vector<std::pair<uint64_t, uint64_t>>
+ findPltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents,
+ const MCSubtargetInfo &STI) const override {
+ uint32_t LoadInsnOpCode;
+ if (const Triple &T = STI.getTargetTriple(); T.isRISCV64())
+ LoadInsnOpCode = 0x3003; // ld
+ else if (T.isRISCV32())
+ LoadInsnOpCode = 0x2003; // lw
+ else
+ return {};
+
+ constexpr uint64_t FirstEntryAt = 32, EntrySize = 16;
+ if (PltContents.size() < FirstEntryAt + EntrySize)
+ return {};
+
+ std::vector<std::pair<uint64_t, uint64_t>> Results;
+ for (uint64_t EntryStart = FirstEntryAt,
+ EntryStartEnd = PltContents.size() - EntrySize;
+ EntryStart <= EntryStartEnd; EntryStart += EntrySize) {
+ const uint32_t AuipcInsn =
+ support::endian::read32le(PltContents.data() + EntryStart);
+ const bool IsAuipc = (AuipcInsn & 0x7F) == 0x17;
+ if (!IsAuipc)
+ continue;
+
+ const uint32_t LoadInsn =
+ support::endian::read32le(PltContents.data() + EntryStart + 4);
+ const bool IsLoad = (LoadInsn & 0x707F) == LoadInsnOpCode;
+ if (!IsLoad)
+ continue;
+
+ const uint64_t GotPltSlotVA = PltSectionVA + EntryStart +
+ (AuipcInsn & 0xFFFFF000) +
+ SignExtend64<12>(LoadInsn >> 20);
+ Results.emplace_back(PltSectionVA + EntryStart, GotPltSlotVA);
+ }
+
+ return Results;
+ }
+
private:
static bool maybeReturnAddress(MCRegister Reg) {
// X1 is used for normal returns, X5 for returns from outlined functions.
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index bf58226e0bd3..f9c0b54be7a2 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -179,7 +179,6 @@ def FeatureStdExtZicfiss
def HasStdExtZicfiss : Predicate<"Subtarget->hasStdExtZicfiss()">,
AssemblerPredicate<(all_of FeatureStdExtZicfiss),
"'Zicfiss' (Shadow stack)">;
-def NoHasStdExtZicfiss : Predicate<"!Subtarget->hasStdExtZicfiss()">;
def FeatureStdExtZilsd
: RISCVExtension<1, 0,
@@ -188,7 +187,6 @@ def FeatureStdExtZilsd
def HasStdExtZilsd : Predicate<"Subtarget->hasStdExtZilsd()">,
AssemblerPredicate<(all_of FeatureStdExtZilsd),
"'Zilsd' (Load/Store pair instructions)">;
-def NoHasStdExtZilsd : Predicate<"!Subtarget->hasStdExtZilsd()">;
// Multiply Extensions
@@ -1487,6 +1485,11 @@ def HasVendorXqcics
: Predicate<"Subtarget->hasVendorXqcics()">,
AssemblerPredicate<(all_of FeatureVendorXqcics),
"'Xqcics' (Qualcomm uC Conditional Select Extension)">;
+def NoVendorXqcics
+ : Predicate<"!Subtarget->hasVendorXqcics()">;
+
+def HasVendorXqcicsOrXqcicm
+ : Predicate<"Subtarget->hasVendorXqcics() || Subtarget->hasVendorXqcicm()">;
def FeatureVendorXqcicsr
: RISCVExperimentalExtension<0, 4, "Qualcomm uC CSR Extension">;
@@ -1599,6 +1602,14 @@ def HasVendorXAndesPerf
AssemblerPredicate<(all_of FeatureVendorXAndesPerf),
"'XAndesPerf' (Andes Performance Extension)">;
+def FeatureVendorXAndesBFHCvt
+ : RISCVExtension<5, 0, "Andes Scalar BFLOAT16 Conversion Extension",
+ [FeatureStdExtF]>;
+def HasVendorXAndesBFHCvt
+ : Predicate<"Subtarget->hasVendorXAndesBFHCvt()">,
+ AssemblerPredicate<(all_of FeatureVendorXAndesBFHCvt),
+ "'XAndesBFHCvt' (Andes Scalar BFLOAT16 Conversion Extension)">;
+
def FeatureVendorXAndesVBFHCvt
: RISCVExtension<5, 0, "Andes Vector BFLOAT16 Conversion Extension",
[FeatureStdExtZve32f]>;
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index a796c910bd44..23b455434900 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -95,6 +95,11 @@ static const std::pair<MCPhysReg, int8_t> FixedCSRFIQCIInterruptMap[] = {
/* -21, -22, -23, -24 are reserved */
};
+/// Returns true if DWARF CFI instructions ("frame moves") should be emitted.
+static bool needsDwarfCFI(const MachineFunction &MF) {
+ return MF.needsFrameMoves();
+}
+
// For now we use x3, a.k.a gp, as pointer to shadow call stack.
// User should not use x3 in their asm.
static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB,
@@ -141,6 +146,9 @@ static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB,
.addImm(-SlotSize)
.setMIFlag(MachineInstr::FrameSetup);
+ if (!needsDwarfCFI(MF))
+ return;
+
// Emit a CFI instruction that causes SlotSize to be subtracted from the value
// of the shadow stack pointer when unwinding past this frame.
char DwarfSCSReg = TRI->getDwarfRegNum(SCSPReg, /*IsEH*/ true);
@@ -199,8 +207,10 @@ static void emitSCSEpilogue(MachineFunction &MF, MachineBasicBlock &MBB,
.addReg(SCSPReg)
.addImm(-SlotSize)
.setMIFlag(MachineInstr::FrameDestroy);
- // Restore the SCS pointer
- CFIInstBuilder(MBB, MI, MachineInstr::FrameDestroy).buildRestore(SCSPReg);
+ if (needsDwarfCFI(MF)) {
+ // Restore the SCS pointer
+ CFIInstBuilder(MBB, MI, MachineInstr::FrameDestroy).buildRestore(SCSPReg);
+ }
}
// Insert instruction to swap mscratchsw with sp
@@ -738,7 +748,8 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
MachineFunction &MF, uint64_t Offset,
uint64_t RealStackSize, bool EmitCFI,
bool NeedProbe, uint64_t ProbeSize,
- bool DynAllocation) const {
+ bool DynAllocation,
+ MachineInstr::MIFlag Flag) const {
DebugLoc DL;
const RISCVRegisterInfo *RI = STI.getRegisterInfo();
const RISCVInstrInfo *TII = STI.getInstrInfo();
@@ -748,7 +759,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
// Simply allocate the stack if it's not big enough to require a probe.
if (!NeedProbe || Offset <= ProbeSize) {
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(-Offset),
- MachineInstr::FrameSetup, getStackAlign());
+ Flag, getStackAlign());
if (EmitCFI)
CFIBuilder.buildDefCFAOffset(RealStackSize);
@@ -759,7 +770,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
.addReg(RISCV::X0)
.addReg(SPReg)
.addImm(0)
- .setMIFlags(MachineInstr::FrameSetup);
+ .setMIFlags(Flag);
}
return;
@@ -770,14 +781,13 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
uint64_t CurrentOffset = 0;
while (CurrentOffset + ProbeSize <= Offset) {
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
- StackOffset::getFixed(-ProbeSize), MachineInstr::FrameSetup,
- getStackAlign());
+ StackOffset::getFixed(-ProbeSize), Flag, getStackAlign());
// s[d|w] zero, 0(sp)
BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
.addReg(RISCV::X0)
.addReg(SPReg)
.addImm(0)
- .setMIFlags(MachineInstr::FrameSetup);
+ .setMIFlags(Flag);
CurrentOffset += ProbeSize;
if (EmitCFI)
@@ -787,8 +797,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
uint64_t Residual = Offset - CurrentOffset;
if (Residual) {
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
- StackOffset::getFixed(-Residual), MachineInstr::FrameSetup,
- getStackAlign());
+ StackOffset::getFixed(-Residual), Flag, getStackAlign());
if (EmitCFI)
CFIBuilder.buildDefCFAOffset(Offset);
@@ -798,7 +807,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
.addReg(RISCV::X0)
.addReg(SPReg)
.addImm(0)
- .setMIFlags(MachineInstr::FrameSetup);
+ .setMIFlags(Flag);
}
}
@@ -812,8 +821,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
Register TargetReg = RISCV::X6;
// SUB TargetReg, SP, RoundedSize
RI->adjustReg(MBB, MBBI, DL, TargetReg, SPReg,
- StackOffset::getFixed(-RoundedSize), MachineInstr::FrameSetup,
- getStackAlign());
+ StackOffset::getFixed(-RoundedSize), Flag, getStackAlign());
if (EmitCFI) {
// Set the CFA register to TargetReg.
@@ -830,14 +838,14 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
if (Residual) {
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(-Residual),
- MachineInstr::FrameSetup, getStackAlign());
+ Flag, getStackAlign());
if (DynAllocation) {
// s[d|w] zero, 0(sp)
BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
.addReg(RISCV::X0)
.addReg(SPReg)
.addImm(0)
- .setMIFlags(MachineInstr::FrameSetup);
+ .setMIFlags(Flag);
}
}
@@ -937,6 +945,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
MBBI = std::prev(MBBI, getRVVCalleeSavedInfo(MF, CSI).size() +
getUnmanagedCSI(MF, CSI).size());
CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
+ bool NeedsDwarfCFI = needsDwarfCFI(MF);
// If libcalls are used to spill and restore callee-saved registers, the frame
// has two sections; the opaque section managed by the libcalls, and the
@@ -964,10 +973,12 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
alignTo((STI.getXLen() / 8) * LibCallRegs, getStackAlign());
RVFI->setLibCallStackSize(LibCallFrameSize);
- CFIBuilder.buildDefCFAOffset(LibCallFrameSize);
- for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI))
- CFIBuilder.buildOffset(CS.getReg(),
- MFI.getObjectOffset(CS.getFrameIdx()));
+ if (NeedsDwarfCFI) {
+ CFIBuilder.buildDefCFAOffset(LibCallFrameSize);
+ for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI))
+ CFIBuilder.buildOffset(CS.getReg(),
+ MFI.getObjectOffset(CS.getFrameIdx()));
+ }
}
// FIXME (note copied from Lanai): This appears to be overallocating. Needs
@@ -998,14 +1009,17 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
// could only be the next instruction.
++PossiblePush;
- // Insert the CFI metadata before where we think the `(QC.)CM.PUSH(FP)`
- // could be. The PUSH will also get its own CFI metadata for its own
- // modifications, which should come after the PUSH.
- CFIInstBuilder PushCFIBuilder(MBB, PossiblePush, MachineInstr::FrameSetup);
- PushCFIBuilder.buildDefCFAOffset(QCIInterruptPushAmount);
- for (const CalleeSavedInfo &CS : getQCISavedInfo(MF, CSI))
- PushCFIBuilder.buildOffset(CS.getReg(),
- MFI.getObjectOffset(CS.getFrameIdx()));
+ if (NeedsDwarfCFI) {
+ // Insert the CFI metadata before where we think the `(QC.)CM.PUSH(FP)`
+ // could be. The PUSH will also get its own CFI metadata for its own
+ // modifications, which should come after the PUSH.
+ CFIInstBuilder PushCFIBuilder(MBB, PossiblePush,
+ MachineInstr::FrameSetup);
+ PushCFIBuilder.buildDefCFAOffset(QCIInterruptPushAmount);
+ for (const CalleeSavedInfo &CS : getQCISavedInfo(MF, CSI))
+ PushCFIBuilder.buildOffset(CS.getReg(),
+ MFI.getObjectOffset(CS.getFrameIdx()));
+ }
}
if (RVFI->isPushable(MF) && PossiblePush != MBB.end() &&
@@ -1019,10 +1033,12 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
PossiblePush->getOperand(1).setImm(StackAdj);
StackSize -= StackAdj;
- CFIBuilder.buildDefCFAOffset(RealStackSize - StackSize);
- for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI))
- CFIBuilder.buildOffset(CS.getReg(),
- MFI.getObjectOffset(CS.getFrameIdx()));
+ if (NeedsDwarfCFI) {
+ CFIBuilder.buildDefCFAOffset(RealStackSize - StackSize);
+ for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI))
+ CFIBuilder.buildOffset(CS.getReg(),
+ MFI.getObjectOffset(CS.getFrameIdx()));
+ }
}
// Allocate space on the stack if necessary.
@@ -1033,8 +1049,9 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
bool DynAllocation =
MF.getInfo<RISCVMachineFunctionInfo>()->hasDynamicAllocation();
if (StackSize != 0)
- allocateStack(MBB, MBBI, MF, StackSize, RealStackSize, /*EmitCFI=*/true,
- NeedProbe, ProbeSize, DynAllocation);
+ allocateStack(MBB, MBBI, MF, StackSize, RealStackSize, NeedsDwarfCFI,
+ NeedProbe, ProbeSize, DynAllocation,
+ MachineInstr::FrameSetup);
// Save SiFive CLIC CSRs into Stack
emitSiFiveCLICPreemptibleSaves(MF, MBB, MBBI, DL);
@@ -1050,8 +1067,10 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
// Iterate over list of callee-saved registers and emit .cfi_offset
// directives.
- for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI))
- CFIBuilder.buildOffset(CS.getReg(), MFI.getObjectOffset(CS.getFrameIdx()));
+ if (NeedsDwarfCFI)
+ for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI))
+ CFIBuilder.buildOffset(CS.getReg(),
+ MFI.getObjectOffset(CS.getFrameIdx()));
// Generate new FP.
if (hasFP(MF)) {
@@ -1070,7 +1089,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
MachineInstr::FrameSetup, getStackAlign());
}
- CFIBuilder.buildDefCFA(FPReg, RVFI->getVarArgsSaveSize());
+ if (NeedsDwarfCFI)
+ CFIBuilder.buildDefCFA(FPReg, RVFI->getVarArgsSaveSize());
}
uint64_t SecondSPAdjustAmount = 0;
@@ -1081,15 +1101,16 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
"SecondSPAdjustAmount should be greater than zero");
allocateStack(MBB, MBBI, MF, SecondSPAdjustAmount,
- getStackSizeWithRVVPadding(MF), !hasFP(MF), NeedProbe,
- ProbeSize, DynAllocation);
+ getStackSizeWithRVVPadding(MF), NeedsDwarfCFI && !hasFP(MF),
+ NeedProbe, ProbeSize, DynAllocation,
+ MachineInstr::FrameSetup);
}
if (RVVStackSize) {
if (NeedProbe) {
allocateAndProbeStackForRVV(MF, MBB, MBBI, DL, RVVStackSize,
- MachineInstr::FrameSetup, !hasFP(MF),
- DynAllocation);
+ MachineInstr::FrameSetup,
+ NeedsDwarfCFI && !hasFP(MF), DynAllocation);
} else {
// We must keep the stack pointer aligned through any intermediate
// updates.
@@ -1098,14 +1119,15 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
MachineInstr::FrameSetup, getStackAlign());
}
- if (!hasFP(MF)) {
+ if (NeedsDwarfCFI && !hasFP(MF)) {
// Emit .cfi_def_cfa_expression "sp + StackSize + RVVStackSize * vlenb".
CFIBuilder.insertCFIInst(createDefCFAExpression(
*RI, SPReg, getStackSizeWithRVVPadding(MF), RVVStackSize / 8));
}
std::advance(MBBI, getRVVCalleeSavedInfo(MF, CSI).size());
- emitCalleeSavedRVVPrologCFI(MBB, MBBI, hasFP(MF));
+ if (NeedsDwarfCFI)
+ emitCalleeSavedRVVPrologCFI(MBB, MBBI, hasFP(MF));
}
if (hasFP(MF)) {
@@ -1172,8 +1194,9 @@ void RISCVFrameLowering::deallocateStack(MachineFunction &MF,
MachineInstr::FrameDestroy, getStackAlign());
StackSize = 0;
- CFIInstBuilder(MBB, MBBI, MachineInstr::FrameDestroy)
- .buildDefCFAOffset(CFAOffset);
+ if (needsDwarfCFI(MF))
+ CFIInstBuilder(MBB, MBBI, MachineInstr::FrameDestroy)
+ .buildDefCFAOffset(CFAOffset);
}
void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -1213,6 +1236,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
std::next(MBBI, getRVVCalleeSavedInfo(MF, CSI).size());
CFIInstBuilder CFIBuilder(MBB, FirstScalarCSRRestoreInsn,
MachineInstr::FrameDestroy);
+ bool NeedsDwarfCFI = needsDwarfCFI(MF);
uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
uint64_t RealStackSize = FirstSPAdjustAmount ? FirstSPAdjustAmount
@@ -1233,10 +1257,11 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
StackOffset::getScalable(RVVStackSize),
MachineInstr::FrameDestroy, getStackAlign());
- if (!hasFP(MF))
- CFIBuilder.buildDefCFA(SPReg, RealStackSize);
-
- emitCalleeSavedRVVEpilogCFI(MBB, FirstScalarCSRRestoreInsn);
+ if (NeedsDwarfCFI) {
+ if (!hasFP(MF))
+ CFIBuilder.buildDefCFA(SPReg, RealStackSize);
+ emitCalleeSavedRVVEpilogCFI(MBB, FirstScalarCSRRestoreInsn);
+ }
}
if (FirstSPAdjustAmount) {
@@ -1252,7 +1277,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
StackOffset::getFixed(SecondSPAdjustAmount),
MachineInstr::FrameDestroy, getStackAlign());
- if (!hasFP(MF))
+ if (NeedsDwarfCFI && !hasFP(MF))
CFIBuilder.buildDefCFAOffset(FirstSPAdjustAmount);
}
@@ -1273,7 +1298,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
getStackAlign());
}
- if (hasFP(MF))
+ if (NeedsDwarfCFI && hasFP(MF))
CFIBuilder.buildDefCFA(SPReg, RealStackSize);
// Skip to after the restores of scalar callee-saved registers
@@ -1296,8 +1321,9 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
}
// Recover callee-saved registers.
- for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI))
- CFIBuilder.buildRestore(CS.getReg());
+ if (NeedsDwarfCFI)
+ for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI))
+ CFIBuilder.buildRestore(CS.getReg());
if (RVFI->isPushable(MF) && MBBI != MBB.end() && isPop(MBBI->getOpcode())) {
// Use available stack adjustment in pop instruction to deallocate stack
@@ -1316,15 +1342,17 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
auto NextI = next_nodbg(MBBI, MBB.end());
if (NextI == MBB.end() || NextI->getOpcode() != RISCV::PseudoRET) {
++MBBI;
- CFIBuilder.setInsertPoint(MBBI);
+ if (NeedsDwarfCFI) {
+ CFIBuilder.setInsertPoint(MBBI);
- for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI))
- CFIBuilder.buildRestore(CS.getReg());
+ for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI))
+ CFIBuilder.buildRestore(CS.getReg());
- // Update CFA Offset. If this is a QCI interrupt function, there will be a
- // leftover offset which is deallocated by `QC.C.MILEAVERET`, otherwise
- // getQCIInterruptStackSize() will be 0.
- CFIBuilder.buildDefCFAOffset(RVFI->getQCIInterruptStackSize());
+ // Update CFA Offset. If this is a QCI interrupt function, there will
+ // be a leftover offset which is deallocated by `QC.C.MILEAVERET`,
+ // otherwise getQCIInterruptStackSize() will be 0.
+ CFIBuilder.buildDefCFAOffset(RVFI->getQCIInterruptStackSize());
+ }
}
}
@@ -1813,8 +1841,10 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr(
// allocateStack.
bool DynAllocation =
MF.getInfo<RISCVMachineFunctionInfo>()->hasDynamicAllocation();
- allocateStack(MBB, MI, MF, -Amount, -Amount, !hasFP(MF),
- /*NeedProbe=*/true, ProbeSize, DynAllocation);
+ allocateStack(MBB, MI, MF, -Amount, -Amount,
+ needsDwarfCFI(MF) && !hasFP(MF),
+ /*NeedProbe=*/true, ProbeSize, DynAllocation,
+ MachineInstr::NoFlags);
} else {
const RISCVRegisterInfo &RI = *STI.getRegisterInfo();
RI.adjustReg(MBB, MI, DL, SPReg, SPReg, StackOffset::getFixed(Amount),
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
index d013755ce58a..6af63a4885f3 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -81,7 +81,8 @@ public:
void allocateStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
MachineFunction &MF, uint64_t Offset,
uint64_t RealStackSize, bool EmitCFI, bool NeedProbe,
- uint64_t ProbeSize, bool DynAllocation) const;
+ uint64_t ProbeSize, bool DynAllocation,
+ MachineInstr::MIFlag Flag) const;
protected:
const RISCVSubtarget &STI;
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index c97b14a254cd..cfec46d23d65 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -689,10 +689,16 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInMask(SDNode *Node) {
if (!isShiftedMask_32(C1) || isInt<12>(C1))
return false;
+ // INSBI will clobber the input register in N0. Bail out if we need a copy to
+ // preserve this value.
+ SDValue N0 = Node->getOperand(0);
+ if (!N0.hasOneUse())
+ return false;
+
// If C1 is a shifted mask (but can't be formed as an ORI),
// use a bitfield insert of -1.
// Transform (or x, C1)
- // -> (qc.insbi x, width, shift)
+ // -> (qc.insbi x, -1, width, shift)
const unsigned Leading = llvm::countl_zero((uint32_t)C1);
const unsigned Trailing = llvm::countr_zero((uint32_t)C1);
const unsigned Width = 32 - Leading - Trailing;
@@ -705,7 +711,7 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInMask(SDNode *Node) {
SDLoc DL(Node);
MVT VT = Node->getSimpleValueType(0);
- SDValue Ops[] = {CurDAG->getSignedTargetConstant(-1, DL, VT),
+ SDValue Ops[] = {N0, CurDAG->getSignedTargetConstant(-1, DL, VT),
CurDAG->getTargetConstant(Width, DL, VT),
CurDAG->getTargetConstant(Trailing, DL, VT)};
SDNode *BitIns = CurDAG->getMachineNode(RISCV::QC_INSBI, DL, VT, Ops);
@@ -2842,56 +2848,6 @@ static bool isWorthFoldingAdd(SDValue Add) {
return true;
}
-bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr,
- unsigned MaxShiftAmount,
- SDValue &Base, SDValue &Index,
- SDValue &Scale) {
- EVT VT = Addr.getSimpleValueType();
- auto UnwrapShl = [this, VT, MaxShiftAmount](SDValue N, SDValue &Index,
- SDValue &Shift) {
- uint64_t ShiftAmt = 0;
- Index = N;
-
- if (N.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N.getOperand(1))) {
- // Only match shifts by a value in range [0, MaxShiftAmount].
- if (N.getConstantOperandVal(1) <= MaxShiftAmount) {
- Index = N.getOperand(0);
- ShiftAmt = N.getConstantOperandVal(1);
- }
- }
-
- Shift = CurDAG->getTargetConstant(ShiftAmt, SDLoc(N), VT);
- return ShiftAmt != 0;
- };
-
- if (Addr.getOpcode() == ISD::ADD) {
- if (auto *C1 = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
- SDValue AddrB = Addr.getOperand(0);
- if (AddrB.getOpcode() == ISD::ADD &&
- UnwrapShl(AddrB.getOperand(0), Index, Scale) &&
- !isa<ConstantSDNode>(AddrB.getOperand(1)) &&
- isInt<12>(C1->getSExtValue())) {
- // (add (add (shl A C2) B) C1) -> (add (add B C1) (shl A C2))
- SDValue C1Val =
- CurDAG->getTargetConstant(C1->getZExtValue(), SDLoc(Addr), VT);
- Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT,
- AddrB.getOperand(1), C1Val),
- 0);
- return true;
- }
- } else if (UnwrapShl(Addr.getOperand(0), Index, Scale)) {
- Base = Addr.getOperand(1);
- return true;
- } else {
- UnwrapShl(Addr.getOperand(1), Index, Scale);
- Base = Addr.getOperand(0);
- return true;
- }
- }
-
- return false;
-}
-
bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
SDValue &Offset) {
if (SelectAddrFrameIndex(Addr, Base, Offset))
@@ -2908,7 +2864,7 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
if (CurDAG->isBaseWithConstantOffset(Addr)) {
int64_t CVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
- if (isInt<12>(CVal) && isInt<12>(CVal)) {
+ if (isInt<12>(CVal)) {
Base = Addr.getOperand(0);
if (Base.getOpcode() == RISCVISD::ADD_LO) {
SDValue LoOperand = Base.getOperand(1);
@@ -2942,8 +2898,7 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
// Handle ADD with large immediates.
if (Addr.getOpcode() == ISD::ADD && isa<ConstantSDNode>(Addr.getOperand(1))) {
int64_t CVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
- assert(!(isInt<12>(CVal) && isInt<12>(CVal)) &&
- "simm12 not already handled?");
+ assert(!isInt<12>(CVal) && "simm12 not already handled?");
// Handle immediates in the range [-4096,-2049] or [2048, 4094]. We can use
// an ADDI for part of the offset and fold the rest into the load/store.
@@ -2984,12 +2939,11 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
return true;
}
-/// Similar to SelectAddrRegImm, except that the offset restricted for
-/// unsinged nine bits.
+/// Similar to SelectAddrRegImm, except that the offset is restricted to uimm9.
bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base,
SDValue &Offset) {
- if (SelectAddrFrameIndex(Addr, Base, Offset))
- return true;
+ // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only
+ // a 9-bit immediate can be folded.
SDLoc DL(Addr);
MVT VT = Addr.getSimpleValueType();
@@ -2999,8 +2953,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base,
if (isUInt<9>(CVal)) {
Base = Addr.getOperand(0);
- if (auto *FIN = dyn_cast<FrameIndexSDNode>(Base))
- Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT);
+ // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only
+ // a 9-bit immediate can be folded.
Offset = CurDAG->getSignedTargetConstant(CVal, DL, VT);
return true;
}
@@ -3078,6 +3032,80 @@ bool RISCVDAGToDAGISel::SelectAddrRegImmLsb00000(SDValue Addr, SDValue &Base,
return true;
}
+/// Try to split \p Addr into a base register plus an index register that is
+/// shifted left by a constant of at most \p MaxShiftAmount.
+///
+/// Only ISD::ADD addresses are matched. On success \p Base, \p Index and
+/// \p Scale (the shift amount, as a target constant) are filled in and true
+/// is returned. An ADD whose RHS is not a constant and that contains no
+/// qualifying shift still matches, with a scale of 0.
+bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr,
+ unsigned MaxShiftAmount,
+ SDValue &Base, SDValue &Index,
+ SDValue &Scale) {
+ if (Addr.getOpcode() != ISD::ADD)
+ return false;
+ SDValue LHS = Addr.getOperand(0);
+ SDValue RHS = Addr.getOperand(1);
+
+ EVT VT = Addr.getSimpleValueType();
+ // Match (shl X, C) with a constant C <= MaxShiftAmount. On success Index is
+ // set to X and Shift to C; on failure neither output is written.
+ auto SelectShl = [this, VT, MaxShiftAmount](SDValue N, SDValue &Index,
+ SDValue &Shift) {
+ if (N.getOpcode() != ISD::SHL || !isa<ConstantSDNode>(N.getOperand(1)))
+ return false;
+
+ // Only match shifts by a value in range [0, MaxShiftAmount].
+ unsigned ShiftAmt = N.getConstantOperandVal(1);
+ if (ShiftAmt > MaxShiftAmount)
+ return false;
+
+ Index = N.getOperand(0);
+ Shift = CurDAG->getTargetConstant(ShiftAmt, SDLoc(N), VT);
+ return true;
+ };
+
+ // A constant RHS is only folded when the LHS is itself an ADD whose operand
+ // 1 is not a constant and C1 fits in a signed 12-bit immediate. C1 is then
+ // added to the non-shifted operand with an ADDI, which becomes the base.
+ if (auto *C1 = dyn_cast<ConstantSDNode>(RHS)) {
+ // (add (add (shl A C2) B) C1) -> (add (add B C1) (shl A C2))
+ if (LHS.getOpcode() == ISD::ADD &&
+ !isa<ConstantSDNode>(LHS.getOperand(1)) &&
+ isInt<12>(C1->getSExtValue())) {
+ if (SelectShl(LHS.getOperand(1), Index, Scale)) {
+ SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(),
+ SDLoc(Addr), VT);
+ Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT,
+ LHS.getOperand(0), C1Val),
+ 0);
+ return true;
+ }
+
+ // Add is commutative so we need to check both operands.
+ if (SelectShl(LHS.getOperand(0), Index, Scale)) {
+ SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(),
+ SDLoc(Addr), VT);
+ Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT,
+ LHS.getOperand(1), C1Val),
+ 0);
+ return true;
+ }
+ }
+
+ // Don't match add with constants.
+ // FIXME: Is this profitable for large constants that have 0s in the lower
+ // 12 bits that we can materialize with LUI?
+ return false;
+ }
+
+ // Try to match a shift on the RHS.
+ if (SelectShl(RHS, Index, Scale)) {
+ Base = LHS;
+ return true;
+ }
+
+ // Try to match a shift on the LHS.
+ if (SelectShl(LHS, Index, Scale)) {
+ Base = RHS;
+ return true;
+ }
+
+ // No shift on either side: use the two ADD operands directly with scale 0.
+ Base = LHS;
+ Index = RHS;
+ Scale = CurDAG->getTargetConstant(0, SDLoc(Addr), VT);
+ return true;
+}
+
bool RISCVDAGToDAGISel::SelectAddrRegReg(SDValue Addr, SDValue &Base,
SDValue &Offset) {
if (Addr.getOpcode() != ISD::ADD)
@@ -3776,21 +3804,18 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits,
// Select a constant that can be represented as (sign_extend(imm5) << imm2).
bool RISCVDAGToDAGISel::selectSimm5Shl2(SDValue N, SDValue &Simm5,
SDValue &Shl2) {
- if (auto *C = dyn_cast<ConstantSDNode>(N)) {
- int64_t Offset = C->getSExtValue();
- unsigned Shift;
- for (Shift = 0; Shift < 4; Shift++)
- if (isInt<5>(Offset >> Shift) && ((Offset % (1LL << Shift)) == 0))
- break;
-
- // Constant cannot be encoded.
- if (Shift == 4)
- return false;
+ auto *C = dyn_cast<ConstantSDNode>(N);
+ if (!C)
+ return false;
- EVT Ty = N->getValueType(0);
- Simm5 = CurDAG->getSignedTargetConstant(Offset >> Shift, SDLoc(N), Ty);
- Shl2 = CurDAG->getTargetConstant(Shift, SDLoc(N), Ty);
- return true;
+ int64_t Offset = C->getSExtValue();
+ for (unsigned Shift = 0; Shift < 4; Shift++) {
+ if (isInt<5>(Offset >> Shift) && ((Offset % (1LL << Shift)) == 0)) {
+ EVT VT = N->getValueType(0);
+ Simm5 = CurDAG->getSignedTargetConstant(Offset >> Shift, SDLoc(N), VT);
+ Shl2 = CurDAG->getTargetConstant(Shift, SDLoc(N), VT);
+ return true;
+ }
}
return false;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 7c72d074a35b..4845a9c84e01 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -39,7 +39,6 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsRISCV.h"
-#include "llvm/IR/PatternMatch.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/Support/CommandLine.h"
@@ -129,7 +128,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (Subtarget.hasStdExtZfhmin())
addRegisterClass(MVT::f16, &RISCV::FPR16RegClass);
- if (Subtarget.hasStdExtZfbfmin())
+ if (Subtarget.hasStdExtZfbfmin() || Subtarget.hasVendorXAndesBFHCvt())
addRegisterClass(MVT::bf16, &RISCV::FPR16RegClass);
if (Subtarget.hasStdExtF())
addRegisterClass(MVT::f32, &RISCV::FPR32RegClass);
@@ -656,6 +655,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::GET_FPENV, XLenVT, Custom);
setOperationAction(ISD::SET_FPENV, XLenVT, Custom);
setOperationAction(ISD::RESET_FPENV, MVT::Other, Custom);
+ setOperationAction(ISD::GET_FPMODE, XLenVT, Custom);
+ setOperationAction(ISD::SET_FPMODE, XLenVT, Custom);
+ setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);
}
setOperationAction({ISD::GlobalAddress, ISD::BlockAddress, ISD::ConstantPool,
@@ -8226,6 +8228,12 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerSET_FPENV(Op, DAG);
case ISD::RESET_FPENV:
return lowerRESET_FPENV(Op, DAG);
+ case ISD::GET_FPMODE:
+ return lowerGET_FPMODE(Op, DAG);
+ case ISD::SET_FPMODE:
+ return lowerSET_FPMODE(Op, DAG);
+ case ISD::RESET_FPMODE:
+ return lowerRESET_FPMODE(Op, DAG);
case ISD::EH_DWARF_CFA:
return lowerEH_DWARF_CFA(Op, DAG);
case ISD::VP_MERGE:
@@ -11969,7 +11977,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
// Store with unit-stride store and load it back with segmented load.
MVT XLenVT = Subtarget.getXLenVT();
- SDValue VL = getDefaultScalableVLOps(ConcatVT, DL, DAG, Subtarget).second;
+ auto [Mask, VL] = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget);
SDValue Passthru = DAG.getUNDEF(ConcatVT);
// Allocate a stack slot.
@@ -11990,16 +11998,20 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
MachineMemOperand::MOStore, LocationSize::beforeOrAfterPointer());
static const Intrinsic::ID VlsegIntrinsicsIds[] = {
- Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3, Intrinsic::riscv_vlseg4,
- Intrinsic::riscv_vlseg5, Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7,
- Intrinsic::riscv_vlseg8};
+ Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask,
+ Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask,
+ Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask,
+ Intrinsic::riscv_vlseg8_mask};
SDValue LoadOps[] = {
Chain,
DAG.getTargetConstant(VlsegIntrinsicsIds[Factor - 2], DL, XLenVT),
Passthru,
StackPtr,
+ Mask,
VL,
+ DAG.getTargetConstant(
+ RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC, DL, XLenVT),
DAG.getTargetConstant(Log2_64(VecVT.getScalarSizeInBits()), DL, XLenVT)};
unsigned Sz =
@@ -12051,7 +12063,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op,
}
MVT XLenVT = Subtarget.getXLenVT();
- SDValue VL = DAG.getRegister(RISCV::X0, XLenVT);
+ auto [Mask, VL] = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget);
// If the VT is larger than LMUL=8, we need to split and reassemble.
if ((VecVT.getSizeInBits().getKnownMinValue() * Factor) >
@@ -12100,10 +12112,10 @@ SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op,
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
static const Intrinsic::ID IntrIds[] = {
- Intrinsic::riscv_vsseg2, Intrinsic::riscv_vsseg3,
- Intrinsic::riscv_vsseg4, Intrinsic::riscv_vsseg5,
- Intrinsic::riscv_vsseg6, Intrinsic::riscv_vsseg7,
- Intrinsic::riscv_vsseg8,
+ Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
+ Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
+ Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask,
+ Intrinsic::riscv_vsseg8_mask,
};
unsigned Sz =
@@ -12119,6 +12131,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op,
DAG.getTargetConstant(IntrIds[Factor - 2], DL, XLenVT),
StoredVal,
StackPtr,
+ Mask,
VL,
DAG.getTargetConstant(Log2_64(VecVT.getScalarSizeInBits()),
DL, XLenVT)};
@@ -13998,6 +14011,52 @@ SDValue RISCVTargetLowering::lowerRESET_FPENV(SDValue Op,
EnvValue);
}
+// Masks used by lowerSET_FPMODE and lowerRESET_FPMODE: the bitwise complement
+// of RISCVExceptFlags::ALL, stored at 64-bit and 32-bit widths.
+// NOTE(review): presumably this selects the non-exception-flag bits of fcsr;
+// confirm against the definition of RISCVExceptFlags::ALL.
+// NOTE(review): if RISCVExceptFlags::ALL is narrower than 64 bits, the upper
+// bits of ModeMask64 depend on whether its promoted type is signed (sign
+// extension) or unsigned (zero extension) -- confirm which is intended.
+const uint64_t ModeMask64 = ~RISCVExceptFlags::ALL;
+const uint32_t ModeMask32 = ~RISCVExceptFlags::ALL;
+
+/// Custom lowering for ISD::GET_FPMODE: read fcsr with a READ_CSR node and
+/// return the value read together with the output chain.
+SDValue RISCVTargetLowering::lowerGET_FPMODE(SDValue Op,
+ SelectionDAG &DAG) const {
+ const MVT XLenVT = Subtarget.getXLenVT();
+ SDLoc DL(Op);
+ SDValue Chain = Op->getOperand(0);
+ SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::fcsr, DL, XLenVT);
+ SDVTList VTs = DAG.getVTList(XLenVT, MVT::Other);
+ // The value read from fcsr is returned as is; no mask is applied here.
+ SDValue Result = DAG.getNode(RISCVISD::READ_CSR, DL, VTs, Chain, SysRegNo);
+ Chain = Result.getValue(1);
+ return DAG.getMergeValues({Result, Chain}, DL);
+}
+
+/// Custom lowering for ISD::SET_FPMODE: replace the fcsr bits selected by the
+/// mode mask (ModeMask64 or ModeMask32, chosen by Subtarget.is64Bit()) with
+/// the corresponding bits of operand 1. Returns the chain of the final
+/// SET_CSR node.
+SDValue RISCVTargetLowering::lowerSET_FPMODE(SDValue Op,
+ SelectionDAG &DAG) const {
+ const MVT XLenVT = Subtarget.getXLenVT();
+ const uint64_t ModeMaskValue = Subtarget.is64Bit() ? ModeMask64 : ModeMask32;
+ SDLoc DL(Op);
+ SDValue Chain = Op->getOperand(0);
+ SDValue EnvValue = Op->getOperand(1);
+ SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::fcsr, DL, XLenVT);
+ SDValue ModeMask = DAG.getConstant(ModeMaskValue, DL, XLenVT);
+
+ // Widen the requested value to XLenVT and keep only the masked bits.
+ EnvValue = DAG.getNode(ISD::ZERO_EXTEND, DL, XLenVT, EnvValue);
+ EnvValue = DAG.getNode(ISD::AND, DL, XLenVT, EnvValue, ModeMask);
+ // Clear the masked bits in fcsr first, then set the requested ones; the two
+ // CSR nodes are ordered through the chain.
+ Chain = DAG.getNode(RISCVISD::CLEAR_CSR, DL, MVT::Other, Chain, SysRegNo,
+ ModeMask);
+ return DAG.getNode(RISCVISD::SET_CSR, DL, MVT::Other, Chain, SysRegNo,
+ EnvValue);
+}
+
+/// Custom lowering for ISD::RESET_FPMODE: clear the fcsr bits selected by the
+/// mode mask (ModeMask64 or ModeMask32, chosen by Subtarget.is64Bit()) with a
+/// single CLEAR_CSR node and return its chain.
+SDValue RISCVTargetLowering::lowerRESET_FPMODE(SDValue Op,
+ SelectionDAG &DAG) const {
+ const MVT XLenVT = Subtarget.getXLenVT();
+ const uint64_t ModeMaskValue = Subtarget.is64Bit() ? ModeMask64 : ModeMask32;
+ SDLoc DL(Op);
+ SDValue Chain = Op->getOperand(0);
+ SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::fcsr, DL, XLenVT);
+ SDValue ModeMask = DAG.getConstant(ModeMaskValue, DL, XLenVT);
+
+ return DAG.getNode(RISCVISD::CLEAR_CSR, DL, MVT::Other, Chain, SysRegNo,
+ ModeMask);
+}
+
SDValue RISCVTargetLowering::lowerEH_DWARF_CFA(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -15032,10 +15091,15 @@ static SDValue combineBinOpToReduce(SDNode *N, SelectionDAG &DAG,
// Optimize (add (shl x, c0), (shl y, c1)) ->
// (SLLI (SH*ADD x, y), c0), if c1-c0 equals to [1|2|3].
+// or
+// (SLLI (QC.SHLADD x, y, c1 - c0), c0), if 4 <= (c1-c0) <= 31.
static SDValue transformAddShlImm(SDNode *N, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
- // Perform this optimization only in the zba/xandesperf extension.
- if (!Subtarget.hasStdExtZba() && !Subtarget.hasVendorXAndesPerf())
+ const bool HasStdExtZba = Subtarget.hasStdExtZba();
+ const bool HasVendorXAndesPerf = Subtarget.hasVendorXAndesPerf();
+ const bool HasVendorXqciac = Subtarget.hasVendorXqciac();
+ // Perform this optimization only when the zba, xandesperf, or xqciac
+ // extension is available.
+ if (!HasStdExtZba && !HasVendorXAndesPerf && !HasVendorXqciac)
return SDValue();
// Skip for vector types and larger types.
@@ -15060,14 +15124,22 @@ static SDValue transformAddShlImm(SDNode *N, SelectionDAG &DAG,
if (C0 <= 0 || C1 <= 0)
return SDValue();
- // Skip if SH1ADD/SH2ADD/SH3ADD are not applicable.
- int64_t Bits = std::min(C0, C1);
int64_t Diff = std::abs(C0 - C1);
- if (Diff != 1 && Diff != 2 && Diff != 3)
+ bool IsShXaddDiff = Diff == 1 || Diff == 2 || Diff == 3;
+ bool HasShXadd = HasStdExtZba || HasVendorXAndesPerf;
+
+ // Skip if SH1ADD/SH2ADD/SH3ADD are not applicable.
+ if ((!IsShXaddDiff && HasShXadd && !HasVendorXqciac) ||
+ (IsShXaddDiff && !HasShXadd && HasVendorXqciac))
+ return SDValue();
+
+ // Skip if QC_SHLADD is not applicable.
+ if (Diff == 0 || Diff > 31)
return SDValue();
// Build nodes.
SDLoc DL(N);
+ int64_t Bits = std::min(C0, C1);
SDValue NS = (C0 < C1) ? N0->getOperand(0) : N1->getOperand(0);
SDValue NL = (C0 > C1) ? N0->getOperand(0) : N1->getOperand(0);
SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, NL,
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 00e969056df7..e0a8c07b4206 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -429,7 +429,7 @@ public:
bool fallBackToDAGISel(const Instruction &Inst) const override;
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
@@ -437,14 +437,12 @@ public:
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(
- LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const override;
+ bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
+ IntrinsicInst *DI) const override;
bool lowerInterleaveIntrinsicToStore(
- StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override;
-
- bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
- ArrayRef<Value *> DeinterleaveRes) const override;
+ Instruction *Store, Value *Mask,
+ ArrayRef<Value *> InterleaveValues) const override;
bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
ArrayRef<Value *> InterleaveOps) const override;
@@ -562,6 +560,9 @@ private:
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerRESET_FPENV(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerGET_FPMODE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSET_FPMODE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerRESET_FPMODE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
index b6b64b57b1b3..e23001a3a0bf 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
@@ -193,7 +193,9 @@ class RVInstCommon<dag outs, dag ins, string opcodestr, string argstr,
let AsmString = opcodestr # !if(!empty(argstr), "", "\t" # argstr);
let Pattern = pattern;
- let TSFlags{4-0} = format.Value;
+ InstFormat Format = format;
+
+ let TSFlags{4-0} = Format.Value;
// Defaults
RISCVVConstraint RVVConstraint = NoConstraint;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 2723229859a5..64f9e3eb8d86 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2806,7 +2806,7 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI,
CASE_OPERAND_UIMM(7)
CASE_OPERAND_UIMM(8)
CASE_OPERAND_UIMM(9)
- CASE_OPERAND_UIMM(10)
+ CASE_OPERAND_UIMM(10)
CASE_OPERAND_UIMM(12)
CASE_OPERAND_UIMM(16)
CASE_OPERAND_UIMM(20)
@@ -2823,6 +2823,9 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI,
case RISCVOp::OPERAND_UIMM5_NONZERO:
Ok = isUInt<5>(Imm) && (Imm != 0);
break;
+ case RISCVOp::OPERAND_UIMM5_GT3:
+ Ok = isUInt<5>(Imm) && (Imm > 3);
+ break;
case RISCVOp::OPERAND_UIMM5_PLUS1:
Ok = (isUInt<5>(Imm) && (Imm != 0)) || (Imm == 32);
break;
@@ -4809,6 +4812,8 @@ bool RISCV::isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS) {
return true;
if (RHS.isImm() && RHS.getImm() == RISCV::VLMaxSentinel)
return true;
+ if (LHS.isImm() && LHS.getImm() == 0)
+ return true;
if (LHS.isImm() && LHS.getImm() == RISCV::VLMaxSentinel)
return false;
if (!LHS.isImm() || !RHS.isImm())
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index f63531a0109b..653607827282 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -120,6 +120,20 @@ def riscv_swap_csr : RVSDNode<"SWAP_CSR",
SDTCisInt<2>]>,
[SDNPHasChain]>;
+// Clear bits of a CSR. The first operand is the address of the required CSR,
+// the second is the bitmask of bits to clear. Both operands are integers; the
+// node produces no result value and carries a chain.
+def riscv_clear_csr : RVSDNode<"CLEAR_CSR",
+ SDTypeProfile<0, 2, [SDTCisInt<0>,
+ SDTCisInt<1>]>,
+ [SDNPHasChain]>;
+
+// Set bits of a CSR. The first operand is the address of the required CSR,
+// the second is the bitmask of bits to set. Both operands are integers; the
+// node produces no result value and carries a chain.
+def riscv_set_csr : RVSDNode<"SET_CSR",
+ SDTypeProfile<0, 2, [SDTCisInt<0>,
+ SDTCisInt<1>]>,
+ [SDNPHasChain]>;
+
// A read of the 64-bit counter CSR on a 32-bit target (returns (Lo, Hi)).
// It takes a chain operand and another two target constant operands (the
// CSR numbers of the low and high parts of the counter).
@@ -2038,6 +2052,42 @@ class SwapSysRegImm<SysReg SR, list<Register> Regs>
let Defs = Regs;
}
+// Pseudo selected for riscv_clear_csr on system register SR with the mask in
+// a GPR. It expands to `CSRRC X0, SR.Encoding, $val`. Regs is listed as both
+// Uses and Defs of the pseudo.
+class ClearSysReg<SysReg SR, list<Register> Regs>
+ : Pseudo<(outs), (ins GPR:$val),
+ [(riscv_clear_csr (XLenVT SR.Encoding), (XLenVT GPR:$val))]>,
+ PseudoInstExpansion<(CSRRC X0, SR.Encoding, GPR:$val)> {
+ let hasSideEffects = 0;
+ let Uses = Regs;
+ let Defs = Regs;
+}
+
+// Immediate form of the clear pseudo: selected for riscv_clear_csr on system
+// register SR with a uimm5 mask. It expands to
+// `CSRRCI X0, SR.Encoding, $val`. Regs is listed as both Uses and Defs.
+class ClearSysRegImm<SysReg SR, list<Register> Regs>
+ : Pseudo<(outs), (ins uimm5:$val),
+ [(riscv_clear_csr (XLenVT SR.Encoding), uimm5:$val)]>,
+ PseudoInstExpansion<(CSRRCI X0, SR.Encoding, uimm5:$val)> {
+ let hasSideEffects = 0;
+ let Uses = Regs;
+ let Defs = Regs;
+}
+
+// Pseudo selected for riscv_set_csr on system register SR with the mask in a
+// GPR. It expands to `CSRRS X0, SR.Encoding, $val`. Regs is listed as both
+// Uses and Defs of the pseudo.
+class SetSysReg<SysReg SR, list<Register> Regs>
+ : Pseudo<(outs), (ins GPR:$val),
+ [(riscv_set_csr (XLenVT SR.Encoding), (XLenVT GPR:$val))]>,
+ PseudoInstExpansion<(CSRRS X0, SR.Encoding, GPR:$val)> {
+ let hasSideEffects = 0;
+ let Uses = Regs;
+ let Defs = Regs;
+}
+
+// Immediate form of the set pseudo: selected for riscv_set_csr on system
+// register SR with a uimm5 mask. It expands to
+// `CSRRSI X0, SR.Encoding, $val`. Regs is listed as both Uses and Defs.
+class SetSysRegImm<SysReg SR, list<Register> Regs>
+ : Pseudo<(outs), (ins uimm5:$val),
+ [(riscv_set_csr (XLenVT SR.Encoding), uimm5:$val)]>,
+ PseudoInstExpansion<(CSRRSI X0, SR.Encoding, uimm5:$val)> {
+ let hasSideEffects = 0;
+ let Uses = Regs;
+ let Defs = Regs;
+}
+
def ReadFRM : ReadSysReg<SysRegFRM, [FRM]>;
let hasPostISelHook = 1 in {
def WriteFRM : WriteSysReg<SysRegFRM, [FRM]>;
@@ -2056,6 +2106,10 @@ let hasPostISelHook = 1 in {
def ReadFCSR : ReadSysReg<SysRegFCSR, [FRM, FFLAGS]>;
def WriteFCSR : WriteSysReg<SysRegFCSR, [FRM, FFLAGS]>;
def WriteFCSRImm : WriteSysRegImm<SysRegFCSR, [FRM, FFLAGS]>;
+def ClearFCSR : ClearSysReg<SysRegFCSR, [FRM, FFLAGS]>;
+def ClearFCSRImm : ClearSysRegImm<SysRegFCSR, [FRM, FFLAGS]>;
+def SetFCSR : SetSysReg<SysRegFCSR, [FRM, FFLAGS]>;
+def SetFCSRImm : SetSysRegImm<SysRegFCSR, [FRM, FFLAGS]>;
}
/// Other pseudo-instructions
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index aa9e7b5635de..aef410fb4cc6 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -20,18 +20,22 @@
def simm10 : RISCVSImmLeafOp<10>;
+// Asm operand class built from SImmAsmOperand<10, "Unsigned"> that overrides
+// the render method with addSImm10UnsignedOperands.
+def SImm10UnsignedAsmOperand : SImmAsmOperand<10, "Unsigned"> {
+ let RenderMethod = "addSImm10UnsignedOperands";
+}
+
// A 10-bit signed immediate allowing range [-512, 1023]
-// but will decode to [-512, 511].
+// but represented as [-512, 511].
def simm10_unsigned : RISCVOp {
- let ParserMatchClass = SImmAsmOperand<10, "Unsigned">;
+ let ParserMatchClass = SImm10UnsignedAsmOperand;
let EncoderMethod = "getImmOpValue";
let DecoderMethod = "decodeSImmOperand<10>";
- let OperandType = "OPERAND_SIMM10_UNSIGNED";
+ let OperandType = "OPERAND_SIMM10";
let MCOperandPredicate = [{
int64_t Imm;
if (!MCOp.evaluateAsConstantImm(Imm))
return false;
- return isInt<10>(Imm) || isUInt<10>(Imm);
+ return isInt<10>(Imm);
}];
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
index ec38201cd28c..522081533644 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
@@ -348,6 +348,17 @@ class NDSRVInstSDGP<bits<3> funct3, string opcodestr>
let mayStore = 1;
}
+class NDSRVInstBFHCvt<bits<7> funct7, bits<5> rs1val, DAGOperand rdty,
+ DAGOperand rs2ty, string opcodestr>
+ : RVInstR<funct7, 0b100, OPC_CUSTOM_2, (outs rdty:$rd),
+ (ins rs2ty:$rs2), opcodestr, "$rd, $rs2"> {
+ let rs1 = rs1val;
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let mayRaiseFPException = 1;
+}
+
class NDSRVInstVFPMAD<bits<6> funct6, string opcodestr>
: RVInst<(outs VR:$vd), (ins VR:$vs2, FPR32:$rs1, VMaskOp:$vm),
opcodestr # "." # "vf", "$vd, $rs1, $vs2$vm", [], InstFormatR>,
@@ -631,6 +642,19 @@ def NDS_SDGP : NDSRVInstSDGP<0b111, "nds.sdgp">;
} // Predicates = [HasVendorXAndesPerf, IsRV64]
//===----------------------------------------------------------------------===//
+// XAndesBFHCvt
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasVendorXAndesBFHCvt] in {
+def NDS_FCVT_S_BF16 : NDSRVInstBFHCvt<0b0000000, 0b00010,
+ FPR32, FPR16, "nds.fcvt.s.bf16">,
+ Sched<[WriteFCvtF16ToF32, ReadFCvtF16ToF32]>;
+def NDS_FCVT_BF16_S : NDSRVInstBFHCvt<0b0000000, 0b00011,
+ FPR16, FPR32, "nds.fcvt.bf16.s">,
+ Sched<[WriteFCvtF32ToF16, ReadFCvtF32ToF16]>;
+}
+
+//===----------------------------------------------------------------------===//
// XAndesVBFHCvt
//===----------------------------------------------------------------------===//
@@ -743,6 +767,13 @@ def : Sh2AddPat<NDS_LEA_W_ZE>;
def : Sh3AddPat<NDS_LEA_D_ZE>;
} // Predicates = [HasVendorXAndesPerf, IsRV64]
+let Predicates = [HasVendorXAndesBFHCvt] in {
+def : Pat<(fpextend (bf16 FPR16:$rs)),
+ (NDS_FCVT_S_BF16 (bf16 FPR16:$rs))>;
+def : Pat<(bf16 (fpround FPR32:$rs)),
+ (NDS_FCVT_BF16_S FPR32:$rs)>;
+} // Predicates = [HasVendorXAndesBFHCvt]
+
let Predicates = [HasVendorXAndesVBFHCvt] in {
defm PseudoNDS_VFWCVT_S_BF16 : VPseudoVWCVT_S_BF16;
defm PseudoNDS_VFNCVT_BF16_S : VPseudoVNCVT_BF16_S;
@@ -801,13 +832,13 @@ defm : VPatTernaryVD4DOT_VV<"int_riscv_nds_vd4dotsu", "PseudoNDS_VD4DOTSU",
let Predicates = [HasShortForwardBranchOpt], hasSideEffects = 0,
mayLoad = 0, mayStore = 0, Size = 8, Constraints = "$dst = $falsev" in {
def PseudoCCNDS_BFOS : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, ixlenimm:$cc,
+ (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
GPR:$falsev, GPR:$rs1,
uimmlog2xlen:$msb, uimmlog2xlen:$lsb), []>,
Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
ReadSFBALU]>;
def PseudoCCNDS_BFOZ : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, ixlenimm:$cc,
+ (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
GPR:$falsev, GPR:$rs1,
uimmlog2xlen:$msb, uimmlog2xlen:$lsb), []>,
Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 7cc7f380c3f6..c7cb6e237aea 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -845,10 +845,11 @@ let Predicates = [HasVendorXqcibi, IsRV32] in {
let Predicates = [HasVendorXqcibm, IsRV32] in {
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
def QC_INSBRI : QCIRVInstRI<0b1, simm11, "qc.insbri">;
- def QC_INSBI : RVInstIBase<0b001, OPC_CUSTOM_0, (outs GPRNoX0:$rd),
- (ins simm5:$imm5, uimm5_plus1:$width,
+ def QC_INSBI : RVInstIBase<0b001, OPC_CUSTOM_0, (outs GPRNoX0:$rd_wb),
+ (ins GPRNoX0:$rd, simm5:$imm5, uimm5_plus1:$width,
uimm5:$shamt), "qc.insbi",
"$rd, $imm5, $width, $shamt"> {
+ let Constraints = "$rd = $rd_wb";
bits<5> imm5;
bits<5> shamt;
bits<5> width;
@@ -1336,6 +1337,22 @@ class QCISELECTIICCPat<CondCode Cond, QCISELECTIICC Inst>
: Pat<(select (i32 (setcc (i32 GPRNoX0:$rd), (i32 GPRNoX0:$rs1), Cond)), simm5:$simm1, simm5:$simm2),
(Inst GPRNoX0:$rd, GPRNoX0:$rs1, simm5:$simm1, simm5:$simm2)>;
+class QCILICCPat<CondCode Cond, QCILICC Inst>
+ : Pat<(select (XLenVT (setcc (XLenVT GPRNoX0:$rs1), (XLenVT GPRNoX0:$rs2), Cond)), simm5:$simm, (XLenVT GPRNoX0:$rd)),
+ (Inst GPRNoX0:$rd, GPRNoX0:$rs1, GPRNoX0:$rs2, simm5:$simm)>;
+
+class QCILICCPatInv<CondCode Cond, QCILICC Inst>
+ : Pat<(select (XLenVT (setcc (XLenVT GPRNoX0:$rs1), (XLenVT GPRNoX0:$rs2), Cond)), (XLenVT GPRNoX0:$rd), simm5:$simm),
+ (Inst GPRNoX0:$rd, GPRNoX0:$rs1, GPRNoX0:$rs2, simm5:$simm)>;
+
+class QCILICCIPat<CondCode Cond, QCILICC Inst, DAGOperand InTyImm>
+ : Pat<(select (XLenVT (setcc (XLenVT GPRNoX0:$rs1), InTyImm:$imm, Cond)), simm5:$simm, (XLenVT GPRNoX0:$rd)),
+ (Inst GPRNoX0:$rd, GPRNoX0:$rs1, InTyImm:$imm, simm5:$simm)>;
+
+class QCILICCIPatInv<CondCode Cond, QCILICC Inst, DAGOperand InTyImm>
+ : Pat<(select (XLenVT (setcc (XLenVT GPRNoX0:$rs1), InTyImm:$imm, Cond)), (XLenVT GPRNoX0:$rd), simm5:$simm),
+ (Inst GPRNoX0:$rd, GPRNoX0:$rs1, InTyImm:$imm, simm5:$simm)>;
+
// Match `riscv_brcc` and lower to the appropriate XQCIBI branch instruction.
class BcciPat<CondCode Cond, QCIBranchInst_rii Inst, DAGOperand InTyImm>
: Pat<(riscv_brcc (i32 GPRNoX0:$rs1), InTyImm:$rs2, Cond, bb:$imm12),
@@ -1359,6 +1376,10 @@ class SelectQCbi<CondCode Cond, DAGOperand InTyImm, Pseudo OpNode >
let Predicates = [HasVendorXqciac, IsRV32] in {
def : Pat<(i32 (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12:$imm12))),
(QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12:$imm12)>;
+def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, uimm5gt3:$imm), GPRNoX0:$rs2)),
+ (QC_SHLADD GPRNoX0:$rs2, GPRNoX0:$rs1, uimm5gt3:$imm)>;
+def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, uimm5gt3:$imm, GPRNoX0:$rs2)),
+ (QC_SHLADD GPRNoX0:$rs2, GPRNoX0:$rs1, uimm5gt3:$imm)>;
} // Predicates = [HasVendorXqciac, IsRV32]
/// Simple arithmetic operations
@@ -1417,7 +1438,7 @@ def : PatGprNoX0GprNoX0<sshlsat, QC_SHLSAT>;
/// Branches
-let Predicates = [HasVendorXqcibi, IsRV32], AddedComplexity = 2 in {
+let Predicates = [HasVendorXqcibi, IsRV32] in {
def : BcciPat<SETEQ, QC_BEQI, simm5nonzero>;
def : BcciPat<SETNE, QC_BNEI, simm5nonzero>;
def : BcciPat<SETLT, QC_BLTI, simm5nonzero>;
@@ -1445,7 +1466,7 @@ def : SelectQCbi<SETLT, simm16nonzero, Select_GPRNoX0_Using_CC_SImm16NonZero_QC>
def : SelectQCbi<SETGE, simm16nonzero, Select_GPRNoX0_Using_CC_SImm16NonZero_QC>;
def : SelectQCbi<SETULT, uimm16nonzero, Select_GPRNoX0_Using_CC_UImm16NonZero_QC>;
def : SelectQCbi<SETUGE, uimm16nonzero, Select_GPRNoX0_Using_CC_UImm16NonZero_QC>;
-} // let Predicates = [HasVendorXqcibi, IsRV32], AddedComplexity = 2
+} // let Predicates = [HasVendorXqcibi, IsRV32]
let Predicates = [HasVendorXqcibm, IsRV32] in {
def : Pat<(sext_inreg (i32 GPR:$rs1), i1), (QC_EXT GPR:$rs1, 1, 0)>;
@@ -1484,12 +1505,46 @@ def : QCIMVCCPat <SETNE, QC_MVNE>;
def : QCIMVCCPat <SETLT, QC_MVLT>;
def : QCIMVCCPat <SETULT, QC_MVLTU>;
-def : QCIMVCCIPat <SETEQ, QC_MVEQI, simm5>;
-def : QCIMVCCIPat <SETNE, QC_MVNEI, simm5>;
def : QCIMVCCIPat <SETLT, QC_MVLTI, simm5>;
def : QCIMVCCIPat <SETULT, QC_MVLTUI, uimm5>;
}
+// Prioritize Xqcics over these patterns.
+let Predicates = [HasVendorXqcicm, NoVendorXqcics, IsRV32] in {
+def : QCIMVCCIPat <SETEQ, QC_MVEQI, simm5>;
+def : QCIMVCCIPat <SETNE, QC_MVNEI, simm5>;
+}
+
+let Predicates = [HasVendorXqcicli, HasVendorXqcicsOrXqcicm, IsRV32] in {
+def : QCILICCPat <SETEQ, QC_LIEQ>;
+def : QCILICCPat <SETNE, QC_LINE>;
+def : QCILICCPat <SETLT, QC_LILT>;
+def : QCILICCPat <SETGE, QC_LIGE>;
+def : QCILICCPat <SETULT, QC_LILTU>;
+def : QCILICCPat <SETUGE, QC_LIGEU>;
+
+def : QCILICCIPat <SETEQ, QC_LIEQI, simm5>;
+def : QCILICCIPat <SETNE, QC_LINEI, simm5>;
+def : QCILICCIPat <SETLT, QC_LILTI, simm5>;
+def : QCILICCIPat <SETGE, QC_LIGEI, simm5>;
+def : QCILICCIPat <SETULT, QC_LILTUI, uimm5>;
+def : QCILICCIPat <SETUGE, QC_LIGEUI, uimm5>;
+
+def : QCILICCPatInv <SETNE, QC_LIEQ>;
+def : QCILICCPatInv <SETEQ, QC_LINE>;
+def : QCILICCPatInv <SETGE, QC_LILT>;
+def : QCILICCPatInv <SETLT, QC_LIGE>;
+def : QCILICCPatInv <SETUGE, QC_LILTU>;
+def : QCILICCPatInv <SETULT, QC_LIGEU>;
+
+def : QCILICCIPatInv <SETNE, QC_LIEQI, simm5>;
+def : QCILICCIPatInv <SETEQ, QC_LINEI, simm5>;
+def : QCILICCIPatInv <SETGE, QC_LILTI, simm5>;
+def : QCILICCIPatInv <SETLT, QC_LIGEI, simm5>;
+def : QCILICCIPatInv <SETUGE, QC_LILTUI, uimm5>;
+def : QCILICCIPatInv <SETULT, QC_LIGEUI, uimm5>;
+}
+
let Predicates = [HasVendorXqcics, IsRV32] in {
def : Pat<(select (i32 GPRNoX0:$rd), (i32 GPRNoX0:$rs2),(i32 GPRNoX0:$rs3)),
(QC_SELECTNEI GPRNoX0:$rd, (i32 0), GPRNoX0:$rs2, GPRNoX0:$rs3)>;
@@ -1498,12 +1553,8 @@ def : Pat<(select (i32 GPRNoX0:$rd), (i32 GPRNoX0:$rs2), simm5:$simm2),
def : Pat<(select (i32 GPRNoX0:$rd), simm5:$simm2,(i32 GPRNoX0:$rs2)),
(QC_SELECTIEQI GPRNoX0:$rd, (i32 0), GPRNoX0:$rs2, simm5:$simm2)>;
-// Below AddedComplexity is added to prefer these conditional select instructions over
-// conditional move instructions
-let AddedComplexity = 1 in {
def : QCISELECTCCIPat <SETEQ, QC_SELECTEQI>;
def : QCISELECTCCIPat <SETNE, QC_SELECTNEI>;
-}
def : QCISELECTICCIPat <SETEQ, QC_SELECTIEQI>;
def : QCISELECTICCIPat <SETNE, QC_SELECTINEI>;
@@ -1634,6 +1685,24 @@ def : CompressPat<(QC_E_ADDAI X2, simm10_lsb0000nonzero:$imm),
(C_ADDI16SP X2, simm10_lsb0000nonzero:$imm)>;
def : CompressPat<(QC_E_ADDI X2, X2, simm10_lsb0000nonzero:$imm),
(C_ADDI16SP X2, simm10_lsb0000nonzero:$imm)>;
+
+def : CompressPat<(QC_E_ADDI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm),
+ (ADDI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm)>;
+def : CompressPat<(QC_E_ANDI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm),
+ (ANDI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm)>;
+def : CompressPat<(QC_E_ORI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm),
+ (ORI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm)>;
+def : CompressPat<(QC_E_XORI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm),
+ (XORI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm)>;
+
+def : CompressPat<(QC_E_ADDAI GPRNoX0:$rd, simm12:$imm),
+ (ADDI GPRNoX0:$rd, GPRNoX0:$rd, simm12:$imm)>;
+def : CompressPat<(QC_E_ANDAI GPRNoX0:$rd, simm12:$imm),
+ (ANDI GPRNoX0:$rd, GPRNoX0:$rd, simm12:$imm)>;
+def : CompressPat<(QC_E_ORAI GPRNoX0:$rd, simm12:$imm),
+ (ORI GPRNoX0:$rd, GPRNoX0:$rd, simm12:$imm)>;
+def : CompressPat<(QC_E_XORAI GPRNoX0:$rd, simm12:$imm),
+ (XORI GPRNoX0:$rd, GPRNoX0:$rd, simm12:$imm)>;
} // let isCompressOnly = true, Predicates = [HasVendorXqcilia, IsRV32]
let Predicates = [HasVendorXqciac, IsRV32] in {
@@ -1655,3 +1724,82 @@ def : CompressPat<(QC_E_BGEUI GPRNoX0:$rs1, uimm5nonzero:$imm5, bare_simm13_lsb0
def : CompressPat<(QC_E_BLTUI GPRNoX0:$rs1, uimm5nonzero:$imm5, bare_simm13_lsb0:$imm12),
(QC_BLTUI GPRNoX0:$rs1, uimm5nonzero:$imm5, bare_simm13_lsb0:$imm12)>;
} // let isCompressOnly = true, Predicates = [HasVendorXqcibi, IsRV32]
+
+// HACKS
+// -----
+// The reasons for needing the definitions below are long and quite annoying. I'm writing
+// this so they are explained in-line, rather than anywhere else.
+//
+// Emitting an instruction to an object proceeds as:
+// - Compression (in emitInstruction)
+// - Emit to Binary Code + Fixups
+// - Assembler Relaxation
+// - Fixup evaluation/application
+// - If relaxed, re-emitted to Binary + Fixups
+// - Relocation generation from Fixups
+//
+// Unfortunately, the `QC.E.LI` -> `C.LI` compression pattern has an edge case that has
+// caused crashes in the past.
+//
+// How the bug happens is:
+// - QC.E.LI is parsed with a bare symbol, which is valid + expected, and can
+// be handled by fixups/relocations.
+// - Compression turns this into a `C.LI` because the `simm6`
+// MCOperandPredicate accepts bare symbols.
+// - Binary Code emission didn't know how to create a fixup for a CI-type
+// instruction containing a bare symbol.
+//
+// The solution to the last bullet is that we added the `fixup_riscv_rvc_imm`,
+// so that we could proceed past the last error, and then use Assembler Relaxation
+// to turn the `C.LI` with a bare symbol back into a `QC.E.LI`.
+//
+// This is good enough for emitting objects, but doesn't work for emitting
+// assembly. Emitting assembly is why we need the following Hacks.
+//
+// Emitting an instruction to assembly proceeds as:
+// - Compression (in emitInstruction)
+// - Decompression (in RISCVInstPrinter::printInst)
+// - InstAliases are applied
+//
+// So in the case of `QC.E.LI` with a bare symbol, first it is compressed to
+// `C.LI` with a bare symbol, and then it is decompressed to `ADDI` with a bare
+// symbol for printing, which is printed via an alias as `li <reg>, <symbol>`.
+// Both the decompression and the alias use the MCOperandPredicate from
+// `simm12`, which accepts bare symbols.
+//
+// The problem here is that `li <reg>, <symbol>` fails to parse, because the
+// parsers do not accept bare symbols; they only accept symbols with specifiers
+// or immediates.
+//
+// Our solution is to add another alias, which will be prioritised above the
+// `li` alias, but only when `qc.e.li` is available. We originally intended to
+// use the `bare_symbol` Operand type, but this had no MCOperandPredicate, and
+// adding one changed the error messages when parsing `qc.e.li` with a
+// too-large constant. So instead, we add a new `AsmOperand` and `Operand` type,
+// just for the alias, which parse just like a BareSymbol, but they
+// have both an MCOperandPredicate, and the error message that corresponds to
+// the existing one on `qc.e.li` for too-large immediates (which fail to parse
+// as both an immediate, and a bare symbol).
+//
+// This is fairly unpleasant, but it's the least disruptive thing we can do
+// and keeps all the hacks confined to the RISC-V backend code.
+
+def BareSymbolQC_E_LI : AsmOperandClass {
+ let Name = "BareSymbolQC_E_LI";
+ let PredicateMethod = "isBareSymbol";
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = "InvalidBareSymbolQC_E_LI";
+ let ParserMethod = "parseBareSymbol";
+}
+
+def hack_bare_symbol_qc_e_li : Operand<XLenVT> {
+ let ParserMatchClass = BareSymbolQC_E_LI;
+ let MCOperandPredicate = [{
+ return MCOp.isExpr() && MCOp.isBareSymbolRef();
+ }];
+}
+
+let Predicates = [HasVendorXqcili, IsRV32] in {
+def : InstAlias<"qc.e.li $rd, $sym", (ADDI GPR:$rd, X0, hack_bare_symbol_qc_e_li:$sym), 3>;
+} // Predicates = [HasVendorXqcili, IsRV32]
+// END HACKS
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td
index 878b85b14157..0723b2f568a7 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td
@@ -41,6 +41,7 @@ class Prefetch_ri<bits<5> optype, string opcodestr>
opcodestr, "${imm12}(${rs1})"> {
let Inst{11-7} = 0b00000;
let rs2 = optype;
+ let Format = InstFormatOther; // this does not follow the normal S format.
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index a6ff22c4b391..dd68a5556cdb 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -14,6 +14,7 @@
#include "RISCVISelLowering.h"
#include "RISCVSubtarget.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
@@ -68,6 +69,89 @@ static const Intrinsic::ID ScalableVlsegIntrIds[] = {
Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask,
Intrinsic::riscv_vlseg8_mask};
+static const Intrinsic::ID FixedVssegIntrIds[] = {
+ Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask,
+ Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask,
+ Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask,
+ Intrinsic::riscv_seg8_store_mask};
+
+static const Intrinsic::ID ScalableVssegIntrIds[] = {
+ Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
+ Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
+ Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask,
+ Intrinsic::riscv_vsseg8_mask};
+
+static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
+ assert(N);
+ if (N == 1)
+ return true;
+
+ using namespace PatternMatch;
+ // Right now we're only recognizing the simplest pattern.
+ uint64_t C;
+ if (match(V, m_CombineOr(m_ConstantInt(C),
+ m_NUWMul(m_Value(), m_ConstantInt(C)))) &&
+ C && C % N == 0)
+ return true;
+
+ if (isPowerOf2_32(N)) {
+ KnownBits KB = llvm::computeKnownBits(V, DL);
+ return KB.countMinTrailingZeros() >= Log2_32(N);
+ }
+
+ return false;
+}
+
+/// Do the common operand retrieval and validation required by the
+/// routines below.
+static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy,
+ Instruction *I, Value *&Ptr, Value *&Mask,
+ Value *&VL, Align &Alignment) {
+
+ IRBuilder<> Builder(I);
+ const DataLayout &DL = I->getDataLayout();
+ ElementCount EC = VTy->getElementCount();
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ assert(LI->isSimple());
+ Ptr = LI->getPointerOperand();
+ Alignment = LI->getAlign();
+ assert(!Mask && "Unexpected mask on a load");
+ Mask = Builder.getAllOnesMask(EC);
+ VL = isa<FixedVectorType>(VTy) ? Builder.CreateElementCount(XLenTy, EC)
+ : Constant::getAllOnesValue(XLenTy);
+ return true;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(I)) {
+ assert(SI->isSimple());
+ Ptr = SI->getPointerOperand();
+ Alignment = SI->getAlign();
+ assert(!Mask && "Unexpected mask on a store");
+ Mask = Builder.getAllOnesMask(EC);
+ VL = isa<FixedVectorType>(VTy) ? Builder.CreateElementCount(XLenTy, EC)
+ : Constant::getAllOnesValue(XLenTy);
+ return true;
+ }
+ auto *VPLdSt = cast<VPIntrinsic>(I);
+ assert((VPLdSt->getIntrinsicID() == Intrinsic::vp_load ||
+ VPLdSt->getIntrinsicID() == Intrinsic::vp_store) &&
+ "Unexpected intrinsic");
+ Ptr = VPLdSt->getMemoryPointerParam();
+ Alignment = VPLdSt->getPointerAlignment().value_or(
+ DL.getABITypeAlign(VTy->getElementType()));
+
+ assert(Mask && "vp.load and vp.store needs a mask!");
+
+ Value *WideEVL = VPLdSt->getVectorLengthParam();
+ // Conservatively check if EVL is a multiple of factor, otherwise some
+ // (trailing) elements might be lost after the transformation.
+ if (!isMultipleOfN(WideEVL, I->getDataLayout(), Factor))
+ return false;
+
+ auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
+ VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
+ return true;
+}
+
/// Lower an interleaved load into a vlsegN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
@@ -81,21 +165,25 @@ static const Intrinsic::ID ScalableVlsegIntrIds[] = {
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool RISCVTargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Indices.size() == Shuffles.size());
- IRBuilder<> Builder(LI);
-
- const DataLayout &DL = LI->getDataLayout();
+ IRBuilder<> Builder(Load);
+ const DataLayout &DL = Load->getDataLayout();
auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType());
- if (!isLegalInterleavedAccessType(VTy, Factor, LI->getAlign(),
- LI->getPointerAddressSpace(), DL))
+ auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
+
+ Value *Ptr, *VL;
+ Align Alignment;
+ if (!getMemOperands(Factor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment))
return false;
- auto *PtrTy = LI->getPointerOperandType();
- auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
+ Type *PtrTy = Ptr->getType();
+ unsigned AS = PtrTy->getPointerAddressSpace();
+ if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
+ return false;
// If the segment load is going to be performed segment at a time anyways
// and there's only one element used, use a strided load instead. This
@@ -104,25 +192,23 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
- Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset);
- Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
- Value *VL = Builder.getInt32(VTy->getNumElements());
-
+ Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
+ // Note: Same VL as above, but i32 not xlen due to signature of
+ // vp.strided.load
+ VL = Builder.CreateElementCount(Builder.getInt32Ty(),
+ VTy->getElementCount());
CallInst *CI =
Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
{VTy, BasePtr->getType(), Stride->getType()},
{BasePtr, Stride, Mask, VL});
- CI->addParamAttr(
- 0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign()));
+ CI->addParamAttr(0,
+ Attribute::getWithAlignment(CI->getContext(), Alignment));
Shuffles[0]->replaceAllUsesWith(CI);
return true;
};
- Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements());
- Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
CallInst *VlsegN = Builder.CreateIntrinsic(
- FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy},
- {LI->getPointerOperand(), Mask, VL});
+ FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
for (unsigned i = 0; i < Shuffles.size(); i++) {
Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]);
@@ -132,18 +218,6 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
return true;
}
-static const Intrinsic::ID FixedVssegIntrIds[] = {
- Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask,
- Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask,
- Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask,
- Intrinsic::riscv_seg8_store_mask};
-
-static const Intrinsic::ID ScalableVssegIntrIds[] = {
- Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
- Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
- Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask,
- Intrinsic::riscv_vsseg8_mask};
-
/// Lower an interleaved store into a vssegN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
@@ -191,7 +265,8 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes);
Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset);
Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount());
- Value *VL = Builder.getInt32(VTy->getNumElements());
+ Value *VL = Builder.CreateElementCount(Builder.getInt32Ty(),
+ VTy->getElementCount());
CallInst *CI = Builder.CreateIntrinsic(
Intrinsic::experimental_vp_strided_store,
@@ -223,7 +298,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
// This VL should be OK (should be executable in one vsseg instruction,
// potentially under larger LMULs) because we checked that the fixed vector
// type fits in isLegalInterleavedAccessType
- Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements());
+ Value *VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount());
Value *StoreMask = Builder.getAllOnesMask(VTy->getElementCount());
Ops.append({SI->getPointerOperand(), StoreMask, VL});
@@ -233,58 +308,57 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
- LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const {
- const unsigned Factor = DeinterleaveValues.size();
+ Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
+ const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
if (Factor > 8)
return false;
- assert(LI->isSimple());
- IRBuilder<> Builder(LI);
+ IRBuilder<> Builder(Load);
- Value *FirstActive =
- *llvm::find_if(DeinterleaveValues, [](Value *V) { return V != nullptr; });
- VectorType *ResVTy = cast<VectorType>(FirstActive->getType());
+ VectorType *ResVTy = getDeinterleavedVectorType(DI);
- const DataLayout &DL = LI->getDataLayout();
+ const DataLayout &DL = Load->getDataLayout();
+ auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
- if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(),
- LI->getPointerAddressSpace(), DL))
+ Value *Ptr, *VL;
+ Align Alignment;
+ if (!getMemOperands(Factor, ResVTy, XLenTy, Load, Ptr, Mask, VL, Alignment))
return false;
- Value *Return;
- Type *PtrTy = LI->getPointerOperandType();
- Type *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
+ Type *PtrTy = Ptr->getType();
+ unsigned AS = PtrTy->getPointerAddressSpace();
+ if (!isLegalInterleavedAccessType(ResVTy, Factor, Alignment, AS, DL))
+ return false;
- if (auto *FVTy = dyn_cast<FixedVectorType>(ResVTy)) {
- Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements());
- Value *Mask = Builder.getAllOnesMask(FVTy->getElementCount());
+ Value *Return;
+ if (isa<FixedVectorType>(ResVTy)) {
Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
- {ResVTy, PtrTy, XLenTy},
- {LI->getPointerOperand(), Mask, VL});
+ {ResVTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
} else {
- static const Intrinsic::ID IntrIds[] = {
- Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3,
- Intrinsic::riscv_vlseg4, Intrinsic::riscv_vlseg5,
- Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7,
- Intrinsic::riscv_vlseg8};
-
unsigned SEW = DL.getTypeSizeInBits(ResVTy->getElementType());
unsigned NumElts = ResVTy->getElementCount().getKnownMinValue();
Type *VecTupTy = TargetExtType::get(
- LI->getContext(), "riscv.vector.tuple",
- ScalableVectorType::get(Type::getInt8Ty(LI->getContext()),
+ Load->getContext(), "riscv.vector.tuple",
+ ScalableVectorType::get(Type::getInt8Ty(Load->getContext()),
NumElts * SEW / 8),
Factor);
+ Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration(
+ Load->getModule(), ScalableVlsegIntrIds[Factor - 2],
+ {VecTupTy, PtrTy, Mask->getType(), VL->getType()});
- Value *VL = Constant::getAllOnesValue(XLenTy);
+ Value *Operands[] = {
+ PoisonValue::get(VecTupTy),
+ Ptr,
+ Mask,
+ VL,
+ ConstantInt::get(XLenTy,
+ RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC),
+ ConstantInt::get(XLenTy, Log2_64(SEW))};
- Value *Vlseg = Builder.CreateIntrinsic(
- IntrIds[Factor - 2], {VecTupTy, PtrTy, XLenTy},
- {PoisonValue::get(VecTupTy), LI->getPointerOperand(), VL,
- ConstantInt::get(XLenTy, Log2_64(SEW))});
+ CallInst *Vlseg = Builder.CreateCall(VlsegNFunc, Operands);
SmallVector<Type *, 2> AggrTypes{Factor, ResVTy};
- Return = PoisonValue::get(StructType::get(LI->getContext(), AggrTypes));
+ Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes));
for (unsigned i = 0; i < Factor; ++i) {
Value *VecExtract = Builder.CreateIntrinsic(
Intrinsic::riscv_tuple_extract, {ResVTy, VecTupTy},
@@ -293,217 +367,61 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
}
}
- for (auto [Idx, DIV] : enumerate(DeinterleaveValues)) {
- if (!DIV)
- continue;
- // We have to create a brand new ExtractValue to replace each
- // of these old ExtractValue instructions.
- Value *NewEV =
- Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
- DIV->replaceAllUsesWith(NewEV);
- }
-
+ DI->replaceAllUsesWith(Return);
return true;
}
bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
- StoreInst *SI, ArrayRef<Value *> InterleaveValues) const {
+ Instruction *Store, Value *Mask, ArrayRef<Value *> InterleaveValues) const {
unsigned Factor = InterleaveValues.size();
if (Factor > 8)
return false;
- assert(SI->isSimple());
- IRBuilder<> Builder(SI);
+ IRBuilder<> Builder(Store);
auto *InVTy = cast<VectorType>(InterleaveValues[0]->getType());
- auto *PtrTy = SI->getPointerOperandType();
- const DataLayout &DL = SI->getDataLayout();
+ const DataLayout &DL = Store->getDataLayout();
+ Type *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
- if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(),
- SI->getPointerAddressSpace(), DL))
+ Value *Ptr, *VL;
+ Align Alignment;
+ if (!getMemOperands(Factor, InVTy, XLenTy, Store, Ptr, Mask, VL, Alignment))
+ return false;
+ Type *PtrTy = Ptr->getType();
+ unsigned AS = Ptr->getType()->getPointerAddressSpace();
+ if (!isLegalInterleavedAccessType(InVTy, Factor, Alignment, AS, DL))
return false;
- Type *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen());
-
- if (auto *FVTy = dyn_cast<FixedVectorType>(InVTy)) {
+ if (isa<FixedVectorType>(InVTy)) {
Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
- SI->getModule(), FixedVssegIntrIds[Factor - 2], {InVTy, PtrTy, XLenTy});
-
+ Store->getModule(), FixedVssegIntrIds[Factor - 2],
+ {InVTy, PtrTy, XLenTy});
SmallVector<Value *, 10> Ops(InterleaveValues);
- Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements());
- Value *Mask = Builder.getAllOnesMask(FVTy->getElementCount());
- Ops.append({SI->getPointerOperand(), Mask, VL});
-
+ Ops.append({Ptr, Mask, VL});
Builder.CreateCall(VssegNFunc, Ops);
- } else {
- static const Intrinsic::ID IntrIds[] = {
- Intrinsic::riscv_vsseg2, Intrinsic::riscv_vsseg3,
- Intrinsic::riscv_vsseg4, Intrinsic::riscv_vsseg5,
- Intrinsic::riscv_vsseg6, Intrinsic::riscv_vsseg7,
- Intrinsic::riscv_vsseg8};
-
- unsigned SEW = DL.getTypeSizeInBits(InVTy->getElementType());
- unsigned NumElts = InVTy->getElementCount().getKnownMinValue();
- Type *VecTupTy = TargetExtType::get(
- SI->getContext(), "riscv.vector.tuple",
- ScalableVectorType::get(Type::getInt8Ty(SI->getContext()),
- NumElts * SEW / 8),
- Factor);
-
- Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
- SI->getModule(), IntrIds[Factor - 2], {VecTupTy, PtrTy, XLenTy});
-
- Value *VL = Constant::getAllOnesValue(XLenTy);
-
- Value *StoredVal = PoisonValue::get(VecTupTy);
- for (unsigned i = 0; i < Factor; ++i)
- StoredVal = Builder.CreateIntrinsic(
- Intrinsic::riscv_tuple_insert, {VecTupTy, InVTy},
- {StoredVal, InterleaveValues[i], Builder.getInt32(i)});
-
- Builder.CreateCall(VssegNFunc, {StoredVal, SI->getPointerOperand(), VL,
- ConstantInt::get(XLenTy, Log2_64(SEW))});
- }
-
- return true;
-}
-
-static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
- assert(N);
- if (N == 1)
- return true;
-
- using namespace PatternMatch;
- // Right now we're only recognizing the simplest pattern.
- uint64_t C;
- if (match(V, m_CombineOr(m_ConstantInt(C),
- m_c_Mul(m_Value(), m_ConstantInt(C)))) &&
- C && C % N == 0)
return true;
-
- if (isPowerOf2_32(N)) {
- KnownBits KB = llvm::computeKnownBits(V, DL);
- return KB.countMinTrailingZeros() >= Log2_32(N);
}
+ unsigned SEW = DL.getTypeSizeInBits(InVTy->getElementType());
+ unsigned NumElts = InVTy->getElementCount().getKnownMinValue();
+ Type *VecTupTy = TargetExtType::get(
+ Store->getContext(), "riscv.vector.tuple",
+ ScalableVectorType::get(Type::getInt8Ty(Store->getContext()),
+ NumElts * SEW / 8),
+ Factor);
- return false;
-}
-
-/// Lower an interleaved vp.load into a vlsegN intrinsic.
-///
-/// E.g. Lower an interleaved vp.load (Factor = 2):
-/// %l = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr %ptr,
-/// %mask,
-/// i32 %wide.rvl)
-/// %dl = tail call { <vscale x 32 x i8>, <vscale x 32 x i8> }
-/// @llvm.vector.deinterleave2.nxv64i8(
-/// <vscale x 64 x i8> %l)
-/// %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 0
-/// %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 1
-///
-/// Into:
-/// %rvl = udiv %wide.rvl, 2
-/// %sl = call { <vscale x 32 x i8>, <vscale x 32 x i8> }
-/// @llvm.riscv.vlseg2.mask.nxv32i8.i64(<vscale x 32 x i8> undef,
-/// <vscale x 32 x i8> undef,
-/// ptr %ptr,
-/// %mask,
-/// i64 %rvl,
-/// i64 1)
-/// %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 0
-/// %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 1
-///
-/// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be
-/// removed by the caller
-/// TODO: We probably can loosen the dependency on matching extractvalue when
-/// dealing with factor of 2 (extractvalue is still required for most of other
-/// factors though).
-bool RISCVTargetLowering::lowerInterleavedVPLoad(
- VPIntrinsic *Load, Value *Mask,
- ArrayRef<Value *> DeinterleaveResults) const {
- const unsigned Factor = DeinterleaveResults.size();
- assert(Mask && "Expect a valid mask");
- assert(Load->getIntrinsicID() == Intrinsic::vp_load &&
- "Unexpected intrinsic");
-
- Value *FirstActive = *llvm::find_if(DeinterleaveResults,
- [](Value *V) { return V != nullptr; });
- VectorType *VTy = cast<VectorType>(FirstActive->getType());
-
- auto &DL = Load->getModule()->getDataLayout();
- Align Alignment = Load->getParamAlign(0).value_or(
- DL.getABITypeAlign(VTy->getElementType()));
- if (!isLegalInterleavedAccessType(
- VTy, Factor, Alignment,
- Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL))
- return false;
-
- IRBuilder<> Builder(Load);
-
- Value *WideEVL = Load->getVectorLengthParam();
- // Conservatively check if EVL is a multiple of factor, otherwise some
- // (trailing) elements might be lost after the transformation.
- if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor))
- return false;
-
- auto *PtrTy = Load->getArgOperand(0)->getType();
- auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
- Value *EVL = Builder.CreateZExt(
- Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)),
- XLenTy);
-
- Value *Return = nullptr;
- if (auto *FVTy = dyn_cast<FixedVectorType>(VTy)) {
- Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
- {FVTy, PtrTy, XLenTy},
- {Load->getArgOperand(0), Mask, EVL});
- } else {
- unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
- unsigned NumElts = VTy->getElementCount().getKnownMinValue();
- Type *VecTupTy = TargetExtType::get(
- Load->getContext(), "riscv.vector.tuple",
- ScalableVectorType::get(Type::getInt8Ty(Load->getContext()),
- NumElts * SEW / 8),
- Factor);
-
- Value *PoisonVal = PoisonValue::get(VecTupTy);
-
- Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration(
- Load->getModule(), ScalableVlsegIntrIds[Factor - 2],
- {VecTupTy, PtrTy, Mask->getType(), EVL->getType()});
-
- Value *Operands[] = {
- PoisonVal,
- Load->getArgOperand(0),
- Mask,
- EVL,
- ConstantInt::get(XLenTy,
- RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC),
- ConstantInt::get(XLenTy, Log2_64(SEW))};
-
- CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands);
-
- SmallVector<Type *, 8> AggrTypes{Factor, VTy};
- Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes));
- Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration(
- Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy});
- for (unsigned i = 0; i < Factor; ++i) {
- Value *VecExtract =
- Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)});
- Return = Builder.CreateInsertValue(Return, VecExtract, i);
- }
- }
+ Value *StoredVal = PoisonValue::get(VecTupTy);
+ for (unsigned i = 0; i < Factor; ++i)
+ StoredVal = Builder.CreateIntrinsic(
+ Intrinsic::riscv_tuple_insert, {VecTupTy, InVTy},
+ {StoredVal, InterleaveValues[i], Builder.getInt32(i)});
- for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) {
- if (!DIO)
- continue;
- // We have to create a brand new ExtractValue to replace each
- // of these old ExtractValue instructions.
- Value *NewEV =
- Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
- DIO->replaceAllUsesWith(NewEV);
- }
+ Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
+ Store->getModule(), ScalableVssegIntrIds[Factor - 2],
+ {VecTupTy, PtrTy, Mask->getType(), VL->getType()});
+ Value *Operands[] = {StoredVal, Ptr, Mask, VL,
+ ConstantInt::get(XLenTy, Log2_64(SEW))};
+ Builder.CreateCall(VssegNFunc, Operands);
return true;
}
@@ -557,15 +475,15 @@ bool RISCVTargetLowering::lowerInterleavedVPStore(
auto *PtrTy = Store->getArgOperand(1)->getType();
auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
- Value *EVL = Builder.CreateZExt(
- Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)),
- XLenTy);
+ auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
+ Value *EVL =
+ Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
- if (auto *FVTy = dyn_cast<FixedVectorType>(VTy)) {
+ if (isa<FixedVectorType>(VTy)) {
SmallVector<Value *, 8> Operands(InterleaveOperands);
Operands.append({Store->getArgOperand(1), Mask, EVL});
Builder.CreateIntrinsic(FixedVssegIntrIds[Factor - 2],
- {FVTy, PtrTy, XLenTy}, Operands);
+ {VTy, PtrTy, XLenTy}, Operands);
return true;
}
diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
index d257f56cf412..28d64031f8bc 100644
--- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
+++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
@@ -123,7 +123,7 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI,
SmallSet<std::pair<const MachineInstr *, unsigned>, 4> Visited;
SmallVector<std::pair<const MachineInstr *, unsigned>, 4> Worklist;
- Worklist.push_back(std::make_pair(&OrigMI, OrigBits));
+ Worklist.emplace_back(&OrigMI, OrigBits);
while (!Worklist.empty()) {
auto P = Worklist.pop_back_val();
@@ -158,7 +158,6 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI,
case RISCV::MULW:
case RISCV::REMUW:
case RISCV::REMW:
- case RISCV::SLLIW:
case RISCV::SLLW:
case RISCV::SRAIW:
case RISCV::SRAW:
@@ -188,6 +187,7 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI,
if (Bits >= 32)
break;
return false;
+
case RISCV::SEXT_B:
case RISCV::PACKH:
if (Bits >= 8)
@@ -213,7 +213,7 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI,
// as an N-Bit user.
unsigned ShAmt = UserMI->getOperand(2).getImm();
if (Bits > ShAmt) {
- Worklist.push_back(std::make_pair(UserMI, Bits - ShAmt));
+ Worklist.emplace_back(UserMI, Bits - ShAmt);
break;
}
return false;
@@ -225,21 +225,29 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI,
unsigned ShAmt = UserMI->getOperand(2).getImm();
if (Bits >= (ST.getXLen() - ShAmt))
break;
- Worklist.push_back(std::make_pair(UserMI, Bits + ShAmt));
+ Worklist.emplace_back(UserMI, Bits + ShAmt);
+ break;
+ }
+ case RISCV::SLLIW: {
+ unsigned ShAmt = UserMI->getOperand(2).getImm();
+ if (Bits >= 32 - ShAmt)
+ break;
+ Worklist.emplace_back(UserMI, Bits + ShAmt);
break;
}
+
case RISCV::ANDI: {
uint64_t Imm = UserMI->getOperand(2).getImm();
if (Bits >= (unsigned)llvm::bit_width(Imm))
break;
- Worklist.push_back(std::make_pair(UserMI, Bits));
+ Worklist.emplace_back(UserMI, Bits);
break;
}
case RISCV::ORI: {
uint64_t Imm = UserMI->getOperand(2).getImm();
if (Bits >= (unsigned)llvm::bit_width<uint64_t>(~Imm))
break;
- Worklist.push_back(std::make_pair(UserMI, Bits));
+ Worklist.emplace_back(UserMI, Bits);
break;
}
@@ -253,7 +261,7 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI,
break;
return false;
}
- Worklist.push_back(std::make_pair(UserMI, Bits));
+ Worklist.emplace_back(UserMI, Bits);
break;
case RISCV::SRA:
@@ -272,7 +280,7 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI,
// Operand 1 is implicitly zero extended.
if (OpIdx == 1 && Bits >= 32)
break;
- Worklist.push_back(std::make_pair(UserMI, Bits));
+ Worklist.emplace_back(UserMI, Bits);
break;
case RISCV::BEXTI:
@@ -320,13 +328,13 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI,
case RISCV::BSETI:
case RISCV::BCLRI:
case RISCV::BINVI:
- Worklist.push_back(std::make_pair(UserMI, Bits));
+ Worklist.emplace_back(UserMI, Bits);
break;
case RISCV::BREV8:
case RISCV::ORC_B:
// BREV8 and ORC_B work on bytes. Round Bits down to the nearest byte.
- Worklist.push_back(std::make_pair(UserMI, alignDown(Bits, 8)));
+ Worklist.emplace_back(UserMI, alignDown(Bits, 8));
break;
case RISCV::PseudoCCMOVGPR:
@@ -336,7 +344,7 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI,
// of operand 4 and 5 is used.
if (OpIdx != 4 && OpIdx != 5)
return false;
- Worklist.push_back(std::make_pair(UserMI, Bits));
+ Worklist.emplace_back(UserMI, Bits);
break;
case RISCV::CZERO_EQZ:
@@ -345,7 +353,7 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI,
case RISCV::VT_MASKCN:
if (OpIdx != 1)
return false;
- Worklist.push_back(std::make_pair(UserMI, Bits));
+ Worklist.emplace_back(UserMI, Bits);
break;
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVSchedAndes45.td b/llvm/lib/Target/RISCV/RISCVSchedAndes45.td
index da0ceee0c084..5ef858a787c7 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedAndes45.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedAndes45.td
@@ -54,6 +54,12 @@ def : WriteRes<WriteShiftImm32, [Andes45ALU]>;
def : WriteRes<WriteShiftReg, [Andes45ALU]>;
def : WriteRes<WriteShiftReg32, [Andes45ALU]>;
+// Short forward branch
+def : WriteRes<WriteSFB, [Andes45ALU]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
// Branching
def : WriteRes<WriteJmp, [Andes45ALU]>;
def : WriteRes<WriteJal, [Andes45ALU]>;
@@ -231,6 +237,8 @@ def : ReadAdvance<ReadShiftImm, 0>;
def : ReadAdvance<ReadShiftImm32, 0>;
def : ReadAdvance<ReadShiftReg, 0>;
def : ReadAdvance<ReadShiftReg32, 0>;
+def : ReadAdvance<ReadSFBJmp, 0>;
+def : ReadAdvance<ReadSFBALU, 0>;
def : ReadAdvance<ReadJalr, 0>;
def : ReadAdvance<ReadJmp, 0>;
def : ReadAdvance<ReadIMul, 0>;
@@ -328,7 +336,6 @@ def : ReadAdvance<ReadCSR, 0>;
//===----------------------------------------------------------------------===//
// Unsupported extensions
defm : UnsupportedSchedQ;
-defm : UnsupportedSchedSFB;
defm : UnsupportedSchedV;
defm : UnsupportedSchedZabha;
defm : UnsupportedSchedZbkb;
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
index 05388f2d1311..3e286a754e4e 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
@@ -13,6 +13,17 @@
//
//===----------------------------------------------------------------------===//
+class SMX60IsWorstCaseMX<string mx, list<string> MxList> {
+ string LLMUL = LargestLMUL<MxList>.r;
+ bit c = !eq(mx, LLMUL);
+}
+
+class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> {
+ string LLMUL = LargestLMUL<MxList>.r;
+ int SSEW = SmallestSEW<mx, isF>.r;
+ bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
+}
+
def SpacemitX60Model : SchedMachineModel {
let IssueWidth = 2; // dual-issue
let MicroOpBufferSize = 0; // in-order
@@ -44,6 +55,19 @@ let BufferSize = 0 in {
// floating point instructions, this model assumes single issue as
// increasing it reduces the gains we saw in performance
def SMX60_FP : ProcResource<1>;
+
+ // Vector pipeline
+ // Single issue for vector store/load instructions
+ def SMX60_VLS : ProcResource<1>;
+
+ // The C908 user manual says: "Vector floating-point units support vector
+ // floating-point computation of different bits. In addition, vector integer
+ // units are added". Developer confirmed it's a separate VIEU
+ def SMX60_VIEU : ProcResource<1>;
+
+ // The C908 user manual says: "The vector execution unit is developed by
+ // extending the floating-point unit", so let's assume single issue for now
+ def SMX60_VFP : ProcResource<1>;
}
//===----------------------------------------------------------------------===//
@@ -232,9 +256,341 @@ let Latency = 4 in {
def : WriteRes<WriteFMovI32ToF32, [SMX60_IEU]>;
}
+// 6. Configuration-Setting Instructions
+def : WriteRes<WriteVSETVLI, [SMX60_IEUA]>;
+def : WriteRes<WriteVSETIVLI, [SMX60_IEUA]>;
+def : WriteRes<WriteVSETVL, [SMX60_IEUA]>;
+
+// 7. Vector Loads and Stores
+foreach mx = SchedMxList in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
+
+ // Unit-stride loads and stores
+ defm "" : LMULWriteResMX<"WriteVLDE", [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLDFF", [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSTE", [SMX60_VLS], mx, IsWorstCase>;
+
+ // Mask loads and stores
+ defm "" : LMULWriteResMX<"WriteVLDM", [SMX60_VLS], mx, IsWorstCase=!eq(mx, "M1")>;
+ defm "" : LMULWriteResMX<"WriteVSTM", [SMX60_VLS], mx, IsWorstCase=!eq(mx, "M1")>;
+
+ // Strided and indexed loads and stores
+ foreach eew = [8, 16, 32, 64] in {
+ defm "" : LMULWriteResMX<"WriteVLDS" # eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLDUX" # eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLDOX" # eew, [SMX60_VLS], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVSTS" # eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSTUX" # eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSTOX" # eew, [SMX60_VLS], mx, IsWorstCase>;
+ }
+}
+
+// Segmented loads and stores
+foreach mx = SchedMxList in {
+ foreach nf=2-8 in {
+ foreach eew = [8, 16, 32, 64] in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
+
+ // Unit-stride segmented
+ defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+
+ // Strided segmented
+ defm "" : LMULWriteResMX<"WriteVLSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+
+ // Indexed segmented
+ defm "" : LMULWriteResMX<"WriteVLOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVLUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>;
+ }
+ }
+}
+
+// Whole register move/load/store
+foreach LMul = [1, 2, 4, 8] in {
+ def : WriteRes<!cast<SchedWrite>("WriteVLD" # LMul # "R"), [SMX60_VLS]>;
+ def : WriteRes<!cast<SchedWrite>("WriteVST" # LMul # "R"), [SMX60_VLS]>;
+
+ def : WriteRes<!cast<SchedWrite>("WriteVMov" # LMul # "V"), [SMX60_VIEU]>;
+}
+
+// 11. Vector Integer Arithmetic Instructions
+foreach mx = SchedMxList in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
+
+ defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
+}
+
+// Widening
+foreach mx = SchedMxListW in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
+
+ defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
+}
+
+// Vector Integer Division and Remainder
+foreach mx = SchedMxList in {
+ foreach sew = SchedSEWSet<mx>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ }
+}
+
+// Narrowing Shift and Clips
+foreach mx = SchedMxListW in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
+
+ defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>;
+}
+
+// 12. Vector Fixed-Point Arithmetic Instructions
+foreach mx = SchedMxList in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
+
+ defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>;
+}
+
+// 13. Vector Floating-Point Instructions
+foreach mx = SchedMxListF in {
+ foreach sew = SchedSEWSet<mx, isF=1>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+}
+
+foreach mx = SchedMxListF in {
+ foreach sew = SchedSEWSet<mx, isF=1>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+}
+
+foreach mx = SchedMxList in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
+
+ defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
+}
+
+// Widening
+foreach mx = SchedMxListW in {
+ foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+}
+
+foreach mx = SchedMxListFW in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListFW>.c;
+
+ defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
+}
+
+foreach mx = SchedMxListFW in {
+ foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+}
+
+// Narrowing
+foreach mx = SchedMxListW in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
+
+ defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
+}
+
+foreach mx = SchedMxListFW in {
+ foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
+
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+}
+
+// Vector Floating-Point Division and Square Root
+foreach mx = SchedMxListF in {
+ foreach sew = SchedSEWSet<mx, 1>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+}
+
+// 14. Vector Reduction Operations
+foreach mx = SchedMxList in {
+ foreach sew = SchedSEWSet<mx>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ }
+}
+
+foreach mx = SchedMxListWRed in {
+ foreach sew = SchedSEWSet<mx, 0, 1>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ }
+}
+
+foreach mx = SchedMxListF in {
+ foreach sew = SchedSEWSet<mx, 1>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+}
+
+foreach mx = SchedMxListFWRed in {
+ foreach sew = SchedSEWSet<mx, 1, 1>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, 1>.c;
+
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>;
+ }
+}
+
+// 15. Vector Mask Instructions
+foreach mx = SchedMxList in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
+
+ defm "" : LMULWriteResMX<"WriteVMALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVMPopV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVMFFSV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVMSFSV", [SMX60_VIEU], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVIotaV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIdxV", [SMX60_VIEU], mx, IsWorstCase>;
+}
+
+// 16. Vector Permutation Instructions
+foreach mx = SchedMxList in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
+
+ defm "" : LMULWriteResMX<"WriteVSlideI", [SMX60_VIEU], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVISlide1X", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFSlide1F", [SMX60_VFP], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVSlideUpX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVSlideDownX", [SMX60_VIEU], mx, IsWorstCase>;
+}
+
+def : WriteRes<WriteVMovXS, [SMX60_VIEU]>;
+def : WriteRes<WriteVMovSX, [SMX60_VIEU]>;
+
+def : WriteRes<WriteVMovFS, [SMX60_VIEU]>;
+def : WriteRes<WriteVMovSF, [SMX60_VIEU]>;
+
+// Gather and Compress
+foreach mx = SchedMxList in {
+ foreach sew = SchedSEWSet<mx>.val in {
+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ }
+}
+
+foreach mx = SchedMxList in {
+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
+
+ defm "" : LMULWriteResMX<"WriteVRGatherVX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVRGatherVI", [SMX60_VIEU], mx, IsWorstCase>;
+}
+
// Others
def : WriteRes<WriteCSR, [SMX60_IEU]>;
def : WriteRes<WriteNop, [SMX60_IEU]>;
+def : WriteRes<WriteRdVLENB, [SMX60_IEUA]>;
//===----------------------------------------------------------------------===//
// Bypass and advance
@@ -341,10 +697,184 @@ def : ReadAdvance<ReadCLMUL, 0>;
def : ReadAdvance<ReadSingleBit, 0>;
def : ReadAdvance<ReadSingleBitImm, 0>;
+// 6. Configuration-Setting Instructions
+def : ReadAdvance<ReadVSETVLI, 0>;
+def : ReadAdvance<ReadVSETVL, 0>;
+
+// 7. Vector Loads and Stores
+def : ReadAdvance<ReadVLDX, 0>;
+def : ReadAdvance<ReadVSTX, 0>;
+defm "" : LMULReadAdvance<"ReadVSTEV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTM", 0>;
+def : ReadAdvance<ReadVLDSX, 0>;
+def : ReadAdvance<ReadVSTSX, 0>;
+defm "" : LMULReadAdvance<"ReadVSTS8V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTS16V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTS32V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTS64V", 0>;
+defm "" : LMULReadAdvance<"ReadVLDUXV", 0>;
+defm "" : LMULReadAdvance<"ReadVLDOXV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX8", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX16", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX32", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX64", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUXV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX8V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX16V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX32V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX64V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX8", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX16", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX32", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX64", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOXV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX8V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX16V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX32V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX64V", 0>;
+// LMUL Aware
+def : ReadAdvance<ReadVST1R, 0>;
+def : ReadAdvance<ReadVST2R, 0>;
+def : ReadAdvance<ReadVST4R, 0>;
+def : ReadAdvance<ReadVST8R, 0>;
+
+// 12. Vector Integer Arithmetic Instructions
+defm : LMULReadAdvance<"ReadVIALUV", 0>;
+defm : LMULReadAdvance<"ReadVIALUX", 0>;
+defm : LMULReadAdvanceW<"ReadVIWALUV", 0>;
+defm : LMULReadAdvanceW<"ReadVIWALUX", 0>;
+defm : LMULReadAdvance<"ReadVExtV", 0>;
+defm : LMULReadAdvance<"ReadVICALUV", 0>;
+defm : LMULReadAdvance<"ReadVICALUX", 0>;
+defm : LMULReadAdvance<"ReadVShiftV", 0>;
+defm : LMULReadAdvance<"ReadVShiftX", 0>;
+defm : LMULReadAdvanceW<"ReadVNShiftV", 0>;
+defm : LMULReadAdvanceW<"ReadVNShiftX", 0>;
+defm : LMULReadAdvance<"ReadVICmpV", 0>;
+defm : LMULReadAdvance<"ReadVICmpX", 0>;
+defm : LMULReadAdvance<"ReadVIMinMaxV", 0>;
+defm : LMULReadAdvance<"ReadVIMinMaxX", 0>;
+defm : LMULReadAdvance<"ReadVIMulV", 0>;
+defm : LMULReadAdvance<"ReadVIMulX", 0>;
+defm : LMULSEWReadAdvance<"ReadVIDivV", 0>;
+defm : LMULSEWReadAdvance<"ReadVIDivX", 0>;
+defm : LMULReadAdvanceW<"ReadVIWMulV", 0>;
+defm : LMULReadAdvanceW<"ReadVIWMulX", 0>;
+defm : LMULReadAdvance<"ReadVIMulAddV", 0>;
+defm : LMULReadAdvance<"ReadVIMulAddX", 0>;
+defm : LMULReadAdvanceW<"ReadVIWMulAddV", 0>;
+defm : LMULReadAdvanceW<"ReadVIWMulAddX", 0>;
+defm : LMULReadAdvance<"ReadVIMergeV", 0>;
+defm : LMULReadAdvance<"ReadVIMergeX", 0>;
+defm : LMULReadAdvance<"ReadVIMovV", 0>;
+defm : LMULReadAdvance<"ReadVIMovX", 0>;
+
+// 13. Vector Fixed-Point Arithmetic Instructions
+defm "" : LMULReadAdvance<"ReadVSALUV", 0>;
+defm "" : LMULReadAdvance<"ReadVSALUX", 0>;
+defm "" : LMULReadAdvance<"ReadVAALUV", 0>;
+defm "" : LMULReadAdvance<"ReadVAALUX", 0>;
+defm "" : LMULReadAdvance<"ReadVSMulV", 0>;
+defm "" : LMULReadAdvance<"ReadVSMulX", 0>;
+defm "" : LMULReadAdvance<"ReadVSShiftV", 0>;
+defm "" : LMULReadAdvance<"ReadVSShiftX", 0>;
+defm "" : LMULReadAdvanceW<"ReadVNClipV", 0>;
+defm "" : LMULReadAdvanceW<"ReadVNClipX", 0>;
+
+// 14. Vector Floating-Point Instructions
+defm "" : LMULSEWReadAdvanceF<"ReadVFALUV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFALUF", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWALUV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWALUF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMulV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMulF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFDivV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFDivF", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMulAddV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMulAddF", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulAddV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulAddF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFSqrtV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFRecpV", 0>;
+defm "" : LMULReadAdvance<"ReadVFCmpV", 0>;
+defm "" : LMULReadAdvance<"ReadVFCmpF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMinMaxV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMinMaxF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFSgnjV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFSgnjF", 0>;
+defm "" : LMULReadAdvance<"ReadVFClassV", 0>;
+defm "" : LMULReadAdvance<"ReadVFMergeV", 0>;
+defm "" : LMULReadAdvance<"ReadVFMergeF", 0>;
+defm "" : LMULReadAdvance<"ReadVFMovF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFCvtIToFV", 0>;
+defm "" : LMULReadAdvance<"ReadVFCvtFToIV", 0>;
+defm "" : LMULSEWReadAdvanceW<"ReadVFWCvtIToFV", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFWCvtFToIV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWCvtFToFV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFNCvtIToFV", 0>;
+defm "" : LMULReadAdvanceW<"ReadVFNCvtFToIV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFNCvtFToFV", 0>;
+
+// 15. Vector Reduction Operations
+def : ReadAdvance<ReadVIRedV, 0>;
+def : ReadAdvance<ReadVIRedV0, 0>;
+def : ReadAdvance<ReadVIWRedV, 0>;
+def : ReadAdvance<ReadVIWRedV0, 0>;
+def : ReadAdvance<ReadVFRedV, 0>;
+def : ReadAdvance<ReadVFRedV0, 0>;
+def : ReadAdvance<ReadVFRedOV, 0>;
+def : ReadAdvance<ReadVFRedOV0, 0>;
+def : ReadAdvance<ReadVFWRedV, 0>;
+def : ReadAdvance<ReadVFWRedV0, 0>;
+def : ReadAdvance<ReadVFWRedOV, 0>;
+def : ReadAdvance<ReadVFWRedOV0, 0>;
+
+// 16. Vector Mask Instructions
+defm "" : LMULReadAdvance<"ReadVMALUV", 0>;
+defm "" : LMULReadAdvance<"ReadVMPopV", 0>;
+defm "" : LMULReadAdvance<"ReadVMFFSV", 0>;
+defm "" : LMULReadAdvance<"ReadVMSFSV", 0>;
+defm "" : LMULReadAdvance<"ReadVIotaV", 0>;
+
+// 17. Vector Permutation Instructions
+def : ReadAdvance<ReadVMovXS, 0>;
+def : ReadAdvance<ReadVMovSX_V, 0>;
+def : ReadAdvance<ReadVMovSX_X, 0>;
+def : ReadAdvance<ReadVMovFS, 0>;
+def : ReadAdvance<ReadVMovSF_V, 0>;
+def : ReadAdvance<ReadVMovSF_F, 0>;
+defm "" : LMULReadAdvance<"ReadVISlideV", 0>;
+defm "" : LMULReadAdvance<"ReadVISlideX", 0>;
+defm "" : LMULReadAdvance<"ReadVFSlideV", 0>;
+defm "" : LMULReadAdvance<"ReadVFSlideF", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_index", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_index", 0>;
+defm "" : LMULReadAdvance<"ReadVRGatherVX_data", 0>;
+defm "" : LMULReadAdvance<"ReadVRGatherVX_index", 0>;
+defm "" : LMULReadAdvance<"ReadVRGatherVI_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVCompressV", 0>;
+// LMUL Aware
+def : ReadAdvance<ReadVMov1V, 0>;
+def : ReadAdvance<ReadVMov2V, 0>;
+def : ReadAdvance<ReadVMov4V, 0>;
+def : ReadAdvance<ReadVMov8V, 0>;
+
+// Others
+def : ReadAdvance<ReadVMask, 0>;
+def : ReadAdvance<ReadVPassthru_WorstCase, 0>;
+foreach mx = SchedMxList in {
+ def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx), 0>;
+ foreach sew = SchedSEWSet<mx>.val in
+ def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx # "_E" # sew), 0>;
+}
+
//===----------------------------------------------------------------------===//
// Unsupported extensions
defm : UnsupportedSchedQ;
-defm : UnsupportedSchedV;
defm : UnsupportedSchedZabha;
defm : UnsupportedSchedZbkb;
defm : UnsupportedSchedZbkx;
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index 2d9f38221d42..e656e8bb99d8 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -747,6 +747,14 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
return TwoTimes ? MILog2SEW + 1 : MILog2SEW;
}
+ // Vector Register Gather with 16-bit Index Elements Instruction
+ // Dest and source data EEW=SEW. Index vector EEW=16.
+ case RISCV::VRGATHEREI16_VV: {
+ if (MO.getOperandNo() == 2)
+ return 4;
+ return MILog2SEW;
+ }
+
default:
return std::nullopt;
}
@@ -966,6 +974,13 @@ static bool isSupportedInstr(const MachineInstr &MI) {
case RISCV::VADC_VIM:
case RISCV::VADC_VVM:
case RISCV::VADC_VXM:
+ case RISCV::VMADC_VIM:
+ case RISCV::VMADC_VVM:
+ case RISCV::VMADC_VXM:
+ case RISCV::VSBC_VVM:
+ case RISCV::VSBC_VXM:
+ case RISCV::VMSBC_VVM:
+ case RISCV::VMSBC_VXM:
// Vector Widening Integer Multiply-Add Instructions
case RISCV::VWMACCU_VV:
case RISCV::VWMACCU_VX:
@@ -1051,6 +1066,11 @@ static bool isSupportedInstr(const MachineInstr &MI) {
case RISCV::VSLIDEDOWN_VI:
case RISCV::VSLIDE1UP_VX:
case RISCV::VFSLIDE1UP_VF:
+ // Vector Register Gather Instructions
+ case RISCV::VRGATHER_VI:
+ case RISCV::VRGATHER_VV:
+ case RISCV::VRGATHER_VX:
+ case RISCV::VRGATHEREI16_VV:
// Vector Single-Width Floating-Point Add/Subtract Instructions
case RISCV::VFADD_VF:
case RISCV::VFADD_VV:
@@ -1132,6 +1152,8 @@ static bool isSupportedInstr(const MachineInstr &MI) {
case RISCV::VMFLE_VV:
case RISCV::VMFGT_VF:
case RISCV::VMFGE_VF:
+ // Vector Floating-Point Classify Instruction
+ case RISCV::VFCLASS_V:
// Vector Floating-Point Merge Instruction
case RISCV::VFMERGE_VFM:
// Vector Floating-Point Move Instruction
@@ -1346,9 +1368,7 @@ RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const {
const MachineInstr &UserMI = *UserOp.getParent();
LLVM_DEBUG(dbgs() << " Checking user: " << UserMI << "\n");
- if (UserMI.isCopy() && UserMI.getOperand(0).getReg().isVirtual() &&
- UserMI.getOperand(0).getSubReg() == RISCV::NoSubRegister &&
- UserMI.getOperand(1).getSubReg() == RISCV::NoSubRegister) {
+ if (UserMI.isFullCopy() && UserMI.getOperand(0).getReg().isVirtual()) {
LLVM_DEBUG(dbgs() << " Peeking through uses of COPY\n");
Worklist.insert_range(llvm::make_pointer_range(
MRI->use_operands(UserMI.getOperand(0).getReg())));
diff --git a/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp
index be54a8c95a97..3bd2705f021a 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp
@@ -10,6 +10,10 @@
// instructions and masked instructions, so that we can reduce the live range
// overlaps of mask registers.
//
+// If there are multiple mask producers followed by multiple masked
+// instructions, then at each masked instruction add dependency edges between
+// every producer and that masked instruction.
+//
// The reason why we need to do this:
// 1. When tracking register pressure, we don't track physical registers.
// 2. We have a RegisterClass for mask register (which is `VMV0`), but we don't
@@ -40,9 +44,8 @@
namespace llvm {
static bool isCopyToV0(const MachineInstr &MI) {
- return MI.isCopy() && MI.getOperand(0).getReg() == RISCV::V0 &&
- MI.getOperand(1).getReg().isVirtual() &&
- MI.getOperand(1).getSubReg() == RISCV::NoSubRegister;
+ return MI.isFullCopy() && MI.getOperand(0).getReg() == RISCV::V0 &&
+ MI.getOperand(1).getReg().isVirtual();
}
static bool isSoleUseCopyToV0(SUnit &SU) {
@@ -68,11 +71,27 @@ public:
void apply(ScheduleDAGInstrs *DAG) override {
SUnit *NearestUseV0SU = nullptr;
+ SmallVector<SUnit *, 2> DefMask;
for (SUnit &SU : DAG->SUnits) {
const MachineInstr *MI = SU.getInstr();
- if (MI->findRegisterUseOperand(RISCV::V0, TRI))
+ bool UseV0 = MI->findRegisterUseOperand(RISCV::V0, TRI);
+ if (isSoleUseCopyToV0(SU) && !UseV0)
+ DefMask.push_back(&SU);
+
+ if (UseV0) {
NearestUseV0SU = &SU;
+ // Copy may not be a real use, so skip it here.
+ if (DefMask.size() > 1 && !MI->isCopy()) {
+ for (SUnit *Def : DefMask)
+ if (DAG->canAddEdge(Def, &SU))
+ DAG->addEdge(Def, SDep(&SU, SDep::Artificial));
+ }
+
+ if (!DefMask.empty())
+ DefMask.erase(DefMask.begin());
+ }
+
if (NearestUseV0SU && NearestUseV0SU != &SU && isSoleUseCopyToV0(SU) &&
// For LMUL=8 cases, there will be more possibilities to spill.
// FIXME: We should use RegPressureTracker to do fine-grained
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
index 2a424e673ddf..a7f6fbceffc3 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
@@ -19,7 +19,6 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index 6897865eb4e1..ea78dcd13526 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -1364,7 +1364,24 @@ defm : DemangledGetBuiltin<"get_sub_group_gt_mask", OpenCL_std, Variable, Subgro
defm : DemangledGetBuiltin<"get_sub_group_le_mask", OpenCL_std, Variable, SubgroupLeMask>;
defm : DemangledGetBuiltin<"get_sub_group_lt_mask", OpenCL_std, Variable, SubgroupLtMask>;
defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalLinearId", OpenCL_std, Variable, GlobalLinearId>;
-defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalInvocationId", OpenCL_std, Variable, GlobalInvocationId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInLocalInvocationIndex", OpenCL_std, Variable, LocalInvocationIndex>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInWorkDim", OpenCL_std, Variable, WorkDim>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupSize", OpenCL_std, Variable, SubgroupSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupMaxSize", OpenCL_std, Variable, SubgroupMaxSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInNumSubgroups", OpenCL_std, Variable, NumSubgroups>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInNumEnqueuedSubgroups", OpenCL_std, Variable, NumEnqueuedSubgroups>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupId", OpenCL_std, Variable, SubgroupId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLocalInvocationId", OpenCL_std, Variable, SubgroupLocalInvocationId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupEqMask", OpenCL_std, Variable, SubgroupEqMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupEqMaskKHR", OpenCL_std, Variable, SubgroupEqMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGeMask", OpenCL_std, Variable, SubgroupGeMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGeMaskKHR", OpenCL_std, Variable, SubgroupGeMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGtMask", OpenCL_std, Variable, SubgroupGtMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGtMaskKHR", OpenCL_std, Variable, SubgroupGtMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLeMask", OpenCL_std, Variable, SubgroupLeMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLeMaskKHR", OpenCL_std, Variable, SubgroupLeMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLtMask", OpenCL_std, Variable, SubgroupLtMask>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLtMaskKHR", OpenCL_std, Variable, SubgroupLtMask>;
// GetQuery builtin records:
defm : DemangledGetBuiltin<"get_local_id", OpenCL_std, GetQuery, LocalInvocationId>;
@@ -1375,6 +1392,14 @@ defm : DemangledGetBuiltin<"get_group_id", OpenCL_std, GetQuery, WorkgroupId>;
defm : DemangledGetBuiltin<"get_enqueued_local_size", OpenCL_std, GetQuery, EnqueuedWorkgroupSize>;
defm : DemangledGetBuiltin<"get_num_groups", OpenCL_std, GetQuery, NumWorkgroups>;
defm : DemangledGetBuiltin<"get_global_offset", OpenCL_std, GetQuery, GlobalOffset>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInLocalInvocationId", OpenCL_std, GetQuery, LocalInvocationId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalInvocationId", OpenCL_std, GetQuery, GlobalInvocationId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInWorkgroupSize", OpenCL_std, GetQuery, WorkgroupSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalSize", OpenCL_std, GetQuery, GlobalSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInWorkgroupId", OpenCL_std, GetQuery, WorkgroupId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInEnqueuedWorkgroupSize", OpenCL_std, GetQuery, EnqueuedWorkgroupSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInNumWorkgroups", OpenCL_std, GetQuery, NumWorkgroups>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalOffset", OpenCL_std, GetQuery, GlobalOffset>;
defm : DemangledGetBuiltin<"__hlsl_wave_get_lane_index", GLSL_std_450, Wave, SubgroupLocalInvocationId>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index fd0bea0b9047..6608b3f2cbef 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -3120,6 +3120,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
return selectExtInst(ResVReg, ResType, I, CL::fract, GL::Fract);
case Intrinsic::spv_normalize:
return selectExtInst(ResVReg, ResType, I, CL::normalize, GL::Normalize);
+ case Intrinsic::spv_refract:
+ return selectExtInst(ResVReg, ResType, I, GL::Refract);
case Intrinsic::spv_reflect:
return selectExtInst(ResVReg, ResType, I, GL::Reflect);
case Intrinsic::spv_rsqrt:
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
index 2a581d381d4a..4a9c88bfa6d3 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
@@ -68,7 +68,7 @@ unsigned SparcELFObjectWriter::getRelocType(const MCFixup &Fixup,
// Extract the relocation type from the fixup kind, after applying STT_TLS as
// needed.
- unsigned Kind = Fixup.getTargetKind();
+ auto Kind = Fixup.getKind();
if (mc::isRelocation(Fixup.getKind()))
return Kind;
@@ -93,7 +93,7 @@ unsigned SparcELFObjectWriter::getRelocType(const MCFixup &Fixup,
}
// clang-format off
- switch(Fixup.getTargetKind()) {
+ switch(Fixup.getKind()) {
default:
llvm_unreachable("Unimplemented fixup -> relocation");
case FK_NONE: return ELF::R_SPARC_NONE;
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
index 233585346946..cfa3511436b9 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -11,7 +11,6 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/SparcFixupKinds.h"
-#include "MCTargetDesc/SparcMCAsmInfo.h"
#include "SparcMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
diff --git a/llvm/lib/Target/Sparc/Sparc.td b/llvm/lib/Target/Sparc/Sparc.td
index 8588d2d28b71..cee671e34951 100644
--- a/llvm/lib/Target/Sparc/Sparc.td
+++ b/llvm/lib/Target/Sparc/Sparc.td
@@ -64,6 +64,10 @@ def FeatureOSA2011
: SubtargetFeature<"osa2011", "IsOSA2011", "true",
"Enable Oracle SPARC Architecture 2011 extensions",
[FeatureV9, FeatureVIS, FeatureVIS2, FeatureVIS3]>;
+def FeatureCrypto
+ : SubtargetFeature<"crypto", "IsCrypto", "true",
+ "Enable cryptographic extensions",
+ [FeatureOSA2011]>;
def FeatureLeon
: SubtargetFeature<"leon", "IsLeon", "true",
"Enable LEON extensions">;
@@ -175,7 +179,8 @@ def : Proc<"niagara3", [FeatureV9, FeatureV8Deprecated, UsePopc,
FeatureUA2005, FeatureUA2007]>;
def : Proc<"niagara4", [FeatureV9, FeatureV8Deprecated, UsePopc,
FeatureVIS, FeatureVIS2, FeatureVIS3,
- FeatureUA2005, FeatureUA2007, FeatureOSA2011]>;
+ FeatureUA2005, FeatureUA2007, FeatureOSA2011,
+ FeatureCrypto]>;
// LEON 2 FT generic
def : Processor<"leon2", LEON2Itineraries,
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index 21dbe8f585b3..9b434d87c267 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1828,16 +1828,8 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
// .umul works for both signed and unsigned
setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
- setLibcallImpl(RTLIB::MUL_I32, RTLIB::sparc_umul);
-
setOperationAction(ISD::SDIV, MVT::i32, Expand);
- setLibcallImpl(RTLIB::SDIV_I32, RTLIB::sparc_div);
-
setOperationAction(ISD::UDIV, MVT::i32, Expand);
- setLibcallImpl(RTLIB::UDIV_I32, RTLIB::sparc_udiv);
-
- setLibcallImpl(RTLIB::SREM_I32, RTLIB::sparc_rem);
- setLibcallImpl(RTLIB::UREM_I32, RTLIB::sparc_urem);
}
if (Subtarget->is64Bit()) {
@@ -1896,14 +1888,6 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FNEG, MVT::f128, Custom);
setOperationAction(ISD::FABS, MVT::f128, Custom);
}
-
- if (!Subtarget->is64Bit()) {
- setLibcallImpl(RTLIB::FPTOSINT_F128_I64, RTLIB::_Q_qtoll);
- setLibcallImpl(RTLIB::FPTOUINT_F128_I64, RTLIB::_Q_qtoull);
- setLibcallImpl(RTLIB::SINTTOFP_I64_F128, RTLIB::_Q_lltoq);
- setLibcallImpl(RTLIB::UINTTOFP_I64_F128, RTLIB::_Q_ulltoq);
- }
-
} else {
// Custom legalize f128 operations.
@@ -1948,10 +1932,6 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
setLibcallImpl(RTLIB::FPTOUINT_F128_I32, RTLIB::_Q_qtou);
setLibcallImpl(RTLIB::SINTTOFP_I32_F128, RTLIB::_Q_itoq);
setLibcallImpl(RTLIB::UINTTOFP_I32_F128, RTLIB::_Q_utoq);
- setLibcallImpl(RTLIB::FPTOSINT_F128_I64, RTLIB::_Q_qtoll);
- setLibcallImpl(RTLIB::FPTOUINT_F128_I64, RTLIB::_Q_qtoull);
- setLibcallImpl(RTLIB::SINTTOFP_I64_F128, RTLIB::_Q_lltoq);
- setLibcallImpl(RTLIB::UINTTOFP_I64_F128, RTLIB::_Q_ulltoq);
setLibcallImpl(RTLIB::FPEXT_F32_F128, RTLIB::_Q_stoq);
setLibcallImpl(RTLIB::FPEXT_F64_F128, RTLIB::_Q_dtoq);
setLibcallImpl(RTLIB::FPROUND_F128_F32, RTLIB::_Q_qtos);
diff --git a/llvm/lib/Target/Sparc/SparcInstrCrypto.td b/llvm/lib/Target/Sparc/SparcInstrCrypto.td
new file mode 100644
index 000000000000..04b116c2ded8
--- /dev/null
+++ b/llvm/lib/Target/Sparc/SparcInstrCrypto.td
@@ -0,0 +1,98 @@
+//===----------- SparcInstrCrypto.td - cryptographic extensions -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains instruction formats, definitions and patterns needed for
+// cryptographic instructions on SPARC.
+//===----------------------------------------------------------------------===//
+
+
+// Convenience template for 4-operand instructions taking an immediate $rs3
+class FourOpImm<string OpcStr, bits<6> op3val, bits<4> op5val,
+ RegisterClass RC>
+ : F3_4<op3val, op5val, (outs RC:$rd), (ins RC:$rs1, RC:$rs2, simm5Op:$rs3),
+ !strconcat(OpcStr, " $rs1, $rs2, $rs3, $rd")>;
+
+let Predicates = [HasCrypto] in {
+def AES_EROUND01 : FourOp<"aes_eround01", 0b011001, 0b0000, DFPRegs>;
+def AES_EROUND23 : FourOp<"aes_eround23", 0b011001, 0b0001, DFPRegs>;
+def AES_DROUND01 : FourOp<"aes_dround01", 0b011001, 0b0010, DFPRegs>;
+def AES_DROUND23 : FourOp<"aes_dround23", 0b011001, 0b0011, DFPRegs>;
+def AES_EROUND01_LAST : FourOp<"aes_eround01_l", 0b011001, 0b0100, DFPRegs>;
+def AES_EROUND23_LAST : FourOp<"aes_eround23_l", 0b011001, 0b0101, DFPRegs>;
+def AES_DROUND01_LAST : FourOp<"aes_dround01_l", 0b011001, 0b0110, DFPRegs>;
+def AES_DROUND23_LAST : FourOp<"aes_dround23_l", 0b011001, 0b0111, DFPRegs>;
+def AES_KEXPAND0 : F3_3<2, 0b110110, 0b100110000,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "aes_kexpand0 $rs1, $rs2, $rd", []>;
+def AES_KEXPAND1 : FourOpImm<"aes_kexpand1", 0b011001, 0b1000, DFPRegs>;
+def AES_KEXPAND2 : F3_3<2, 0b110110, 0b100110001,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "aes_kexpand2 $rs1, $rs2, $rd", []>;
+
+def CAMELLIA_F : FourOp<"camellia_f", 0b011001, 0b1100, DFPRegs>;
+def CAMELLIA_FL : F3_3<2, 0b110110, 0b100111100,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "camellia_fl $rs1, $rs2, $rd", []>;
+def CAMELLIA_FLI : F3_3<2, 0b110110, 0b100111101,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "camellia_fli $rs1, $rs2, $rd", []>;
+
+def CRC32C : F3_3<2, 0b110110, 0b101000111,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "crc32c $rs1, $rs2, $rd", []>;
+
+def DES_ROUND : FourOp<"des_round", 0b011001, 0b1001, DFPRegs>;
+let rs2 = 0 in {
+def DES_IP : F3_3<2, 0b110110, 0b100110100,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1),
+ "des_ip $rs1, $rd", []>;
+def DES_IIP : F3_3<2, 0b110110, 0b100110101,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1),
+ "des_iip $rs1, $rd", []>;
+}
+def DES_KEXPAND : F3_3<2, 0b110110, 0b100110110,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, simm5Op:$rs2),
+ "des_kexpand $rs1, $rs2, $rd", []>;
+
+let rs1 = 0, rs2 = 0, rd = 0 in {
+let Uses = [D0, D1, D2, D5, D6, D7, D8, D9, D10, D11],
+ Defs = [D0, D1, D2, D3, D4, D5, D6, D7] in
+def MD5 : F3_3<2, 0b110110, 0b101000000, (outs), (ins), "md5", []>;
+let Uses = [D0, D1, D2, D4, D5, D6, D7, D8, D9, D10, D11],
+ Defs = [D0, D1, D2] in
+def SHA1 : F3_3<2, 0b110110, 0b101000001, (outs), (ins), "sha1", []>;
+let Uses = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11],
+ Defs = [D0, D1, D2, D3] in
+def SHA256 : F3_3<2, 0b110110, 0b101000010, (outs), (ins), "sha256", []>;
+let Uses = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11,
+ D12, D13, D14, D15, D16, D17, D18, D19, D20, D21, D22, D23],
+ Defs = [D0, D1, D2, D3, D4, D5, D6, D7] in
+def SHA512 : F3_3<2, 0b110110, 0b101000011, (outs), (ins), "sha512", []>;
+}
+
+// These instructions use and clobber all DFP and non-reserved Int registers.
+let rs1 = 0, rd = 0,
+Uses = [ D0, D1, D2, D3, D4, D5, D6, D7,
+ D8, D9, D10, D11, D12, D13, D14, D15,
+ D16, D17, D18, D19, D20, D21, D22, D23,
+ D24, D25, D26, D27, D28, D29, D30, D31,
+ O0, O1, O2, O3, O4, O5,
+ L0, L1, L2, L3, L4, L5, L6, L7,
+ I0, I1, I2, I3, I4, I5 ],
+Defs = [ D0, D1, D2, D3, D4, D5, D6, D7,
+ D8, D9, D10, D11, D12, D13, D14, D15,
+ D16, D17, D18, D19, D20, D21, D22, D23,
+ D24, D25, D26, D27, D28, D29, D30, D31,
+ O0, O1, O2, O3, O4, O5,
+ L0, L1, L2, L3, L4, L5, L6, L7,
+ I0, I1, I2, I3, I4, I5 ] in {
+def MPMUL : F3_3<2, 0b110110, 0b101001000, (outs), (ins simm5Op:$rs2), "mpmul $rs2", []>;
+def MONTMUL : F3_3<2, 0b110110, 0b101001001, (outs), (ins simm5Op:$rs2), "montmul $rs2", []>;
+def MONTSQR : F3_3<2, 0b110110, 0b101001010, (outs), (ins simm5Op:$rs2), "montsqr $rs2", []>;
+}
+} // Predicates = [HasCrypto]
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.td b/llvm/lib/Target/Sparc/SparcInstrInfo.td
index 1be017be1c64..1a32eafb0e83 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.td
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.td
@@ -60,6 +60,10 @@ def HasUA2007 : Predicate<"Subtarget->isUA2007()">,
def HasOSA2011 : Predicate<"Subtarget->isOSA2011()">,
AssemblerPredicate<(all_of FeatureOSA2011)>;
+// HasCrypto - This is true when the target processor has cryptographic extensions.
+def HasCrypto : Predicate<"Subtarget->isCrypto()">,
+ AssemblerPredicate<(all_of FeatureCrypto)>;
+
// HasHardQuad - This is true when the target processor supports quad floating
// point instructions.
def HasHardQuad : Predicate<"Subtarget->hasHardQuad()">;
@@ -2011,4 +2015,5 @@ def : Pat<(build_vector (i32 IntRegs:$a1), (i32 IntRegs:$a2)),
include "SparcInstr64Bit.td"
include "SparcInstrVIS.td"
include "SparcInstrUAOSA.td"
+include "SparcInstrCrypto.td"
include "SparcInstrAliases.td"
diff --git a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
index 711bf9b31a37..b19196475908 100644
--- a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
+++ b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#include "SparcTargetObjectFile.h"
-#include "MCTargetDesc/SparcMCAsmInfo.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
diff --git a/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
index 6ae529e97418..31b4f1196392 100644
--- a/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
+++ b/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -327,6 +327,8 @@ DecodeStatus SystemZDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
ArrayRef<uint8_t> Bytes,
uint64_t Address,
raw_ostream &CS) const {
+ CommentStream = &CS;
+
// Get the first two bytes of the instruction.
Size = 0;
if (Bytes.size() < 2)
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp
index 9121f0d44936..3ef6030ba518 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp
@@ -137,10 +137,10 @@ void SystemZHLASMAsmStreamer::EmitComment() {
}
void SystemZHLASMAsmStreamer::emitValueToAlignment(Align Alignment,
- int64_t Value,
- unsigned ValueSize,
+ int64_t Fill,
+ uint8_t FillLen,
unsigned MaxBytesToEmit) {
- emitAlignmentDS(Alignment.value(), Value, ValueSize, MaxBytesToEmit);
+ emitAlignmentDS(Alignment.value(), Fill, FillLen, MaxBytesToEmit);
}
void SystemZHLASMAsmStreamer::emitCodeAlignment(Align Alignment,
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.h
index c5275339ce01..93b1ac4d901a 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.h
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.h
@@ -86,9 +86,8 @@ public:
void emitAlignmentDS(uint64_t ByteAlignment, std::optional<int64_t> Value,
unsigned ValueSize, unsigned MaxBytesToEmit);
- void emitValueToAlignment(Align Alignment, int64_t Value = 0,
- unsigned ValueSize = 1,
- unsigned MaxBytesToEmit = 0) override;
+ void emitValueToAlignment(Align Alignment, int64_t Fill, uint8_t FillLen,
+ unsigned MaxBytesToEmit) override;
void emitCodeAlignment(Align Alignment, const MCSubtargetInfo *STI,
unsigned MaxBytesToEmit = 0) override;
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
index 4bef8ff9bbac..629791631080 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -533,7 +533,7 @@ void SystemZELFFrameLowering::emitPrologue(MachineFunction &MF,
const SystemZSubtarget &STI = MF.getSubtarget<SystemZSubtarget>();
const SystemZTargetLowering &TLI = *STI.getTargetLowering();
MachineFrameInfo &MFFrame = MF.getFrameInfo();
- auto *ZII = static_cast<const SystemZInstrInfo *>(STI.getInstrInfo());
+ auto *ZII = STI.getInstrInfo();
SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
MachineBasicBlock::iterator MBBI = MBB.begin();
const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo();
@@ -1239,7 +1239,7 @@ void SystemZXPLINKFrameLowering::emitPrologue(MachineFunction &MF,
const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
MachineBasicBlock::iterator MBBI = MBB.begin();
- auto *ZII = static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+ auto *ZII = Subtarget.getInstrInfo();
auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
MachineFrameInfo &MFFrame = MF.getFrameInfo();
MachineInstr *StoreInstr = nullptr;
@@ -1354,7 +1354,7 @@ void SystemZXPLINKFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
MachineFrameInfo &MFFrame = MF.getFrameInfo();
- auto *ZII = static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+ auto *ZII = Subtarget.getInstrInfo();
auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
// Skip the return instruction.
diff --git a/llvm/lib/Target/TargetLoweringObjectFile.cpp b/llvm/lib/Target/TargetLoweringObjectFile.cpp
index 0920c3345ecf..9b03e85ca45b 100644
--- a/llvm/lib/Target/TargetLoweringObjectFile.cpp
+++ b/llvm/lib/Target/TargetLoweringObjectFile.cpp
@@ -191,6 +191,35 @@ void TargetLoweringObjectFile::emitCGProfileMetadata(MCStreamer &Streamer,
}
}
+void TargetLoweringObjectFile::emitPseudoProbeDescMetadata(MCStreamer &Streamer,
+ Module &M) const {
+ NamedMDNode *FuncInfo = M.getNamedMetadata(PseudoProbeDescMetadataName);
+ if (!FuncInfo)
+ return;
+
+ // Emit a descriptor for every function, including functions that have
+ // available_externally linkage. We may not want this for imported functions
+ // that have code in another ThinLTO module, but we don't have a good way to
+ // tell them apart from inline functions defined in header files. Therefore
+ // we put each descriptor in a separate comdat section and rely on the
+ // linker to deduplicate.
+ auto &C = getContext();
+ for (const auto *Operand : FuncInfo->operands()) {
+ const auto *MD = cast<MDNode>(Operand);
+ auto *GUID = mdconst::extract<ConstantInt>(MD->getOperand(0));
+ auto *Hash = mdconst::extract<ConstantInt>(MD->getOperand(1));
+ auto *Name = cast<MDString>(MD->getOperand(2));
+ auto *S = C.getObjectFileInfo()->getPseudoProbeDescSection(
+ TM->getFunctionSections() ? Name->getString() : StringRef());
+
+ Streamer.switchSection(S);
+ Streamer.emitInt64(GUID->getZExtValue());
+ Streamer.emitInt64(Hash->getZExtValue());
+ Streamer.emitULEB128IntValue(Name->getString().size());
+ Streamer.emitBytes(Name->getString());
+ }
+}
+
/// getKindForGlobal - This is a top-level target-independent classifier for
/// a global object. Given a global variable and information from the TM, this
/// function classifies the global in a target independent manner. This function
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp
index e09a916d48c9..f98762152247 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp
@@ -154,7 +154,7 @@ public:
void VEAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
uint64_t Value, bool IsResolved) {
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
case VE::fixup_ve_tls_gd_hi32:
case VE::fixup_ve_tls_gd_lo32:
case VE::fixup_ve_tpoff_hi32:
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
index 1597e7d080f0..41f31eb3b819 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
@@ -56,7 +56,7 @@ unsigned VEELFObjectWriter::getRelocType(const MCFixup &Fixup,
}
if (IsPCRel) {
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
default:
reportError(Fixup.getLoc(), "Unsupported pc-relative fixup kind");
return ELF::R_VE_NONE;
@@ -84,7 +84,7 @@ unsigned VEELFObjectWriter::getRelocType(const MCFixup &Fixup,
}
}
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
default:
reportError(Fixup.getLoc(), "Unknown ELF relocation type");
return ELF::R_VE_NONE;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index c591e5ef181a..d13862f12773 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1536,6 +1536,10 @@ multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate>
(vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))],
vec.prefix#".relaxed_nmadd\t$dst, $a, $b, $c",
vec.prefix#".relaxed_nmadd", simdopS, reqs>;
+
+ def : Pat<(fadd_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))),
+ (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
+
}
defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index 656d5dd32773..28f65990120c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -261,7 +261,6 @@
///
///===----------------------------------------------------------------------===//
-#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "WebAssembly.h"
#include "WebAssemblyTargetMachine.h"
#include "llvm/ADT/StringExtras.h"
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index ad47cb8ea2fe..6827ee652794 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -26,7 +26,6 @@
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/InitializePasses.h"
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
index 6614eea3901b..564636959f00 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
@@ -14,6 +14,7 @@
#include "X86ATTInstPrinter.h"
#include "X86BaseInfo.h"
#include "X86InstComments.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
@@ -35,6 +36,21 @@ using namespace llvm;
#define PRINT_ALIAS_INSTR
#include "X86GenAsmWriter.inc"
+// Print an MCExpr as an operand. Similar to GCC, wrap the output in parentheses
+// if it begins with '$', as '$' in an operand position indicates an immediate
+// value in the AT&T syntax.
+void X86ATTInstPrinter::printExprOperand(raw_ostream &OS, const MCExpr &E) {
+ SmallString<128> S;
+ {
+ raw_svector_ostream SOS(S);
+ MAI.printExpr(SOS, E);
+ }
+ if (S.starts_with("$"))
+ OS << '(' << S << ')';
+ else
+ OS << S;
+}
+
void X86ATTInstPrinter::printRegName(raw_ostream &OS, MCRegister Reg) {
markup(OS, Markup::Register) << '%' << getRegisterName(Reg);
}
@@ -446,7 +462,7 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
O << formatImm(DispVal);
} else {
assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
- MAI.printExpr(O, *DispSpec.getExpr());
+ printExprOperand(O, *DispSpec.getExpr());
}
if (IndexReg.getReg() || BaseReg.getReg()) {
@@ -501,7 +517,7 @@ void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
O << formatImm(DispSpec.getImm());
} else {
assert(DispSpec.isExpr() && "non-immediate displacement?");
- MAI.printExpr(O, *DispSpec.getExpr());
+ printExprOperand(O, *DispSpec.getExpr());
}
}
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
index f49f09c5dcf3..1452622ebcea 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
@@ -23,6 +23,7 @@ public:
const MCRegisterInfo &MRI)
: X86InstPrinterCommon(MAI, MII, MRI), HasCustomInstComment(false) {}
+ void printExprOperand(raw_ostream &OS, const MCExpr &E) override;
void printRegName(raw_ostream &OS, MCRegister Reg) override;
void printInst(const MCInst *MI, uint64_t Address, StringRef Annot,
const MCSubtargetInfo &STI, raw_ostream &OS) override;
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index ff2df3d5b192..3d060c6f4a78 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -26,6 +26,7 @@
#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/TargetRegistry.h"
@@ -177,20 +178,20 @@ public:
bool mayNeedRelaxation(unsigned Opcode, ArrayRef<MCOperand> Operands,
const MCSubtargetInfo &STI) const override;
- bool fixupNeedsRelaxationAdvanced(const MCFixup &, const MCValue &, uint64_t,
+ bool fixupNeedsRelaxationAdvanced(const MCFragment &, const MCFixup &,
+ const MCValue &, uint64_t,
bool) const override;
void relaxInstruction(MCInst &Inst,
const MCSubtargetInfo &STI) const override;
- bool padInstructionViaRelaxation(MCRelaxableFragment &RF,
- MCCodeEmitter &Emitter,
+ bool padInstructionViaRelaxation(MCFragment &RF, MCCodeEmitter &Emitter,
unsigned &RemainingSize) const;
- bool padInstructionViaPrefix(MCRelaxableFragment &RF, MCCodeEmitter &Emitter,
+ bool padInstructionViaPrefix(MCFragment &RF, MCCodeEmitter &Emitter,
unsigned &RemainingSize) const;
- bool padInstructionEncoding(MCRelaxableFragment &RF, MCCodeEmitter &Emitter,
+ bool padInstructionEncoding(MCFragment &RF, MCCodeEmitter &Emitter,
unsigned &RemainingSize) const;
bool finishLayout(const MCAssembler &Asm) const override;
@@ -409,10 +410,9 @@ isRightAfterData(MCFragment *CurrentFragment,
// it, returns true.
// - Otherwise returns false.
// - If the fragment is not a DataFragment, returns false.
- if (auto *DF = dyn_cast_or_null<MCDataFragment>(F))
- return DF->getContents().size() &&
- (DF != PrevInstPosition.first ||
- DF->getContents().size() != PrevInstPosition.second);
+ if (F->getKind() == MCFragment::FT_Data)
+ return F->getFixedSize() && (F != PrevInstPosition.first ||
+ F->getFixedSize() != PrevInstPosition.second);
return false;
}
@@ -421,11 +421,7 @@ isRightAfterData(MCFragment *CurrentFragment,
static size_t getSizeForInstFragment(const MCFragment *F) {
if (!F || !F->hasInstructions())
return 0;
- // MCEncodedFragmentWithContents being templated makes this tricky.
- if (auto *DF = dyn_cast<MCEncodedFragment>(F))
- return DF->getContents().size();
- else
- llvm_unreachable("Unknown fragment with instructions!");
+ return F->getSize();
}
/// Return true if we can insert NOP or prefixes automatically before the
@@ -468,10 +464,6 @@ bool X86AsmBackend::canPadBranches(MCObjectStreamer &OS) const {
if (!OS.getCurrentSectionOnly()->isText())
return false;
- // To be Done: Currently don't deal with Bundle cases.
- if (OS.getAssembler().isBundlingEnabled())
- return false;
-
// Branches only need to be aligned in 32-bit or 64-bit mode.
if (!(STI.hasFeature(X86::Is64Bit) || STI.hasFeature(X86::Is32Bit)))
return false;
@@ -551,8 +543,8 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS,
const MCInst &Inst) {
MCFragment *CF = OS.getCurrentFragment();
- if (auto *F = dyn_cast_or_null<MCRelaxableFragment>(CF))
- F->setAllowAutoPadding(canPadInst(Inst, OS));
+ if (CF->getKind() == MCFragment::FT_Relaxable)
+ CF->setAllowAutoPadding(canPadInst(Inst, OS));
// Update PrevInstOpcode here, canPadInst() reads that.
PrevInstOpcode = Inst.getOpcode();
@@ -575,8 +567,7 @@ void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS,
// DataFragment, so that we can get the size of instructions later in
// MCAssembler::relaxBoundaryAlign. The easiest way is to insert a new empty
// DataFragment.
- if (isa_and_nonnull<MCDataFragment>(CF))
- OS.insert(OS.getContext().allocFragment<MCDataFragment>());
+ OS.insert(OS.getContext().allocFragment<MCFragment>());
// Update the maximum alignment on the current section if necessary.
MCSection *Sec = OS.getCurrentSectionOnly();
@@ -686,7 +677,7 @@ std::optional<bool> X86AsmBackend::evaluateFixup(const MCFragment &,
MCFixup &Fixup,
MCValue &Target, uint64_t &) {
if (Fixup.isPCRel()) {
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
case FK_Data_1:
Target.setConstant(Target.getConstant() - 1);
break;
@@ -756,7 +747,8 @@ bool X86AsmBackend::mayNeedRelaxation(unsigned Opcode,
Operands[Operands.size() - 1 - SkipOperands].isExpr());
}
-bool X86AsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup,
+bool X86AsmBackend::fixupNeedsRelaxationAdvanced(const MCFragment &,
+ const MCFixup &Fixup,
const MCValue &Target,
uint64_t Value,
bool Resolved) const {
@@ -785,7 +777,7 @@ void X86AsmBackend::relaxInstruction(MCInst &Inst,
Inst.setOpcode(RelaxedOp);
}
-bool X86AsmBackend::padInstructionViaPrefix(MCRelaxableFragment &RF,
+bool X86AsmBackend::padInstructionViaPrefix(MCFragment &RF,
MCCodeEmitter &Emitter,
unsigned &RemainingSize) const {
if (!RF.getAllowAutoPadding())
@@ -798,7 +790,7 @@ bool X86AsmBackend::padInstructionViaPrefix(MCRelaxableFragment &RF,
*RF.getSubtargetInfo()))
return false;
- const unsigned OldSize = RF.getContents().size();
+ const unsigned OldSize = RF.getVarSize();
if (OldSize == 15)
return false;
@@ -827,19 +819,18 @@ bool X86AsmBackend::padInstructionViaPrefix(MCRelaxableFragment &RF,
SmallString<256> Code;
Code.append(PrefixBytesToAdd, Prefix);
- Code.append(RF.getContents().begin(), RF.getContents().end());
- RF.setContents(Code);
+ Code.append(RF.getVarContents().begin(), RF.getVarContents().end());
+ RF.setVarContents(Code);
// Adjust the fixups for the change in offsets
- for (auto &F : RF.getFixups()) {
- F.setOffset(F.getOffset() + PrefixBytesToAdd);
- }
+ for (auto &F : RF.getVarFixups())
+ F.setOffset(PrefixBytesToAdd + F.getOffset());
RemainingSize -= PrefixBytesToAdd;
return true;
}
-bool X86AsmBackend::padInstructionViaRelaxation(MCRelaxableFragment &RF,
+bool X86AsmBackend::padInstructionViaRelaxation(MCFragment &RF,
MCCodeEmitter &Emitter,
unsigned &RemainingSize) const {
if (!mayNeedRelaxation(RF.getOpcode(), RF.getOperands(),
@@ -854,20 +845,20 @@ bool X86AsmBackend::padInstructionViaRelaxation(MCRelaxableFragment &RF,
SmallVector<MCFixup, 4> Fixups;
SmallString<15> Code;
Emitter.encodeInstruction(Relaxed, Code, Fixups, *RF.getSubtargetInfo());
- const unsigned OldSize = RF.getContents().size();
+ const unsigned OldSize = RF.getVarContents().size();
const unsigned NewSize = Code.size();
assert(NewSize >= OldSize && "size decrease during relaxation?");
unsigned Delta = NewSize - OldSize;
if (Delta > RemainingSize)
return false;
RF.setInst(Relaxed);
- RF.setContents(Code);
- RF.setFixups(Fixups);
+ RF.setVarContents(Code);
+ RF.setVarFixups(Fixups);
RemainingSize -= Delta;
return true;
}
-bool X86AsmBackend::padInstructionEncoding(MCRelaxableFragment &RF,
+bool X86AsmBackend::padInstructionEncoding(MCFragment &RF,
MCCodeEmitter &Emitter,
unsigned &RemainingSize) const {
bool Changed = false;
@@ -900,7 +891,7 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
if (!Sec.isText())
continue;
- SmallVector<MCRelaxableFragment *, 4> Relaxable;
+ SmallVector<MCFragment *, 4> Relaxable;
for (MCSection::iterator I = Sec.begin(), IE = Sec.end(); I != IE; ++I) {
MCFragment &F = *I;
@@ -911,7 +902,7 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
continue;
if (F.getKind() == MCFragment::FT_Relaxable) {
- auto &RF = cast<MCRelaxableFragment>(*I);
+ auto &RF = cast<MCFragment>(*I);
Relaxable.push_back(&RF);
continue;
}
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
index 7523d2aedcce..1c5f1663d4f5 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -26,6 +26,10 @@
using namespace llvm;
+void X86InstPrinterCommon::printExprOperand(raw_ostream &OS, const MCExpr &E) {
+ MAI.printExpr(OS, E);
+}
+
void X86InstPrinterCommon::printCondCode(const MCInst *MI, unsigned Op,
raw_ostream &O) {
int64_t Imm = MI->getOperand(Op).getImm();
@@ -374,7 +378,7 @@ void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, uint64_t Address,
markup(O, Markup::Immediate) << formatHex((uint64_t)Address);
} else {
// Otherwise, just print the expression.
- MAI.printExpr(O, *Op.getExpr());
+ printExprOperand(O, *Op.getExpr());
}
}
}
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
index 2a7b750bd675..2c9467ca7c61 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
@@ -17,11 +17,13 @@
#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
+class MCExpr;
class X86InstPrinterCommon : public MCInstPrinter {
public:
using MCInstPrinter::MCInstPrinter;
+ virtual void printExprOperand(raw_ostream &OS, const MCExpr &E);
virtual void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) = 0;
void printCondCode(const MCInst *MI, unsigned Op, raw_ostream &OS);
void printCondFlags(const MCInst *MI, unsigned Op, raw_ostream &OS);
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index c34425f6661b..0dabd98a38f4 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -258,7 +258,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(
// x86_64 distinguishes movq foo@GOTPCREL so that the linker can
// rewrite the movq to an leaq at link time if the symbol ends up in
// the same linkage unit.
- if (Fixup.getTargetKind() == X86::reloc_riprel_4byte_movq_load)
+ if (Fixup.getKind() == X86::reloc_riprel_4byte_movq_load)
Type = MachO::X86_64_RELOC_GOT_LOAD;
else
Type = MachO::X86_64_RELOC_GOT;
@@ -320,7 +320,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(
return;
} else {
Type = MachO::X86_64_RELOC_UNSIGNED;
- if (Fixup.getTargetKind() == X86::reloc_signed_4byte) {
+ if (Fixup.getKind() == X86::reloc_signed_4byte) {
reportError(
Fixup.getLoc(),
"32-bit absolute addressing is not supported in 64-bit mode");
diff --git a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
index 0e4add27cce0..7b2b9dda99b4 100644
--- a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -239,8 +239,7 @@ bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
TFL = STI->getFrameLowering();
MRI = &MF.getRegInfo();
- const X86RegisterInfo &RegInfo =
- *static_cast<const X86RegisterInfo *>(STI->getRegisterInfo());
+ const X86RegisterInfo &RegInfo = *STI->getRegisterInfo();
SlotSize = RegInfo.getSlotSize();
assert(isPowerOf2_32(SlotSize) && "Expect power of 2 stack slot size");
Log2SlotSize = Log2_32(SlotSize);
@@ -356,8 +355,7 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
CallContext &Context) {
// Check that this particular call sequence is amenable to the
// transformation.
- const X86RegisterInfo &RegInfo =
- *static_cast<const X86RegisterInfo *>(STI->getRegisterInfo());
+ const X86RegisterInfo &RegInfo = *STI->getRegisterInfo();
// We expect to enter this at the beginning of a call sequence
assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp
index 0b4c63f7a81f..5d5a70589324 100644
--- a/llvm/lib/Target/X86/X86CallingConv.cpp
+++ b/llvm/lib/Target/X86/X86CallingConv.cpp
@@ -374,5 +374,36 @@ static bool CC_X86_64_I128(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true;
}
+/// Special handling for i128 and fp128: on x86-32, i128 and fp128 get legalized
+/// as four i32s, but fp128 must be passed on the stack with 16-byte alignment.
+/// Technically only fp128 has a specified ABI, but it makes sense to handle
+/// i128 the same until we hear differently.
+static bool CC_X86_32_I128_FP128(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ assert(ValVT == MVT::i32 && "Should have i32 parts");
+ SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+ PendingMembers.push_back(
+ CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+ if (!ArgFlags.isInConsecutiveRegsLast())
+ return true;
+
+ assert(PendingMembers.size() == 4 && "Should have four parts");
+
+ int64_t Offset = State.AllocateStack(16, Align(16));
+ PendingMembers[0].convertToMem(Offset);
+ PendingMembers[1].convertToMem(Offset + 4);
+ PendingMembers[2].convertToMem(Offset + 8);
+ PendingMembers[3].convertToMem(Offset + 12);
+
+ State.addLoc(PendingMembers[0]);
+ State.addLoc(PendingMembers[1]);
+ State.addLoc(PendingMembers[2]);
+ State.addLoc(PendingMembers[3]);
+ PendingMembers.clear();
+ return true;
+}
+
// Provides entry points of CC_X86 and RetCC_X86.
#include "X86GenCallingConv.inc"
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 823e0caa0226..f020e0b55141 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -859,6 +859,11 @@ def CC_X86_32_C : CallingConv<[
// The 'nest' parameter, if any, is passed in ECX.
CCIfNest<CCAssignToReg<[ECX]>>,
+ // i128 and fp128 need to be passed on the stack with a higher alignment than
+ // their legal types. Handle this with a custom function.
+ CCIfType<[i32],
+ CCIfConsecutiveRegs<CCCustom<"CC_X86_32_I128_FP128">>>,
+
// On swifttailcc pass swiftself in ECX.
CCIfCC<"CallingConv::SwiftTail",
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[ECX]>>>>,
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index c7abb367fad2..0e6b4dffec3a 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -376,8 +376,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case X86::EH_RETURN64: {
MachineOperand &DestAddr = MBBI->getOperand(0);
assert(DestAddr.isReg() && "Offset should be in register!");
- const bool Uses64BitFramePtr =
- STI->isTarget64BitLP64() || STI->isTargetNaCl64();
+ const bool Uses64BitFramePtr = STI->isTarget64BitLP64();
Register StackPtr = TRI->getStackRegister();
BuildMI(MBB, MBBI, DL,
TII->get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), StackPtr)
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index c96d3c15a882..95ed5908e231 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -54,8 +54,8 @@ X86FrameLowering::X86FrameLowering(const X86Subtarget &STI,
SlotSize = TRI->getSlotSize();
Is64Bit = STI.is64Bit();
IsLP64 = STI.isTarget64BitLP64();
- // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
- Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
+ // standard x86_64 uses 64-bit frame/stack pointers, x32 - 32-bit.
+ Uses64BitFramePtr = STI.isTarget64BitLP64();
StackPtr = TRI->getStackRegister();
}
@@ -2412,7 +2412,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
DebugLoc DL;
if (MBBI != MBB.end())
DL = MBBI->getDebugLoc();
- // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
+ // standard x86_64 uses 64-bit frame/stack pointers, x32 - 32-bit.
const bool Is64BitILP32 = STI.isTarget64BitILP32();
Register FramePtr = TRI->getFrameRegister(MF);
Register MachineFramePtr =
@@ -4241,7 +4241,7 @@ void X86FrameLowering::adjustFrameForMsvcCxxEh(MachineFunction &MF) const {
for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
for (WinEHHandlerType &H : TBME.HandlerArray) {
int FrameIndex = H.CatchObj.FrameIndex;
- if (FrameIndex != INT_MAX) {
+ if ((FrameIndex != INT_MAX) && MFI.getObjectOffset(FrameIndex) == 0) {
// Ensure alignment.
unsigned Align = MFI.getObjectAlign(FrameIndex).value();
MinFixedObjOffset -= std::abs(MinFixedObjOffset) % Align;
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 32c7d2bfea6c..62073ec125e8 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -5428,10 +5428,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
}
case ISD::BRIND:
case X86ISD::NT_BRIND: {
- if (Subtarget->isTargetNaCl())
- // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
- // leave the instruction alone.
- break;
if (Subtarget->isTarget64BitILP32()) {
// Converts a 32-bit register to a 64-bit, zero-extended version of
// it. This is needed because x86-64 can do many things, but jmp %r32
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5e35d5630d66..d91ea1ea1bb1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36615,8 +36615,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
sizeVReg = MI.getOperand(1).getReg(),
- physSPReg =
- IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
+ physSPReg = IsLP64 ? X86::RSP : X86::ESP;
MachineFunction::iterator MBBIter = ++BB->getIterator();
@@ -37121,8 +37120,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
// restoreMBB:
if (RegInfo->hasBasePointer(*MF)) {
- const bool Uses64BitFramePtr =
- Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
+ const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64();
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
X86FI->setRestoreBasePointer(MF);
Register FramePtr = RegInfo->getFrameRegister(*MF);
@@ -37550,8 +37548,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
// Add a register mask with no preserved registers. This results in all
// registers being marked as clobbered.
if (RI.hasBasePointer(*MF)) {
- const bool FPIs64Bit =
- Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
+ const bool FPIs64Bit = Subtarget.isTarget64BitLP64();
X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
MFI->setRestoreBasePointer(MF);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 6bcb7a36e91b..26369792db26 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1661,7 +1661,7 @@ namespace llvm {
/// Lower interleaved load(s) into target specific
/// instructions/intrinsics.
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 9ad355311527..b4639ac2577e 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -237,9 +237,18 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
bool X86TargetLowering::functionArgumentNeedsConsecutiveRegisters(
Type *Ty, CallingConv::ID CallConv, bool isVarArg,
const DataLayout &DL) const {
- // i128 split into i64 needs to be allocated to two consecutive registers,
- // or spilled to the stack as a whole.
- return Ty->isIntegerTy(128);
+ // On x86-64 i128 is split into two i64s and needs to be allocated to two
+ // consecutive registers, or spilled to the stack as a whole. On x86-32 i128
+ // is split to four i32s and never actually passed in registers, but we use
+ // the consecutive register mark to match it in TableGen.
+ if (Ty->isIntegerTy(128))
+ return true;
+
+ // On x86-32, fp128 acts the same as i128.
+ if (Subtarget.is32Bit() && Ty->isFP128Ty())
+ return true;
+
+ return false;
}
/// Helper for getByValTypeAlignment to determine
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 307c03c8ef54..df1541e9085b 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -214,8 +214,6 @@ def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||"
}
def IsPS : Predicate<"Subtarget->isTargetPS()">;
def NotPS : Predicate<"!Subtarget->isTargetPS()">;
-def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
-def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">;
def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">;
def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 1eb47e3b2cd1..360293bce54e 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -801,7 +801,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
// number of shuffles and ISA.
// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
bool X86TargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
@@ -809,6 +809,11 @@ bool X86TargetLowering::lowerInterleavedLoad(
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
+ auto *LI = dyn_cast<LoadInst>(Load);
+ if (!LI)
+ return false;
+ assert(!Mask && "Unexpected mask on a load");
+
// Create an interleaved access group.
IRBuilder<> Builder(LI);
X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index a8ee9f55611b..8ad8d423d10c 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -302,13 +302,12 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
reportFatalUsageError("64-bit code requested on a subtarget that doesn't "
"support it!");
- // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD, NaCl, and for all
+ // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD, and for all
// 64-bit targets. On Solaris (32-bit), stack alignment is 4 bytes
// following the i386 psABI, while on Illumos it is always 16 bytes.
if (StackAlignOverride)
stackAlignment = *StackAlignOverride;
- else if (isTargetDarwin() || isTargetLinux() || isTargetKFreeBSD() ||
- isTargetNaCl() || Is64Bit)
+ else if (isTargetDarwin() || isTargetLinux() || isTargetKFreeBSD() || Is64Bit)
stackAlignment = Align(16);
// Consume the vector width attribute or apply any target specific limit.
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 38b8c246eb29..be49214e041e 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -170,14 +170,10 @@ public:
#include "X86GenSubtargetInfo.inc"
/// Is this x86_64 with the ILP32 programming model (x32 ABI)?
- bool isTarget64BitILP32() const {
- return Is64Bit && (TargetTriple.isX32() || TargetTriple.isOSNaCl());
- }
+ bool isTarget64BitILP32() const { return Is64Bit && (TargetTriple.isX32()); }
/// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
- bool isTarget64BitLP64() const {
- return Is64Bit && (!TargetTriple.isX32() && !TargetTriple.isOSNaCl());
- }
+ bool isTarget64BitLP64() const { return Is64Bit && (!TargetTriple.isX32()); }
PICStyles::Style getPICStyle() const { return PICStyle; }
void setPICStyle(PICStyles::Style Style) { PICStyle = Style; }
@@ -299,9 +295,6 @@ public:
bool isTargetKFreeBSD() const { return TargetTriple.isOSKFreeBSD(); }
bool isTargetGlibc() const { return TargetTriple.isOSGlibc(); }
bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
- bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
- bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
- bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); }
bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); }
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 85cc5b43d40b..6d9c6cdedd9e 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -131,7 +131,7 @@ static std::string computeDataLayout(const Triple &TT) {
Ret += DataLayout::getManglingComponent(TT);
// X86 and x32 have 32 bit pointers.
- if (!TT.isArch64Bit() || TT.isX32() || TT.isOSNaCl())
+ if (!TT.isArch64Bit() || TT.isX32())
Ret += "-p:32:32";
// Address spaces for 32 bit signed, 32 bit unsigned, and 64 bit pointers.
@@ -140,7 +140,7 @@ static std::string computeDataLayout(const Triple &TT) {
// Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
// 128 bit integers are not specified in the 32-bit ABIs but are used
// internally for lowering f128, so we match the alignment to that.
- if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl())
+ if (TT.isArch64Bit() || TT.isOSWindows())
Ret += "-i64:64-i128:128";
else if (TT.isOSIAMCU())
Ret += "-i64:32-f64:32";
@@ -148,7 +148,7 @@ static std::string computeDataLayout(const Triple &TT) {
Ret += "-i128:128-f64:32:64";
// Some ABIs align long double to 128 bits, others to 32.
- if (TT.isOSNaCl() || TT.isOSIAMCU())
+ if (TT.isOSIAMCU())
; // No f80
else if (TT.isArch64Bit() || TT.isOSDarwin() || TT.isWindowsMSVCEnvironment())
Ret += "-f80:128";
diff --git a/llvm/lib/Target/X86/X86WinEHState.cpp b/llvm/lib/Target/X86/X86WinEHState.cpp
index 27111fce4566..a650f6f069e5 100644
--- a/llvm/lib/Target/X86/X86WinEHState.cpp
+++ b/llvm/lib/Target/X86/X86WinEHState.cpp
@@ -811,7 +811,7 @@ void WinEHStatePass::updateEspForInAllocas(Function &F) {
if (auto *Alloca = dyn_cast<AllocaInst>(&I)) {
if (Alloca->isStaticAlloca())
continue;
- IRBuilder<> Builder(Alloca->getNextNonDebugInstruction());
+ IRBuilder<> Builder(Alloca->getNextNode());
// SavedESP = llvm.stacksave()
Value *SP = Builder.CreateStackSave();
Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
@@ -820,7 +820,7 @@ void WinEHStatePass::updateEspForInAllocas(Function &F) {
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
if (II->getIntrinsicID() != Intrinsic::stackrestore)
continue;
- IRBuilder<> Builder(II->getNextNonDebugInstruction());
+ IRBuilder<> Builder(II->getNextNode());
// SavedESP = llvm.stacksave()
Value *SP = Builder.CreateStackSave();
Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp
index 671f1d04daf2..9167794a51e8 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp
@@ -144,7 +144,7 @@ std::optional<bool> XtensaAsmBackend::evaluateFixup(const MCFragment &F,
// For a few PC-relative fixups, offsets need to be aligned down. We
// compensate here because the default handler's `Value` decrement doesn't
// account for this alignment.
- switch (Fixup.getTargetKind()) {
+ switch (Fixup.getKind()) {
case Xtensa::fixup_xtensa_call_18:
case Xtensa::fixup_xtensa_l32r_16:
Value = (Asm->getFragmentOffset(F) + Fixup.getOffset()) % 4;