summaryrefslogtreecommitdiff
path: root/llvm/lib/Target
diff options
context:
space:
mode:
authorKoakuma <koachan@protonmail.com>2024-07-08 19:19:54 +0700
committerKoakuma <koachan@protonmail.com>2024-07-08 19:19:54 +0700
commit5c4fdc2fd5898ebd9e89999a4f4b8aa289ca637f (patch)
treef3b92a07f3dfc6e70f36d1000605f36a3c15af46 /llvm/lib/Target
parentdbda8e2f2cd8764e0badd983915d62a2c3377f4d (diff)
parente9b8cd0c806db00f0981fb36717077c941426302 (diff)
Created using spr 1.3.5 [skip ci]
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--llvm/lib/Target/AArch64/AArch64.td1
-rw-r--r--llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp178
-rw-r--r--llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp48
-rw-r--r--llvm/lib/Target/AArch64/AArch64CallingConvention.td22
-rw-r--r--llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp5
-rw-r--r--llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp8
-rw-r--r--llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp8
-rw-r--r--llvm/lib/Target/AArch64/AArch64FMV.td99
-rw-r--r--llvm/lib/Target/AArch64/AArch64Features.td872
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.cpp261
-rw-r--r--llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def26
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp103
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp555
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h15
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrFormats.td75
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrGISel.td8
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.cpp622
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.h3
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.td93
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp8
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h21
-rw-r--r--llvm/lib/Target/AArch64/AArch64Processors.td261
-rw-r--r--llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td69
-rw-r--r--llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td8
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.cpp4
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.h19
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetMachine.cpp16
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetMachine.h3
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp118
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h10
-rw-r--r--llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp4
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp15
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp28
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp38
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp4
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp7
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp21
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp7
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp70
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h13
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp37
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h1
-rw-r--r--llvm/lib/Target/AArch64/SMEInstrFormats.td43
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td91
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h7
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp267
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h10
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp25
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGISel.td8
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp34
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp71
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h15
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp39
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructions.td39
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp162
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp294
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp151
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h23
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td18
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp274
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h21
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp23
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp221
-rw-r--r--llvm/lib/Target/AMDGPU/BUFInstructions.td287
-rw-r--r--llvm/lib/Target/AMDGPU/DSInstructions.td61
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp59
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h7
-rw-r--r--llvm/lib/Target/AMDGPU/EvergreenInstructions.td42
-rw-r--r--llvm/lib/Target/AMDGPU/FLATInstructions.td227
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp129
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.h10
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.h29
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp96
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h58
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp54
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h15
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/R600Packetizer.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/SIFoldOperands.cpp13
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp668
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.h6
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp37
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.td56
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstructions.td41
-rw-r--r--llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp13
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h1
-rw-r--r--llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp68
-rw-r--r--llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/SIProgramInfo.cpp39
-rw-r--r--llvm/lib/Target/AMDGPU/SIProgramInfo.h4
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.td17
-rw-r--r--llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp58
-rw-r--r--llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp66
-rw-r--r--llvm/lib/Target/AMDGPU/SOPInstructions.td17
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp15
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h11
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp41
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h14
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.cpp61
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.h39
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp113
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h24
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp46
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/SIDefinesUtils.h79
-rw-r--r--llvm/lib/Target/AMDGPU/VOP1Instructions.td37
-rw-r--r--llvm/lib/Target/AMDGPU/VOP2Instructions.td20
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3Instructions.td51
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3PInstructions.td14
-rw-r--r--llvm/lib/Target/AMDGPU/VOPCInstructions.td12
-rw-r--r--llvm/lib/Target/AMDGPU/VOPInstructions.td27
-rw-r--r--llvm/lib/Target/ARC/ARCBranchFinalize.cpp2
-rw-r--r--llvm/lib/Target/ARC/ARCOptAddrMode.cpp8
-rw-r--r--llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp2
-rw-r--r--llvm/lib/Target/ARM/ARMConstantIslandPass.cpp4
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp84
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.h5
-rw-r--r--llvm/lib/Target/ARM/ARMInstrInfo.td9
-rw-r--r--llvm/lib/Target/ARM/ARMInstrThumb.td6
-rw-r--r--llvm/lib/Target/ARM/ARMInstrThumb2.td6
-rw-r--r--llvm/lib/Target/ARM/ARMLegalizerInfo.cpp21
-rw-r--r--llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp8
-rw-r--r--llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp1
-rw-r--r--llvm/lib/Target/ARM/ARMSubtarget.cpp5
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp24
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp5
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp36
-rw-r--r--llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp9
-rw-r--r--llvm/lib/Target/AVR/AVRISelLowering.cpp8
-rw-r--r--llvm/lib/Target/AVR/AVRInstrInfo.td407
-rw-r--r--llvm/lib/Target/BPF/BPFInstrInfo.td30
-rw-r--r--llvm/lib/Target/BPF/BPFRegisterInfo.cpp24
-rw-r--r--llvm/lib/Target/BPF/BPFTargetMachine.cpp3
-rw-r--r--llvm/lib/Target/BPF/BPFTargetMachine.h3
-rw-r--r--llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp2
-rw-r--r--llvm/lib/Target/DirectX/DXContainerGlobals.cpp31
-rw-r--r--llvm/lib/Target/DirectX/DXIL.td19
-rw-r--r--llvm/lib/Target/DirectX/DirectXTargetMachine.cpp3
-rw-r--r--llvm/lib/Target/DirectX/DirectXTargetMachine.h3
-rw-r--r--llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp3
-rw-r--r--llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp8
-rw-r--r--llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp8
-rw-r--r--llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp4
-rw-r--r--llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp6
-rw-r--r--llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp8
-rw-r--r--llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp4
-rw-r--r--llvm/lib/Target/Hexagon/HexagonGenInsert.cpp8
-rw-r--r--llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp7
-rw-r--r--llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp6
-rw-r--r--llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp6
-rw-r--r--llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp18
-rw-r--r--llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp6
-rw-r--r--llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp6
-rw-r--r--llvm/lib/Target/Hexagon/HexagonSubtarget.cpp8
-rw-r--r--llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp3
-rw-r--r--llvm/lib/Target/Hexagon/HexagonTargetMachine.h3
-rw-r--r--llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp6
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp18
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp66
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchISelLowering.h2
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp15
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchInstrInfo.h2
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchInstrInfo.td97
-rw-r--r--llvm/lib/Target/M68k/M68kInstrAtomics.td2
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp3
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h3
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp3
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp31
-rw-r--r--llvm/lib/Target/Mips/Mips64InstrInfo.td24
-rw-r--r--llvm/lib/Target/Mips/MipsAsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/Mips/MipsInstrInfo.td78
-rw-r--r--llvm/lib/Target/Mips/MipsOptimizePICCall.cpp5
-rw-r--r--llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp7
-rw-r--r--llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp6
-rw-r--r--llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h2
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp4
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp233
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXIntrinsics.td447
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp77
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp3
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXTargetMachine.h3
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXUtilities.cpp143
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXUtilities.h1
-rw-r--r--llvm/lib/Target/NVPTX/NVVMReflect.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp12
-rw-r--r--llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp6
-rw-r--r--llvm/lib/Target/PowerPC/PPCFrameLowering.cpp5
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.cpp12
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstr64Bit.td34
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.td74
-rw-r--r--llvm/lib/Target/PowerPC/PPCMIPeephole.cpp19
-rw-r--r--llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp4
-rw-r--r--llvm/lib/Target/PowerPC/PPCRegisterInfo.td2
-rw-r--r--llvm/lib/Target/PowerPC/PPCSubtarget.cpp9
-rw-r--r--llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp6
-rw-r--r--llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp21
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp15
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp16
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp23
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp7
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp7
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp48
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp3
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h2
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp2
-rw-r--r--llvm/lib/Target/RISCV/RISCV.td1
-rw-r--r--llvm/lib/Target/RISCV/RISCVFeatures.td44
-rw-r--r--llvm/lib/Target/RISCV/RISCVFrameLowering.cpp6
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp298
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.h3
-rw-r--r--llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp271
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.cpp2
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoA.td82
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoM.td20
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td165
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td33
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td13
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZa.td46
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZc.td6
-rw-r--r--llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h2
-rw-r--r--llvm/lib/Target/RISCV/RISCVProcessors.td37
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp8
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td102
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR3.td189
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedule.td104
-rw-r--r--llvm/lib/Target/RISCV/RISCVSubtarget.cpp9
-rw-r--r--llvm/lib/Target/RISCV/RISCVSystemOperands.td23
-rw-r--r--llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp2
-rw-r--r--llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp7
-rw-r--r--llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h5
-rw-r--r--llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.h2
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp279
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVBuiltins.td114
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp2
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h226
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp64
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp94
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h22
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp1
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVISelLowering.h5
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstrInfo.td16
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp95
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp6
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp27
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp9
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp68
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td2
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVUtils.cpp6
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVUtils.h24
-rw-r--r--llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp128
-rw-r--r--llvm/lib/Target/Sparc/SparcInstr64Bit.td2
-rw-r--r--llvm/lib/Target/Sparc/SparcInstrInfo.td8
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrInfo.td16
-rw-r--r--llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp5
-rw-r--r--llvm/lib/Target/VE/VEInstrInfo.td8
-rw-r--r--llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp4
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp8
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp4
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp6
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp6
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp15
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td46
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td40
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp6
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp6
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp3
-rw-r--r--llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp242
-rw-r--r--llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp5
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp82
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h14
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp5
-rw-r--r--llvm/lib/Target/X86/X86.td2
-rw-r--r--llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp3
-rw-r--r--llvm/lib/Target/X86/X86CompressEVEX.cpp95
-rw-r--r--llvm/lib/Target/X86/X86FastPreTileConfig.cpp18
-rw-r--r--llvm/lib/Target/X86/X86FastTileConfig.cpp9
-rw-r--r--llvm/lib/Target/X86/X86FixupSetCC.cpp38
-rw-r--r--llvm/lib/Target/X86/X86FlagsCopyLowering.cpp5
-rw-r--r--llvm/lib/Target/X86/X86FrameLowering.cpp2
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp515
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.h12
-rw-r--r--llvm/lib/Target/X86/X86ISelLoweringCall.cpp3
-rw-r--r--llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp16
-rw-r--r--llvm/lib/Target/X86/X86InstrArithmetic.td432
-rw-r--r--llvm/lib/Target/X86/X86InstrCMovSetCC.td21
-rw-r--r--llvm/lib/Target/X86/X86InstrCompiler.td8
-rw-r--r--llvm/lib/Target/X86/X86InstrFragments.td12
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.cpp47
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.h3
-rw-r--r--llvm/lib/Target/X86/X86InstrMisc.td8
-rw-r--r--llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp11
-rw-r--r--llvm/lib/Target/X86/X86MCInstLower.cpp99
-rw-r--r--llvm/lib/Target/X86/X86MachineFunctionInfo.cpp13
-rw-r--r--llvm/lib/Target/X86/X86MachineFunctionInfo.h39
-rw-r--r--llvm/lib/Target/X86/X86PreTileConfig.cpp7
-rw-r--r--llvm/lib/Target/X86/X86ScheduleZnver4.td14
-rw-r--r--llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp3
-rw-r--r--llvm/lib/Target/X86/X86TargetMachine.cpp20
-rw-r--r--llvm/lib/Target/X86/X86TargetMachine.h11
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.cpp202
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.h1
-rw-r--r--llvm/lib/Target/X86/X86TileConfig.cpp5
-rw-r--r--llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp2
-rw-r--r--llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp2
-rw-r--r--llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp71
-rw-r--r--llvm/lib/Target/Xtensa/XtensaAsmPrinter.h2
-rw-r--r--llvm/lib/Target/Xtensa/XtensaISelLowering.cpp94
-rw-r--r--llvm/lib/Target/Xtensa/XtensaISelLowering.h11
-rw-r--r--llvm/lib/Target/Xtensa/XtensaInstrInfo.td15
-rw-r--r--llvm/lib/Target/Xtensa/XtensaOperators.td4
340 files changed, 9891 insertions, 6289 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 5708b6173750..2c1a9cfa67a6 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -19,6 +19,7 @@ include "llvm/Target/Target.td"
// Subtarget features.
//===----------------------------------------------------------------------===//
include "AArch64Features.td"
+include "AArch64FMV.td"
//===----------------------------------------------------------------------===//
// Register File Description
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index 0ec15d34cd4a..f2c38b09c648 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -46,6 +46,18 @@ static cl::opt<bool> GenerateThunks("arm64ec-generate-thunks", cl::Hidden,
namespace {
+enum ThunkArgTranslation : uint8_t {
+ Direct,
+ Bitcast,
+ PointerIndirection,
+};
+
+struct ThunkArgInfo {
+ Type *Arm64Ty;
+ Type *X64Ty;
+ ThunkArgTranslation Translation;
+};
+
class AArch64Arm64ECCallLowering : public ModulePass {
public:
static char ID;
@@ -74,25 +86,30 @@ private:
void getThunkType(FunctionType *FT, AttributeList AttrList,
Arm64ECThunkType TT, raw_ostream &Out,
- FunctionType *&Arm64Ty, FunctionType *&X64Ty);
+ FunctionType *&Arm64Ty, FunctionType *&X64Ty,
+ SmallVector<ThunkArgTranslation> &ArgTranslations);
void getThunkRetType(FunctionType *FT, AttributeList AttrList,
raw_ostream &Out, Type *&Arm64RetTy, Type *&X64RetTy,
SmallVectorImpl<Type *> &Arm64ArgTypes,
- SmallVectorImpl<Type *> &X64ArgTypes, bool &HasSretPtr);
+ SmallVectorImpl<Type *> &X64ArgTypes,
+ SmallVector<ThunkArgTranslation> &ArgTranslations,
+ bool &HasSretPtr);
void getThunkArgTypes(FunctionType *FT, AttributeList AttrList,
Arm64ECThunkType TT, raw_ostream &Out,
SmallVectorImpl<Type *> &Arm64ArgTypes,
- SmallVectorImpl<Type *> &X64ArgTypes, bool HasSretPtr);
- void canonicalizeThunkType(Type *T, Align Alignment, bool Ret,
- uint64_t ArgSizeBytes, raw_ostream &Out,
- Type *&Arm64Ty, Type *&X64Ty);
+ SmallVectorImpl<Type *> &X64ArgTypes,
+ SmallVectorImpl<ThunkArgTranslation> &ArgTranslations,
+ bool HasSretPtr);
+ ThunkArgInfo canonicalizeThunkType(Type *T, Align Alignment, bool Ret,
+ uint64_t ArgSizeBytes, raw_ostream &Out);
};
} // end anonymous namespace
void AArch64Arm64ECCallLowering::getThunkType(
FunctionType *FT, AttributeList AttrList, Arm64ECThunkType TT,
- raw_ostream &Out, FunctionType *&Arm64Ty, FunctionType *&X64Ty) {
+ raw_ostream &Out, FunctionType *&Arm64Ty, FunctionType *&X64Ty,
+ SmallVector<ThunkArgTranslation> &ArgTranslations) {
Out << (TT == Arm64ECThunkType::Entry ? "$ientry_thunk$cdecl$"
: "$iexit_thunk$cdecl$");
@@ -111,10 +128,10 @@ void AArch64Arm64ECCallLowering::getThunkType(
bool HasSretPtr = false;
getThunkRetType(FT, AttrList, Out, Arm64RetTy, X64RetTy, Arm64ArgTypes,
- X64ArgTypes, HasSretPtr);
+ X64ArgTypes, ArgTranslations, HasSretPtr);
getThunkArgTypes(FT, AttrList, TT, Out, Arm64ArgTypes, X64ArgTypes,
- HasSretPtr);
+ ArgTranslations, HasSretPtr);
Arm64Ty = FunctionType::get(Arm64RetTy, Arm64ArgTypes, false);
@@ -124,7 +141,8 @@ void AArch64Arm64ECCallLowering::getThunkType(
void AArch64Arm64ECCallLowering::getThunkArgTypes(
FunctionType *FT, AttributeList AttrList, Arm64ECThunkType TT,
raw_ostream &Out, SmallVectorImpl<Type *> &Arm64ArgTypes,
- SmallVectorImpl<Type *> &X64ArgTypes, bool HasSretPtr) {
+ SmallVectorImpl<Type *> &X64ArgTypes,
+ SmallVectorImpl<ThunkArgTranslation> &ArgTranslations, bool HasSretPtr) {
Out << "$";
if (FT->isVarArg()) {
@@ -153,17 +171,20 @@ void AArch64Arm64ECCallLowering::getThunkArgTypes(
for (int i = HasSretPtr ? 1 : 0; i < 4; i++) {
Arm64ArgTypes.push_back(I64Ty);
X64ArgTypes.push_back(I64Ty);
+ ArgTranslations.push_back(ThunkArgTranslation::Direct);
}
// x4
Arm64ArgTypes.push_back(PtrTy);
X64ArgTypes.push_back(PtrTy);
+ ArgTranslations.push_back(ThunkArgTranslation::Direct);
// x5
Arm64ArgTypes.push_back(I64Ty);
if (TT != Arm64ECThunkType::Entry) {
// FIXME: x5 isn't actually used by the x64 side; revisit once we
// have proper isel for varargs
X64ArgTypes.push_back(I64Ty);
+ ArgTranslations.push_back(ThunkArgTranslation::Direct);
}
return;
}
@@ -187,18 +208,20 @@ void AArch64Arm64ECCallLowering::getThunkArgTypes(
uint64_t ArgSizeBytes = 0;
Align ParamAlign = Align();
#endif
- Type *Arm64Ty, *X64Ty;
- canonicalizeThunkType(FT->getParamType(I), ParamAlign,
- /*Ret*/ false, ArgSizeBytes, Out, Arm64Ty, X64Ty);
+ auto [Arm64Ty, X64Ty, ArgTranslation] =
+ canonicalizeThunkType(FT->getParamType(I), ParamAlign,
+ /*Ret*/ false, ArgSizeBytes, Out);
Arm64ArgTypes.push_back(Arm64Ty);
X64ArgTypes.push_back(X64Ty);
+ ArgTranslations.push_back(ArgTranslation);
}
}
void AArch64Arm64ECCallLowering::getThunkRetType(
FunctionType *FT, AttributeList AttrList, raw_ostream &Out,
Type *&Arm64RetTy, Type *&X64RetTy, SmallVectorImpl<Type *> &Arm64ArgTypes,
- SmallVectorImpl<Type *> &X64ArgTypes, bool &HasSretPtr) {
+ SmallVectorImpl<Type *> &X64ArgTypes,
+ SmallVector<ThunkArgTranslation> &ArgTranslations, bool &HasSretPtr) {
Type *T = FT->getReturnType();
#if 0
// FIXME: Need more information about argument size; see
@@ -209,35 +232,44 @@ void AArch64Arm64ECCallLowering::getThunkRetType(
#endif
if (T->isVoidTy()) {
if (FT->getNumParams()) {
- auto SRetAttr = AttrList.getParamAttr(0, Attribute::StructRet);
- auto InRegAttr = AttrList.getParamAttr(0, Attribute::InReg);
- if (SRetAttr.isValid() && InRegAttr.isValid()) {
+ Attribute SRetAttr0 = AttrList.getParamAttr(0, Attribute::StructRet);
+ Attribute InRegAttr0 = AttrList.getParamAttr(0, Attribute::InReg);
+ Attribute SRetAttr1, InRegAttr1;
+ if (FT->getNumParams() > 1) {
+ // Also check the second parameter (for class methods, the first
+ // parameter is "this", and the second parameter is the sret pointer.)
+ // It doesn't matter which one is sret.
+ SRetAttr1 = AttrList.getParamAttr(1, Attribute::StructRet);
+ InRegAttr1 = AttrList.getParamAttr(1, Attribute::InReg);
+ }
+ if ((SRetAttr0.isValid() && InRegAttr0.isValid()) ||
+ (SRetAttr1.isValid() && InRegAttr1.isValid())) {
// sret+inreg indicates a call that returns a C++ class value. This is
// actually equivalent to just passing and returning a void* pointer
- // as the first argument. Translate it that way, instead of trying
- // to model "inreg" in the thunk's calling convention, to simplify
- // the rest of the code.
+ // as the first or second argument. Translate it that way, instead of
+ // trying to model "inreg" in the thunk's calling convention; this
+ // simplfies the rest of the code, and matches MSVC mangling.
Out << "i8";
Arm64RetTy = I64Ty;
X64RetTy = I64Ty;
return;
}
- if (SRetAttr.isValid()) {
+ if (SRetAttr0.isValid()) {
// FIXME: Sanity-check the sret type; if it's an integer or pointer,
// we'll get screwy mangling/codegen.
// FIXME: For large struct types, mangle as an integer argument and
// integer return, so we can reuse more thunks, instead of "m" syntax.
// (MSVC mangles this case as an integer return with no argument, but
// that's a miscompile.)
- Type *SRetType = SRetAttr.getValueAsType();
+ Type *SRetType = SRetAttr0.getValueAsType();
Align SRetAlign = AttrList.getParamAlignment(0).valueOrOne();
- Type *Arm64Ty, *X64Ty;
canonicalizeThunkType(SRetType, SRetAlign, /*Ret*/ true, ArgSizeBytes,
- Out, Arm64Ty, X64Ty);
+ Out);
Arm64RetTy = VoidTy;
X64RetTy = VoidTy;
Arm64ArgTypes.push_back(FT->getParamType(0));
X64ArgTypes.push_back(FT->getParamType(0));
+ ArgTranslations.push_back(ThunkArgTranslation::Direct);
HasSretPtr = true;
return;
}
@@ -249,8 +281,10 @@ void AArch64Arm64ECCallLowering::getThunkRetType(
return;
}
- canonicalizeThunkType(T, Align(), /*Ret*/ true, ArgSizeBytes, Out, Arm64RetTy,
- X64RetTy);
+ auto info =
+ canonicalizeThunkType(T, Align(), /*Ret*/ true, ArgSizeBytes, Out);
+ Arm64RetTy = info.Arm64Ty;
+ X64RetTy = info.X64Ty;
if (X64RetTy->isPointerTy()) {
// If the X64 type is canonicalized to a pointer, that means it's
// passed/returned indirectly. For a return value, that means it's an
@@ -260,21 +294,33 @@ void AArch64Arm64ECCallLowering::getThunkRetType(
}
}
-void AArch64Arm64ECCallLowering::canonicalizeThunkType(
- Type *T, Align Alignment, bool Ret, uint64_t ArgSizeBytes, raw_ostream &Out,
- Type *&Arm64Ty, Type *&X64Ty) {
+ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType(
+ Type *T, Align Alignment, bool Ret, uint64_t ArgSizeBytes,
+ raw_ostream &Out) {
+
+ auto direct = [](Type *T) {
+ return ThunkArgInfo{T, T, ThunkArgTranslation::Direct};
+ };
+
+ auto bitcast = [this](Type *Arm64Ty, uint64_t SizeInBytes) {
+ return ThunkArgInfo{Arm64Ty,
+ llvm::Type::getIntNTy(M->getContext(), SizeInBytes * 8),
+ ThunkArgTranslation::Bitcast};
+ };
+
+ auto pointerIndirection = [this](Type *Arm64Ty) {
+ return ThunkArgInfo{Arm64Ty, PtrTy,
+ ThunkArgTranslation::PointerIndirection};
+ };
+
if (T->isFloatTy()) {
Out << "f";
- Arm64Ty = T;
- X64Ty = T;
- return;
+ return direct(T);
}
if (T->isDoubleTy()) {
Out << "d";
- Arm64Ty = T;
- X64Ty = T;
- return;
+ return direct(T);
}
if (T->isFloatingPointTy()) {
@@ -297,16 +343,14 @@ void AArch64Arm64ECCallLowering::canonicalizeThunkType(
Out << (ElementTy->isFloatTy() ? "F" : "D") << TotalSizeBytes;
if (Alignment.value() >= 16 && !Ret)
Out << "a" << Alignment.value();
- Arm64Ty = T;
if (TotalSizeBytes <= 8) {
// Arm64 returns small structs of float/double in float registers;
// X64 uses RAX.
- X64Ty = llvm::Type::getIntNTy(M->getContext(), TotalSizeBytes * 8);
+ return bitcast(T, TotalSizeBytes);
} else {
// Struct is passed directly on Arm64, but indirectly on X64.
- X64Ty = PtrTy;
+ return pointerIndirection(T);
}
- return;
} else if (T->isFloatingPointTy()) {
report_fatal_error("Only 32 and 64 bit floating points are supported for "
"ARM64EC thunks");
@@ -315,9 +359,7 @@ void AArch64Arm64ECCallLowering::canonicalizeThunkType(
if ((T->isIntegerTy() || T->isPointerTy()) && DL.getTypeSizeInBits(T) <= 64) {
Out << "i8";
- Arm64Ty = I64Ty;
- X64Ty = I64Ty;
- return;
+ return direct(I64Ty);
}
unsigned TypeSize = ArgSizeBytes;
@@ -329,13 +371,12 @@ void AArch64Arm64ECCallLowering::canonicalizeThunkType(
if (Alignment.value() >= 16 && !Ret)
Out << "a" << Alignment.value();
// FIXME: Try to canonicalize Arm64Ty more thoroughly?
- Arm64Ty = T;
if (TypeSize == 1 || TypeSize == 2 || TypeSize == 4 || TypeSize == 8) {
// Pass directly in an integer register
- X64Ty = llvm::Type::getIntNTy(M->getContext(), TypeSize * 8);
+ return bitcast(T, TypeSize);
} else {
// Passed directly on Arm64, but indirectly on X64.
- X64Ty = PtrTy;
+ return pointerIndirection(T);
}
}
@@ -346,8 +387,9 @@ Function *AArch64Arm64ECCallLowering::buildExitThunk(FunctionType *FT,
SmallString<256> ExitThunkName;
llvm::raw_svector_ostream ExitThunkStream(ExitThunkName);
FunctionType *Arm64Ty, *X64Ty;
+ SmallVector<ThunkArgTranslation> ArgTranslations;
getThunkType(FT, Attrs, Arm64ECThunkType::Exit, ExitThunkStream, Arm64Ty,
- X64Ty);
+ X64Ty, ArgTranslations);
if (Function *F = M->getFunction(ExitThunkName))
return F;
@@ -378,6 +420,7 @@ Function *AArch64Arm64ECCallLowering::buildExitThunk(FunctionType *FT,
SmallVector<Value *> Args;
// Pass the called function in x9.
+ auto X64TyOffset = 1;
Args.push_back(F->arg_begin());
Type *RetTy = Arm64Ty->getReturnType();
@@ -387,10 +430,14 @@ Function *AArch64Arm64ECCallLowering::buildExitThunk(FunctionType *FT,
// pointer.
if (DL.getTypeStoreSize(RetTy) > 8) {
Args.push_back(IRB.CreateAlloca(RetTy));
+ X64TyOffset++;
}
}
- for (auto &Arg : make_range(F->arg_begin() + 1, F->arg_end())) {
+ for (auto [Arg, X64ArgType, ArgTranslation] : llvm::zip_equal(
+ make_range(F->arg_begin() + 1, F->arg_end()),
+ make_range(X64Ty->param_begin() + X64TyOffset, X64Ty->param_end()),
+ ArgTranslations)) {
// Translate arguments from AArch64 calling convention to x86 calling
// convention.
//
@@ -405,18 +452,20 @@ Function *AArch64Arm64ECCallLowering::buildExitThunk(FunctionType *FT,
// with an attribute.)
//
// The first argument is the called function, stored in x9.
- if (Arg.getType()->isArrayTy() || Arg.getType()->isStructTy() ||
- DL.getTypeStoreSize(Arg.getType()) > 8) {
+ if (ArgTranslation != ThunkArgTranslation::Direct) {
Value *Mem = IRB.CreateAlloca(Arg.getType());
IRB.CreateStore(&Arg, Mem);
- if (DL.getTypeStoreSize(Arg.getType()) <= 8) {
+ if (ArgTranslation == ThunkArgTranslation::Bitcast) {
Type *IntTy = IRB.getIntNTy(DL.getTypeStoreSizeInBits(Arg.getType()));
Args.push_back(IRB.CreateLoad(IntTy, IRB.CreateBitCast(Mem, PtrTy)));
- } else
+ } else {
+ assert(ArgTranslation == ThunkArgTranslation::PointerIndirection);
Args.push_back(Mem);
+ }
} else {
Args.push_back(&Arg);
}
+ assert(Args.back()->getType() == X64ArgType);
}
// FIXME: Transfer necessary attributes? sret? anything else?
@@ -450,8 +499,10 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) {
SmallString<256> EntryThunkName;
llvm::raw_svector_ostream EntryThunkStream(EntryThunkName);
FunctionType *Arm64Ty, *X64Ty;
+ SmallVector<ThunkArgTranslation> ArgTranslations;
getThunkType(F->getFunctionType(), F->getAttributes(),
- Arm64ECThunkType::Entry, EntryThunkStream, Arm64Ty, X64Ty);
+ Arm64ECThunkType::Entry, EntryThunkStream, Arm64Ty, X64Ty,
+ ArgTranslations);
if (Function *F = M->getFunction(EntryThunkName))
return F;
@@ -463,7 +514,6 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) {
// Copy MSVC, and always set up a frame pointer. (Maybe this isn't necessary.)
Thunk->addFnAttr("frame-pointer", "all");
- auto &DL = M->getDataLayout();
BasicBlock *BB = BasicBlock::Create(M->getContext(), "", Thunk);
IRBuilder<> IRB(BB);
@@ -472,24 +522,28 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) {
bool TransformDirectToSRet = X64RetType->isVoidTy() && !RetTy->isVoidTy();
unsigned ThunkArgOffset = TransformDirectToSRet ? 2 : 1;
- unsigned PassthroughArgSize = F->isVarArg() ? 5 : Thunk->arg_size();
+ unsigned PassthroughArgSize =
+ (F->isVarArg() ? 5 : Thunk->arg_size()) - ThunkArgOffset;
+ assert(ArgTranslations.size() == F->isVarArg() ? 5 : PassthroughArgSize);
// Translate arguments to call.
SmallVector<Value *> Args;
- for (unsigned i = ThunkArgOffset, e = PassthroughArgSize; i != e; ++i) {
- Value *Arg = Thunk->getArg(i);
- Type *ArgTy = Arm64Ty->getParamType(i - ThunkArgOffset);
- if (ArgTy->isArrayTy() || ArgTy->isStructTy() ||
- DL.getTypeStoreSize(ArgTy) > 8) {
+ for (unsigned i = 0; i != PassthroughArgSize; ++i) {
+ Value *Arg = Thunk->getArg(i + ThunkArgOffset);
+ Type *ArgTy = Arm64Ty->getParamType(i);
+ ThunkArgTranslation ArgTranslation = ArgTranslations[i];
+ if (ArgTranslation != ThunkArgTranslation::Direct) {
// Translate array/struct arguments to the expected type.
- if (DL.getTypeStoreSize(ArgTy) <= 8) {
+ if (ArgTranslation == ThunkArgTranslation::Bitcast) {
Value *CastAlloca = IRB.CreateAlloca(ArgTy);
IRB.CreateStore(Arg, IRB.CreateBitCast(CastAlloca, PtrTy));
Arg = IRB.CreateLoad(ArgTy, CastAlloca);
} else {
+ assert(ArgTranslation == ThunkArgTranslation::PointerIndirection);
Arg = IRB.CreateLoad(ArgTy, IRB.CreateBitCast(Arg, PtrTy));
}
}
+ assert(Arg->getType() == ArgTy);
Args.push_back(Arg);
}
@@ -549,8 +603,10 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) {
Function *AArch64Arm64ECCallLowering::buildGuestExitThunk(Function *F) {
llvm::raw_null_ostream NullThunkName;
FunctionType *Arm64Ty, *X64Ty;
+ SmallVector<ThunkArgTranslation> ArgTranslations;
getThunkType(F->getFunctionType(), F->getAttributes(),
- Arm64ECThunkType::GuestExit, NullThunkName, Arm64Ty, X64Ty);
+ Arm64ECThunkType::GuestExit, NullThunkName, Arm64Ty, X64Ty,
+ ArgTranslations);
auto MangledName = getArm64ECMangledFunctionName(F->getName().str());
assert(MangledName && "Can't guest exit to function that's already native");
std::string ThunkName = *MangledName;
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 7da540f8ef8e..da11539eab34 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -90,6 +90,8 @@ public:
return MCInstLowering.lowerOperand(MO, MCOp);
}
+ const MCExpr *lowerConstantPtrAuth(const ConstantPtrAuth &CPA) override;
+
void emitStartOfAsmFile(Module &M) override;
void emitJumpTableInfo() override;
std::tuple<const MCSymbol *, uint64_t, const MCSymbol *,
@@ -1575,6 +1577,52 @@ void AArch64AsmPrinter::emitPtrauthBranch(const MachineInstr *MI) {
assert(STI->getInstrInfo()->getInstSizeInBytes(*MI) >= InstsEmitted * 4);
}
+const MCExpr *
+AArch64AsmPrinter::lowerConstantPtrAuth(const ConstantPtrAuth &CPA) {
+ MCContext &Ctx = OutContext;
+
+ // Figure out the base symbol and the addend, if any.
+ APInt Offset(64, 0);
+ const Value *BaseGV = CPA.getPointer()->stripAndAccumulateConstantOffsets(
+ getDataLayout(), Offset, /*AllowNonInbounds=*/true);
+
+ auto *BaseGVB = dyn_cast<GlobalValue>(BaseGV);
+
+ // If we can't understand the referenced ConstantExpr, there's nothing
+ // else we can do: emit an error.
+ if (!BaseGVB) {
+ BaseGV->getContext().emitError(
+ "cannot resolve target base/addend of ptrauth constant");
+ return nullptr;
+ }
+
+ // If there is an addend, turn that into the appropriate MCExpr.
+ const MCExpr *Sym = MCSymbolRefExpr::create(getSymbol(BaseGVB), Ctx);
+ if (Offset.sgt(0))
+ Sym = MCBinaryExpr::createAdd(
+ Sym, MCConstantExpr::create(Offset.getSExtValue(), Ctx), Ctx);
+ else if (Offset.slt(0))
+ Sym = MCBinaryExpr::createSub(
+ Sym, MCConstantExpr::create((-Offset).getSExtValue(), Ctx), Ctx);
+
+ uint64_t KeyID = CPA.getKey()->getZExtValue();
+ // We later rely on valid KeyID value in AArch64PACKeyIDToString call from
+ // AArch64AuthMCExpr::printImpl, so fail fast.
+ if (KeyID > AArch64PACKey::LAST)
+ report_fatal_error("AArch64 PAC Key ID '" + Twine(KeyID) +
+ "' out of range [0, " +
+ Twine((unsigned)AArch64PACKey::LAST) + "]");
+
+ uint64_t Disc = CPA.getDiscriminator()->getZExtValue();
+ if (!isUInt<16>(Disc))
+ report_fatal_error("AArch64 PAC Discriminator '" + Twine(Disc) +
+ "' out of range [0, 0xFFFF]");
+
+ // Finally build the complete @AUTH expr.
+ return AArch64AuthMCExpr::create(Sym, Disc, AArch64PACKey::ID(KeyID),
+ CPA.hasAddressDiscriminator(), Ctx);
+}
+
// Simple pseudo-instructions have their lowering (with expansion to real
// instructions) auto-generated.
#include "AArch64GenMCPseudoLowering.inc"
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index 941990c53c4a..2f7e226fd09b 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -500,23 +500,31 @@ def CC_AArch64_Preserve_None : CallingConv<[
// - X8, used for sret
// - X16/X17, used by the linker as IP0/IP1
// - X18, the platform register
+ // - X19, the base pointer
// - X29, the frame pointer
// - X30, the link register
// General registers are not preserved with the exception of
// FP, LR, and X18
// Non-volatile registers are used first, so functions may call
// normal functions without saving and reloading arguments.
- CCIfType<[i32], CCAssignToReg<[W19, W20, W21, W22, W23,
+ // X9 is assigned last as it is used in FrameLowering as the first
+ // choice for a scratch register.
+ CCIfType<[i32], CCAssignToReg<[W20, W21, W22, W23,
W24, W25, W26, W27, W28,
W0, W1, W2, W3, W4, W5,
- W6, W7, W9, W10, W11,
- W12, W13, W14, W15]>>,
- CCIfType<[i64], CCAssignToReg<[X19, X20, X21, X22, X23,
+ W6, W7, W10, W11,
+ W12, W13, W14, W9]>>,
+ CCIfType<[i64], CCAssignToReg<[X20, X21, X22, X23,
X24, X25, X26, X27, X28,
X0, X1, X2, X3, X4, X5,
- X6, X7, X9, X10, X11,
- X12, X13, X14, X15]>>,
-
+ X6, X7, X10, X11,
+ X12, X13, X14, X9]>>,
+
+ // Windows uses X15 for stack allocation
+ CCIf<"!State.getMachineFunction().getSubtarget<AArch64Subtarget>().isTargetWindows()",
+ CCIfType<[i32], CCAssignToReg<[W15]>>>,
+ CCIf<"!State.getMachineFunction().getSubtarget<AArch64Subtarget>().isTargetWindows()",
+ CCIfType<[i64], CCAssignToReg<[X15]>>>,
CCDelegateTo<CC_AArch64_AAPCS>
]>;
diff --git a/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
index 3f244ba10102..154ae43b29d5 100644
--- a/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
@@ -50,7 +50,8 @@ struct LDTLSCleanup : public MachineFunctionPass {
return false;
}
- MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+ MachineDominatorTree *DT =
+ &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
return VisitNode(DT->getRootNode(), 0);
}
@@ -138,7 +139,7 @@ struct LDTLSCleanup : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
index 2a4a3c0df08f..68243258a68f 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -126,7 +126,7 @@ char AArch64ConditionOptimizer::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64ConditionOptimizer, "aarch64-condopt",
"AArch64 CondOpt Pass", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(AArch64ConditionOptimizer, "aarch64-condopt",
"AArch64 CondOpt Pass", false, false)
@@ -135,8 +135,8 @@ FunctionPass *llvm::createAArch64ConditionOptimizerPass() {
}
void AArch64ConditionOptimizer::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -332,7 +332,7 @@ bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) {
return false;
TII = MF.getSubtarget().getInstrInfo();
- DomTree = &getAnalysis<MachineDominatorTree>();
+ DomTree = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
MRI = &MF.getRegInfo();
bool Changed = false;
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 8c16a88a13a4..9a788123b1ff 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -795,7 +795,7 @@ char AArch64ConditionalCompares::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp",
"AArch64 CCMP Pass", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp",
"AArch64 CCMP Pass", false, false)
@@ -806,8 +806,8 @@ FunctionPass *llvm::createAArch64ConditionalCompares() {
void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<MachineBranchProbabilityInfo>();
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.addRequired<MachineLoopInfo>();
AU.addPreserved<MachineLoopInfo>();
AU.addRequired<MachineTraceMetrics>();
@@ -933,7 +933,7 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
TRI = MF.getSubtarget().getRegisterInfo();
SchedModel = MF.getSubtarget().getSchedModel();
MRI = &MF.getRegInfo();
- DomTree = &getAnalysis<MachineDominatorTree>();
+ DomTree = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
Loops = &getAnalysis<MachineLoopInfo>();
MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
Traces = &getAnalysis<MachineTraceMetrics>();
diff --git a/llvm/lib/Target/AArch64/AArch64FMV.td b/llvm/lib/Target/AArch64/AArch64FMV.td
new file mode 100644
index 000000000000..7a40c83b2bb2
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64FMV.td
@@ -0,0 +1,99 @@
+//=------ AArch64FMV.td - Describe AArch64 FMV Features ------*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Function MultiVersioning (FMV) properties. FMV features are accepted by the
+// attributes target_version and target_clones, and they correspond to a mapping
+// from the FMV feature name to:
+// - A bit in the FMV ABI, as defined by the ACLE.
+// - The FMV priority, as defined by the ACLE.
+// - A list of backend features.
+//
+// The list of backend features is not a set of dependencies; it is specific to
+// LLVM and indicates how to do codegen when the FMV feature is present.
+//
+// Therefore FMVExtensions are separated from regular AArch64 Extensions, which
+// encode dependencies between themselves and other SubtargetFeatures.
+//===----------------------------------------------------------------------===//
+
+
+// Something you can add to target_version or target_clones.
+class FMVExtension<string n, string b, string f, int p> {
+ // Name, as spelled in target_version or target_clones. e.g. "memtag".
+ string Name = n;
+
+ // A C++ expression giving the number of the bit in the FMV ABI.
+ // Currently this is given as a value from the enum "CPUFeatures".
+ string Bit = b;
+
+ // SubtargetFeatures enabled for codegen when this FMV feature is present.
+ string BackendFeatures = f;
+
+ // The FMV priority.
+ int Priority = p;
+}
+
+def : FMVExtension<"aes", "FEAT_AES", "+fp-armv8,+neon", 150>;
+def : FMVExtension<"bf16", "FEAT_BF16", "+bf16", 280>;
+def : FMVExtension<"bti", "FEAT_BTI", "+bti", 510>;
+def : FMVExtension<"crc", "FEAT_CRC", "+crc", 110>;
+def : FMVExtension<"dgh", "FEAT_DGH", "", 260>;
+def : FMVExtension<"dit", "FEAT_DIT", "+dit", 180>;
+def : FMVExtension<"dotprod", "FEAT_DOTPROD", "+dotprod,+fp-armv8,+neon", 104>;
+def : FMVExtension<"dpb", "FEAT_DPB", "+ccpp", 190>;
+def : FMVExtension<"dpb2", "FEAT_DPB2", "+ccpp,+ccdp", 200>;
+def : FMVExtension<"ebf16", "FEAT_EBF16", "+bf16", 290>;
+def : FMVExtension<"f32mm", "FEAT_SVE_F32MM", "+sve,+f32mm,+fullfp16,+fp-armv8,+neon", 350>;
+def : FMVExtension<"f64mm", "FEAT_SVE_F64MM", "+sve,+f64mm,+fullfp16,+fp-armv8,+neon", 360>;
+def : FMVExtension<"fcma", "FEAT_FCMA", "+fp-armv8,+neon,+complxnum", 220>;
+def : FMVExtension<"flagm", "FEAT_FLAGM", "+flagm", 20>;
+def : FMVExtension<"flagm2", "FEAT_FLAGM2", "+flagm,+altnzcv", 30>;
+def : FMVExtension<"fp", "FEAT_FP", "+fp-armv8,+neon", 90>;
+def : FMVExtension<"fp16", "FEAT_FP16", "+fullfp16,+fp-armv8,+neon", 170>;
+def : FMVExtension<"fp16fml", "FEAT_FP16FML", "+fp16fml,+fullfp16,+fp-armv8,+neon", 175>;
+def : FMVExtension<"frintts", "FEAT_FRINTTS", "+fptoint", 250>;
+def : FMVExtension<"i8mm", "FEAT_I8MM", "+i8mm", 270>;
+def : FMVExtension<"jscvt", "FEAT_JSCVT", "+fp-armv8,+neon,+jsconv", 210>;
+def : FMVExtension<"ls64", "FEAT_LS64", "", 520>;
+def : FMVExtension<"ls64_accdata", "FEAT_LS64_ACCDATA", "+ls64", 540>;
+def : FMVExtension<"ls64_v", "FEAT_LS64_V", "", 530>;
+def : FMVExtension<"lse", "FEAT_LSE", "+lse", 80>;
+def : FMVExtension<"memtag", "FEAT_MEMTAG", "", 440>;
+def : FMVExtension<"memtag2", "FEAT_MEMTAG2", "+mte", 450>;
+def : FMVExtension<"memtag3", "FEAT_MEMTAG3", "+mte", 460>;
+def : FMVExtension<"mops", "FEAT_MOPS", "+mops", 650>;
+def : FMVExtension<"pmull", "FEAT_PMULL", "+aes,+fp-armv8,+neon", 160>;
+def : FMVExtension<"predres", "FEAT_PREDRES", "+predres", 480>;
+def : FMVExtension<"rcpc", "FEAT_RCPC", "+rcpc", 230>;
+def : FMVExtension<"rcpc2", "FEAT_RCPC2", "+rcpc", 240>;
+def : FMVExtension<"rcpc3", "FEAT_RCPC3", "+rcpc,+rcpc3", 241>;
+def : FMVExtension<"rdm", "FEAT_RDM", "+rdm,+fp-armv8,+neon", 108>;
+def : FMVExtension<"rng", "FEAT_RNG", "+rand", 10>;
+def : FMVExtension<"rpres", "FEAT_RPRES", "", 300>;
+def : FMVExtension<"sb", "FEAT_SB", "+sb", 470>;
+def : FMVExtension<"sha1", "FEAT_SHA1", "+fp-armv8,+neon", 120>;
+def : FMVExtension<"sha2", "FEAT_SHA2", "+sha2,+fp-armv8,+neon", 130>;
+def : FMVExtension<"sha3", "FEAT_SHA3", "+sha3,+sha2,+fp-armv8,+neon", 140>;
+def : FMVExtension<"simd", "FEAT_SIMD", "+fp-armv8,+neon", 100>;
+def : FMVExtension<"sm4", "FEAT_SM4", "+sm4,+fp-armv8,+neon", 106>;
+def : FMVExtension<"sme", "FEAT_SME", "+sme,+bf16", 430>;
+def : FMVExtension<"sme-f64f64", "FEAT_SME_F64", "+sme,+sme-f64f64,+bf16", 560>;
+def : FMVExtension<"sme-i16i64", "FEAT_SME_I64", "+sme,+sme-i16i64,+bf16", 570>;
+def : FMVExtension<"sme2", "FEAT_SME2", "+sme2,+sme,+bf16", 580>;
+def : FMVExtension<"ssbs", "FEAT_SSBS", "", 490>;
+def : FMVExtension<"ssbs2", "FEAT_SSBS2", "+ssbs", 500>;
+def : FMVExtension<"sve", "FEAT_SVE", "+sve,+fullfp16,+fp-armv8,+neon", 310>;
+def : FMVExtension<"sve-bf16", "FEAT_SVE_BF16", "+sve,+bf16,+fullfp16,+fp-armv8,+neon", 320>;
+def : FMVExtension<"sve-ebf16", "FEAT_SVE_EBF16", "+sve,+bf16,+fullfp16,+fp-armv8,+neon", 330>;
+def : FMVExtension<"sve-i8mm", "FEAT_SVE_I8MM", "+sve,+i8mm,+fullfp16,+fp-armv8,+neon", 340>;
+def : FMVExtension<"sve2", "FEAT_SVE2", "+sve2,+sve,+fullfp16,+fp-armv8,+neon", 370>;
+def : FMVExtension<"sve2-aes", "FEAT_SVE_AES", "+sve2,+sve,+sve2-aes,+fullfp16,+fp-armv8,+neon", 380>;
+def : FMVExtension<"sve2-bitperm", "FEAT_SVE_BITPERM", "+sve2,+sve,+sve2-bitperm,+fullfp16,+fp-armv8,+neon", 400>;
+def : FMVExtension<"sve2-pmull128", "FEAT_SVE_PMULL128", "+sve2,+sve,+sve2-aes,+fullfp16,+fp-armv8,+neon", 390>;
+def : FMVExtension<"sve2-sha3", "FEAT_SVE_SHA3", "+sve2,+sve,+sve2-sha3,+fullfp16,+fp-armv8,+neon", 410>;
+def : FMVExtension<"sve2-sm4", "FEAT_SVE_SM4", "+sve2,+sve,+sve2-sm4,+fullfp16,+fp-armv8,+neon", 420>;
+def : FMVExtension<"wfxt", "FEAT_WFXT", "+wfxt", 550>;
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index ffb899a30145..8c1003c085e6 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -11,24 +11,11 @@
// A SubtargetFeature that can be toggled from the command line, and therefore
// has an AEK_* entry in ArmExtKind.
-//
-// If Function MultiVersioning (FMV) properties are left at their defaults
-// (FEAT_INIT, no dependencies, priority 0) it indiates that this extension is
-// not an FMV feature, but can be enabled via the command line (-march, -mcpu,
-// etc).
-//
-// Conversely if the ArchExtKindSpelling is set to AEK_NONE, this indicates
-// that a feature is FMV-only, and can not be selected on the command line.
-// Such extensions should be added via FMVOnlyExtension.
class Extension<
string TargetFeatureName, // String used for -target-feature and -march, unless overridden.
string Spelling, // The XYZ in HasXYZ and AEK_XYZ.
string Desc, // Description.
- list<SubtargetFeature> Implies = [], // List of dependent features.
- // FMV properties
- string _FMVBit = "FEAT_INIT", // FEAT_INIT is repurposed to indicate "not an FMV feature"
- string _FMVDependencies = "",
- int _FMVPriority = 0
+ list<SubtargetFeature> Implies = [] // List of dependent features.
> : SubtargetFeature<TargetFeatureName, "Has" # Spelling, "true", Desc, Implies>
{
string ArchExtKindSpelling = "AEK_" # Spelling; // ArchExtKind enum name.
@@ -42,57 +29,9 @@ class Extension<
// An alias that can be used on the command line, if the extension has one.
// Used for correcting historical names while remaining backwards compatible.
string MArchAlias = "";
-
- // Function MultiVersioning (FMV) properties
-
- // A C++ expression giving the number of the bit in the FMV ABI.
- // Currently this is given as a value from the enum "CPUFeatures".
- // If this is not set, it indicates that this is not an FMV extension.
- string FMVBit = _FMVBit;
-
- // List of features that this feature depends on.
- // FIXME generate this from Implies.
- string FMVDependencies = _FMVDependencies;
-
- // The FMV priority
- int FMVPriority = _FMVPriority;
}
-// Some extensions are available for FMV but can not be controlled via the
-// command line. These entries:
-// - are SubtargetFeatures, so they have (unused) FieldNames on the subtarget
-// e.g. HasFMVOnlyFEAT_XYZ
-// - have incorrect (empty) Implies fields, because the code that handles FMV
-// ignores these dependencies and looks only at FMVDependencies.
-// - have no description.
-//
-// In the generated data structures for extensions (ExtensionInfo), AEK_NONE is
-// used to indicate that a feature is FMV only. Therefore ArchExtKindSpelling is
-// manually overridden here.
-class FMVOnlyExtension<string FMVBit, string Name, string Deps, int Priority>
- : Extension<Name, "FMVOnly"#FMVBit, "", [], FMVBit, Deps, Priority> {
- let ArchExtKindSpelling = "AEK_NONE"; // AEK_NONE indicates FMV-only feature
-}
-def : FMVOnlyExtension<"FEAT_DGH", "dgh", "", 260>;
-def : FMVOnlyExtension<"FEAT_DPB", "dpb", "+ccpp", 190>;
-def : FMVOnlyExtension<"FEAT_DPB2", "dpb2", "+ccpp,+ccdp", 200>;
-def : FMVOnlyExtension<"FEAT_EBF16", "ebf16", "+bf16", 290>;
-def : FMVOnlyExtension<"FEAT_FLAGM2", "flagm2", "+flagm,+altnzcv", 30>;
-def : FMVOnlyExtension<"FEAT_FRINTTS", "frintts", "+fptoint", 250>;
-def : FMVOnlyExtension<"FEAT_LS64_ACCDATA", "ls64_accdata", "+ls64", 540>;
-def : FMVOnlyExtension<"FEAT_LS64_V", "ls64_v", "", 530>;
-def : FMVOnlyExtension<"FEAT_MEMTAG2", "memtag2", "+mte", 450>;
-def : FMVOnlyExtension<"FEAT_MEMTAG3", "memtag3", "+mte", 460>;
-def : FMVOnlyExtension<"FEAT_PMULL", "pmull", "+aes,+fp-armv8,+neon", 160>;
-def : FMVOnlyExtension<"FEAT_RCPC2", "rcpc2", "+rcpc", 240>;
-def : FMVOnlyExtension<"FEAT_RPRES", "rpres", "", 300>;
-def : FMVOnlyExtension<"FEAT_SHA1", "sha1", "+fp-armv8,+neon", 120>;
-def : FMVOnlyExtension<"FEAT_SSBS2", "ssbs2", "+ssbs", 500>;
-def : FMVOnlyExtension<"FEAT_SVE_BF16", "sve-bf16", "+sve,+bf16,+fullfp16,+fp-armv8,+neon", 320>;
-def : FMVOnlyExtension<"FEAT_SVE_EBF16", "sve-ebf16", "+sve,+bf16,+fullfp16,+fp-armv8,+neon", 330>;
-def : FMVOnlyExtension<"FEAT_SVE_I8MM", "sve-i8mm", "+sve,+i8mm,+fullfp16,+fp-armv8,+neon", 340>;
-def : FMVOnlyExtension<"FEAT_SVE_PMULL128", "sve2-pmull128", "+sve2,+sve,+sve2-aes,+fullfp16,+fp-armv8,+neon", 390>;
// Each SubtargetFeature which corresponds to an Arm Architecture feature should
@@ -101,35 +40,26 @@ def : FMVOnlyExtension<"FEAT_SVE_PMULL128", "sve2-pmull128", "+sve2,+sve,+sve2-a
// Arm Architecture Features, it should list all the relevant features. Not all
// FEAT_ features have a corresponding SubtargetFeature.
+
+//===----------------------------------------------------------------------===//
+// Armv8.0 Architecture Extensions
+//===----------------------------------------------------------------------===//
+
let ArchExtKindSpelling = "AEK_FP", MArchName = "fp" in
def FeatureFPARMv8 : Extension<"fp-armv8", "FPARMv8",
- "Enable ARMv8 (FEAT_FP)", [],
- "FEAT_FP", "+fp-armv8,+neon", 90>;
+ "Enable ARMv8 (FEAT_FP)">;
let ArchExtKindSpelling = "AEK_SIMD", MArchName = "simd" in
def FeatureNEON : Extension<"neon", "NEON",
- "Enable Advanced SIMD instructions (FEAT_AdvSIMD)", [FeatureFPARMv8],
- "FEAT_SIMD", "+fp-armv8,+neon", 100>;
-
-def FeatureSM4 : Extension<
- "sm4", "SM4",
- "Enable SM3 and SM4 support (FEAT_SM4, FEAT_SM3)", [FeatureNEON],
- "FEAT_SM4", "+sm4,+fp-armv8,+neon", 106>;
+ "Enable Advanced SIMD instructions (FEAT_AdvSIMD)", [FeatureFPARMv8]>;
def FeatureSHA2 : Extension<
"sha2", "SHA2",
- "Enable SHA1 and SHA256 support (FEAT_SHA1, FEAT_SHA256)", [FeatureNEON],
- "FEAT_SHA2", "+sha2,+fp-armv8,+neon", 130>;
-
-def FeatureSHA3 : Extension<
- "sha3", "SHA3",
- "Enable SHA512 and SHA3 support (FEAT_SHA3, FEAT_SHA512)", [FeatureNEON, FeatureSHA2],
- "FEAT_SHA3", "+sha3,+sha2,+fp-armv8,+neon", 140>;
+ "Enable SHA1 and SHA256 support (FEAT_SHA1, FEAT_SHA256)", [FeatureNEON]>;
def FeatureAES : Extension<
"aes", "AES",
- "Enable AES support (FEAT_AES, FEAT_PMULL)", [FeatureNEON],
- "FEAT_AES", "+fp-armv8,+neon", 150>;
+ "Enable AES support (FEAT_AES, FEAT_PMULL)", [FeatureNEON]>;
// Crypto has been split up and any combination is now valid (see the
// crypto definitions above). Also, crypto is now context sensitive:
@@ -139,39 +69,33 @@ def FeatureAES : Extension<
// meaning anymore. We kept the Crypto definition here for backward
// compatibility, and now imply features SHA2 and AES, which was the
// "traditional" meaning of Crypto.
-let FMVDependencies = "+aes,+sha2" in
def FeatureCrypto : Extension<"crypto", "Crypto",
"Enable cryptographic instructions", [FeatureNEON, FeatureSHA2, FeatureAES]>;
def FeatureCRC : Extension<"crc", "CRC",
- "Enable ARMv8 CRC-32 checksum instructions (FEAT_CRC32)", [],
- "FEAT_CRC", "+crc", 110>;
+ "Enable ARMv8 CRC-32 checksum instructions (FEAT_CRC32)">;
-def FeatureRAS : Extension<"ras", "RAS",
- "Enable ARMv8 Reliability, Availability and Serviceability Extensions (FEAT_RAS, FEAT_RASv1p1)">;
-
-def FeatureRASv2 : Extension<"rasv2", "RASv2",
- "Enable ARMv8.9-A Reliability, Availability and Serviceability Extensions (FEAT_RASv2)",
- [FeatureRAS]>;
-
-def FeatureLSE : Extension<"lse", "LSE",
- "Enable ARMv8.1 Large System Extension (LSE) atomic instructions (FEAT_LSE)", [],
- "FEAT_LSE", "+lse", 80>;
+// This SubtargetFeature is special. It controls only whether codegen will turn
+// `llvm.readcyclecounter()` into an access to a PMUv3 System Register. The
+// `FEAT_PMUv3*` system registers are always available for assembly/disassembly.
+let MArchName = "pmuv3" in
+def FeaturePerfMon : Extension<"perfmon", "PerfMon",
+ "Enable Code Generation for ARMv8 PMUv3 Performance Monitors extension (FEAT_PMUv3)">;
-def FeatureLSE2 : SubtargetFeature<"lse2", "HasLSE2", "true",
- "Enable ARMv8.4 Large System Extension 2 (LSE2) atomicity rules (FEAT_LSE2)">;
+def FeatureSpecRestrict : SubtargetFeature<"specrestrict", "HasSpecRestrict",
+ "true", "Enable architectural speculation restriction (FEAT_CSV2_2)">;
-def FeatureOutlineAtomics : SubtargetFeature<"outline-atomics", "OutlineAtomics", "true",
- "Enable out of line atomics to support LSE instructions">;
+//===----------------------------------------------------------------------===//
+// Armv8.1 Architecture Extensions
+//===----------------------------------------------------------------------===//
-def FeatureFMV : SubtargetFeature<"fmv", "HasFMV", "true",
- "Enable Function Multi Versioning support.">;
+def FeatureLSE : Extension<"lse", "LSE",
+ "Enable ARMv8.1 Large System Extension (LSE) atomic instructions (FEAT_LSE)">;
let MArchAlias = "rdma" in
def FeatureRDM : Extension<"rdm", "RDM",
"Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions (FEAT_RDM)",
- [FeatureNEON],
- "FEAT_RDM", "+rdm,+fp-armv8,+neon", 108>;
+ [FeatureNEON]>;
def FeaturePAN : SubtargetFeature<
"pan", "HasPAN", "true",
@@ -187,21 +111,24 @@ def FeatureCONTEXTIDREL2 : SubtargetFeature<"CONTEXTIDREL2", "HasCONTEXTIDREL2",
def FeatureVH : SubtargetFeature<"vh", "HasVH", "true",
"Enables ARM v8.1 Virtual Host extension (FEAT_VHE)", [FeatureCONTEXTIDREL2] >;
-// This SubtargetFeature is special. It controls only whether codegen will turn
-// `llvm.readcyclecounter()` into an access to a PMUv3 System Register. The
-// `FEAT_PMUv3*` system registers are always available for assembly/disassembly.
-let MArchName = "pmuv3" in
-def FeaturePerfMon : Extension<"perfmon", "PerfMon",
- "Enable Code Generation for ARMv8 PMUv3 Performance Monitors extension (FEAT_PMUv3)">;
+//===----------------------------------------------------------------------===//
+// Armv8.2 Architecture Extensions
+//===----------------------------------------------------------------------===//
+
+def FeatureSM4 : Extension<
+ "sm4", "SM4",
+ "Enable SM3 and SM4 support (FEAT_SM4, FEAT_SM3)", [FeatureNEON]>;
+
+def FeatureSHA3 : Extension<
+ "sha3", "SHA3",
+ "Enable SHA512 and SHA3 support (FEAT_SHA3, FEAT_SHA512)", [FeatureNEON, FeatureSHA2]>;
+
+def FeatureRAS : Extension<"ras", "RAS",
+ "Enable ARMv8 Reliability, Availability and Serviceability Extensions (FEAT_RAS, FEAT_RASv1p1)">;
let ArchExtKindSpelling = "AEK_FP16", MArchName = "fp16" in
def FeatureFullFP16 : Extension<"fullfp16", "FullFP16",
- "Full FP16 (FEAT_FP16)", [FeatureFPARMv8],
- "FEAT_FP16", "+fullfp16,+fp-armv8,+neon", 170>;
-
-def FeatureFP16FML : Extension<"fp16fml", "FP16FML",
- "Enable FP16 FML instructions (FEAT_FHM)", [FeatureFullFP16],
- "FEAT_FP16FML", "+fp16fml,+fullfp16,+fp-armv8,+neon", 175>;
+ "Full FP16 (FEAT_FP16)", [FeatureFPARMv8]>;
let ArchExtKindSpelling = "AEK_PROFILE", MArchName = "profile" in
def FeatureSPE : Extension<"spe", "SPE",
@@ -212,7 +139,6 @@ def FeaturePAN_RWV : SubtargetFeature<
"Enable v8.2 PAN s1e1R and s1e1W Variants (FEAT_PAN2)",
[FeaturePAN]>;
-// UAO PState
def FeaturePsUAO : SubtargetFeature< "uaops", "HasPsUAO", "true",
"Enable v8.2 UAO PState (FEAT_UAO)">;
@@ -220,66 +146,400 @@ def FeatureCCPP : SubtargetFeature<"ccpp", "HasCCPP",
"true", "Enable v8.2 data Cache Clean to Point of Persistence (FEAT_DPB)" >;
def FeatureSVE : Extension<"sve", "SVE",
- "Enable Scalable Vector Extension (SVE) instructions (FEAT_SVE)", [FeatureFullFP16],
- "FEAT_SVE", "+sve,+fullfp16,+fp-armv8,+neon", 310>;
+ "Enable Scalable Vector Extension (SVE) instructions (FEAT_SVE)", [FeatureFullFP16]>;
-// This flag is currently still labeled as Experimental, but when fully
-// implemented this should tell the compiler to use the zeroing pseudos to
-// benefit from the reverse instructions (e.g. SUB vs SUBR) if the inactive
-// lanes are known to be zero. The pseudos will then be expanded using the
-// MOVPRFX instruction to zero the inactive lanes. This feature should only be
-// enabled if MOVPRFX instructions are known to merge with the destructive
-// operations they prefix.
-//
-// This feature could similarly be extended to support cheap merging of _any_
-// value into the inactive lanes using the MOVPRFX instruction that uses
-// merging-predication.
-def FeatureExperimentalZeroingPseudos
- : SubtargetFeature<"use-experimental-zeroing-pseudos",
- "UseExperimentalZeroingPseudos", "true",
- "Hint to the compiler that the MOVPRFX instruction is "
- "merged with destructive operations",
- []>;
+let ArchExtKindSpelling = "AEK_I8MM" in
+def FeatureMatMulInt8 : Extension<"i8mm", "MatMulInt8",
+ "Enable Matrix Multiply Int8 Extension (FEAT_I8MM)", []>;
-def FeatureUseScalarIncVL : SubtargetFeature<"use-scalar-inc-vl",
- "UseScalarIncVL", "true", "Prefer inc/dec over add+cnt">;
+let ArchExtKindSpelling = "AEK_F32MM" in
+def FeatureMatMulFP32 : Extension<"f32mm", "MatMulFP32",
+ "Enable Matrix Multiply FP32 Extension (FEAT_F32MM)", [FeatureSVE]>;
+
+let ArchExtKindSpelling = "AEK_F64MM" in
+def FeatureMatMulFP64 : Extension<"f64mm", "MatMulFP64",
+ "Enable Matrix Multiply FP64 Extension (FEAT_F64MM)", [FeatureSVE]>;
+
+//===----------------------------------------------------------------------===//
+// Armv8.3 Architecture Extensions
+//===----------------------------------------------------------------------===//
+
+def FeatureRCPC : Extension<"rcpc", "RCPC",
+ "Enable support for RCPC extension (FEAT_LRCPC)", []>;
+
+def FeaturePAuth : Extension<
+ "pauth", "PAuth",
+ "Enable v8.3-A Pointer Authentication extension (FEAT_PAuth)">;
+
+let ArchExtKindSpelling = "AEK_JSCVT", MArchName = "jscvt" in
+def FeatureJS : Extension<
+ "jsconv", "JS",
+ "Enable v8.3-A JavaScript FP conversion instructions (FEAT_JSCVT)",
+ [FeatureFPARMv8]>;
+
+def FeatureCCIDX : SubtargetFeature<
+ "ccidx", "HasCCIDX", "true",
+ "Enable v8.3-A Extend of the CCSIDR number of sets (FEAT_CCIDX)">;
+
+let ArchExtKindSpelling = "AEK_FCMA", MArchName = "fcma" in
+def FeatureComplxNum : Extension<
+ "complxnum", "ComplxNum",
+ "Enable v8.3-A Floating-point complex number support (FEAT_FCMA)",
+ [FeatureNEON]>;
+
+def FeatureNV : SubtargetFeature<
+ "nv", "HasNV", "true",
+ "Enable v8.4-A Nested Virtualization Enchancement (FEAT_NV, FEAT_NV2)">;
+
+//===----------------------------------------------------------------------===//
+// Armv8.4 Architecture Extensions
+//===----------------------------------------------------------------------===//
+
+def FeatureLSE2 : SubtargetFeature<"lse2", "HasLSE2", "true",
+ "Enable ARMv8.4 Large System Extension 2 (LSE2) atomicity rules (FEAT_LSE2)">;
+
+def FeatureFP16FML : Extension<"fp16fml", "FP16FML",
+ "Enable FP16 FML instructions (FEAT_FHM)", [FeatureFullFP16]>;
+
+def FeatureDotProd : Extension<
+ "dotprod", "DotProd",
+ "Enable dot product support (FEAT_DotProd)", [FeatureNEON]>;
+
+def FeatureMPAM : SubtargetFeature<
+ "mpam", "HasMPAM", "true",
+ "Enable v8.4-A Memory system Partitioning and Monitoring extension (FEAT_MPAM)">;
+
+def FeatureDIT : SubtargetFeature<
+ "dit", "HasDIT", "true",
+ "Enable v8.4-A Data Independent Timing instructions (FEAT_DIT)">;
+
+def FeatureTRACEV8_4 : SubtargetFeature<
+ "tracev8.4", "HasTRACEV8_4", "true",
+ "Enable v8.4-A Trace extension (FEAT_TRF)">;
+
+def FeatureAM : SubtargetFeature<
+ "am", "HasAM", "true",
+ "Enable v8.4-A Activity Monitors extension (FEAT_AMUv1)">;
+
+def FeatureSEL2 : SubtargetFeature<
+ "sel2", "HasSEL2", "true",
+ "Enable v8.4-A Secure Exception Level 2 extension (FEAT_SEL2)">;
+
+def FeatureTLB_RMI : SubtargetFeature<
+ "tlb-rmi", "HasTLB_RMI", "true",
+ "Enable v8.4-A TLB Range and Maintenance Instructions (FEAT_TLBIOS, FEAT_TLBIRANGE)">;
+
+def FeatureFlagM : Extension<
+ "flagm", "FlagM",
+ "Enable v8.4-A Flag Manipulation Instructions (FEAT_FlagM)", []>;
+
+def FeatureRCPC_IMMO : SubtargetFeature<"rcpc-immo", "HasRCPC_IMMO", "true",
+ "Enable v8.4-A RCPC instructions with Immediate Offsets (FEAT_LRCPC2)",
+ [FeatureRCPC]>;
+
+//===----------------------------------------------------------------------===//
+// Armv8.5 Architecture Extensions
+//===----------------------------------------------------------------------===//
+
+def FeatureAltFPCmp : SubtargetFeature<"altnzcv", "HasAlternativeNZCV", "true",
+ "Enable alternative NZCV format for floating point comparisons (FEAT_FlagM2)">;
+
+def FeatureFRInt3264 : SubtargetFeature<"fptoint", "HasFRInt3264", "true",
+ "Enable FRInt[32|64][Z|X] instructions that round a floating-point number to "
+ "an integer (in FP format) forcing it to fit into a 32- or 64-bit int (FEAT_FRINTTS)" >;
+
+def FeatureSB : Extension<"sb", "SB",
+ "Enable v8.5 Speculation Barrier (FEAT_SB)", []>;
+
+def FeatureSSBS : Extension<"ssbs", "SSBS",
+ "Enable Speculative Store Bypass Safe bit (FEAT_SSBS, FEAT_SSBS2)", []>;
+
+def FeaturePredRes : Extension<"predres", "PredRes",
+ "Enable v8.5a execution and data prediction invalidation instructions (FEAT_SPECRES)", []>;
+
+def FeatureCacheDeepPersist : SubtargetFeature<"ccdp", "CCDP", "true",
+ "Enable v8.5 Cache Clean to Point of Deep Persistence (FEAT_DPB2)" >;
+
+def FeatureBranchTargetId : SubtargetFeature<"bti", "BTI", "true",
+ "Enable Branch Target Identification (FEAT_BTI)">;
+
+let ArchExtKindSpelling = "AEK_RAND", MArchName = "rng" in
+def FeatureRandGen : Extension<"rand", "RandGen",
+ "Enable Random Number generation instructions (FEAT_RNG)", []>;
+
+// NOTE: "memtag" means FEAT_MTE + FEAT_MTE2 for -march or
+// __attribute((target(...))), but only FEAT_MTE for FMV.
+let MArchName = "memtag" in
+def FeatureMTE : Extension<"mte", "MTE",
+ "Enable Memory Tagging Extension (FEAT_MTE, FEAT_MTE2)", []>;
+
+//===----------------------------------------------------------------------===//
+// Armv8.6 Architecture Extensions
+//===----------------------------------------------------------------------===//
def FeatureBF16 : Extension<"bf16", "BF16",
- "Enable BFloat16 Extension (FEAT_BF16)", [],
- "FEAT_BF16", "+bf16", 280>;
+ "Enable BFloat16 Extension (FEAT_BF16)">;
-def FeatureNoSVEFPLD1R : SubtargetFeature<"no-sve-fp-ld1r",
- "NoSVEFPLD1R", "true", "Avoid using LD1RX instructions for FP">;
+def FeatureAMVS : SubtargetFeature<
+ "amvs", "HasAMVS", "true",
+ "Enable v8.6-A Activity Monitors Virtualization support (FEAT_AMUv1p1)",
+ [FeatureAM]>;
+
+def FeatureFineGrainedTraps : SubtargetFeature<"fgt", "HasFineGrainedTraps",
+ "true", "Enable fine grained virtualization traps extension (FEAT_FGT)">;
+
+def FeatureEnhancedCounterVirtualization :
+ SubtargetFeature<"ecv", "HasEnhancedCounterVirtualization",
+ "true", "Enable enhanced counter virtualization extension (FEAT_ECV)">;
+
+//===----------------------------------------------------------------------===//
+// Armv8.7 Architecture Extensions
+//===----------------------------------------------------------------------===//
+
+def FeatureXS : SubtargetFeature<"xs", "HasXS",
+ "true", "Enable Armv8.7-A limited-TLB-maintenance instruction (FEAT_XS)">;
+
+def FeatureWFxT : SubtargetFeature<"wfxt", "WFxT", "true",
+ "Enable Armv8.7-A WFET and WFIT instruction (FEAT_WFxT)">;
+
+def FeatureHCX : SubtargetFeature<
+ "hcx", "HasHCX", "true", "Enable Armv8.7-A HCRX_EL2 system register (FEAT_HCX)">;
+
+def FeatureLS64 : Extension<"ls64", "LS64",
+ "Enable Armv8.7-A LD64B/ST64B Accelerator Extension (FEAT_LS64, FEAT_LS64_V, FEAT_LS64_ACCDATA)", []>;
+
+def FeatureSPE_EEF : SubtargetFeature<"spe-eef", "HasSPE_EEF",
+ "true", "Enable extra register in the Statistical Profiling Extension (FEAT_SPEv1p2)">;
+
+//===----------------------------------------------------------------------===//
+// Armv8.8 Architecture Extensions
+//===----------------------------------------------------------------------===//
+
+def FeatureHBC : Extension<"hbc", "HBC",
+ "Enable Armv8.8-A Hinted Conditional Branches Extension (FEAT_HBC)">;
+
+def FeatureMOPS : Extension<"mops", "MOPS",
+ "Enable Armv8.8-A memcpy and memset acceleration instructions (FEAT_MOPS)", []>;
+
+def FeatureNMI : SubtargetFeature<"nmi", "HasNMI",
+ "true", "Enable Armv8.8-A Non-maskable Interrupts (FEAT_NMI, FEAT_GICv3_NMI)">;
+
+//===----------------------------------------------------------------------===//
+// Armv8.9 Architecture Extensions
+//===----------------------------------------------------------------------===//
+
+def FeatureRASv2 : Extension<"rasv2", "RASv2",
+ "Enable ARMv8.9-A Reliability, Availability and Serviceability Extensions (FEAT_RASv2)",
+ [FeatureRAS]>;
+
+def FeatureCSSC : Extension<"cssc", "CSSC",
+ "Enable Common Short Sequence Compression (CSSC) instructions (FEAT_CSSC)">;
+
+def FeatureCLRBHB : SubtargetFeature<"clrbhb", "HasCLRBHB",
+ "true", "Enable Clear BHB instruction (FEAT_CLRBHB)">;
+
+def FeaturePRFM_SLC : SubtargetFeature<"prfm-slc-target", "HasPRFM_SLC",
+ "true", "Enable SLC target for PRFM instruction">;
+
+let MArchName = "predres2" in
+def FeatureSPECRES2 : Extension<"specres2", "SPECRES2",
+ "Enable Speculation Restriction Instruction (FEAT_SPECRES2)",
+ [FeaturePredRes]>;
+
+def FeatureRCPC3 : Extension<"rcpc3", "RCPC3",
+ "Enable Armv8.9-A RCPC instructions for A64 and Advanced SIMD and floating-point instruction set (FEAT_LRCPC3)",
+ [FeatureRCPC_IMMO]>;
+
+def FeatureTHE : Extension<"the", "THE",
+ "Enable Armv8.9-A Translation Hardening Extension (FEAT_THE)">;
+
+//===----------------------------------------------------------------------===//
+// Armv9.0 Architecture Extensions
+//===----------------------------------------------------------------------===//
+
+def FeatureUseScalarIncVL : SubtargetFeature<"use-scalar-inc-vl",
+ "UseScalarIncVL", "true", "Prefer inc/dec over add+cnt">;
def FeatureSVE2 : Extension<"sve2", "SVE2",
"Enable Scalable Vector Extension 2 (SVE2) instructions (FEAT_SVE2)",
- [FeatureSVE, FeatureUseScalarIncVL],
- "FEAT_SVE2", "+sve2,+sve,+fullfp16,+fp-armv8,+neon", 370>;
+ [FeatureSVE, FeatureUseScalarIncVL]>;
def FeatureSVE2AES : Extension<"sve2-aes", "SVE2AES",
"Enable AES SVE2 instructions (FEAT_SVE_AES, FEAT_SVE_PMULL128)",
- [FeatureSVE2, FeatureAES],
- "FEAT_SVE_AES", "+sve2,+sve,+sve2-aes,+fullfp16,+fp-armv8,+neon", 380>;
+ [FeatureSVE2, FeatureAES]>;
def FeatureSVE2SM4 : Extension<"sve2-sm4", "SVE2SM4",
- "Enable SM4 SVE2 instructions (FEAT_SVE_SM4)", [FeatureSVE2, FeatureSM4],
- "FEAT_SVE_SM4", "+sve2,+sve,+sve2-sm4,+fullfp16,+fp-armv8,+neon", 420>;
+ "Enable SM4 SVE2 instructions (FEAT_SVE_SM4)", [FeatureSVE2, FeatureSM4]>;
def FeatureSVE2SHA3 : Extension<"sve2-sha3", "SVE2SHA3",
- "Enable SHA3 SVE2 instructions (FEAT_SVE_SHA3)", [FeatureSVE2, FeatureSHA3],
- "FEAT_SVE_SHA3", "+sve2,+sve,+sve2-sha3,+fullfp16,+fp-armv8,+neon", 410>;
+ "Enable SHA3 SVE2 instructions (FEAT_SVE_SHA3)", [FeatureSVE2, FeatureSHA3]>;
def FeatureSVE2BitPerm : Extension<"sve2-bitperm", "SVE2BitPerm",
- "Enable bit permutation SVE2 instructions (FEAT_SVE_BitPerm)", [FeatureSVE2],
- "FEAT_SVE_BITPERM", "+sve2,+sve,+sve2-bitperm,+fullfp16,+fp-armv8,+neon", 400>;
+ "Enable bit permutation SVE2 instructions (FEAT_SVE_BitPerm)", [FeatureSVE2]>;
+
+def FeatureTRBE : SubtargetFeature<"trbe", "TRBE", "true",
+ "Enable Trace Buffer Extension (FEAT_TRBE)">;
+
+def FeatureETE : SubtargetFeature<"ete", "ETE", "true",
+ "Enable Embedded Trace Extension (FEAT_ETE)",
+ [FeatureTRBE]>;
+
+def FeatureTME : Extension<"tme", "TME",
+ "Enable Transactional Memory Extension (FEAT_TME)" >;
+
+//===----------------------------------------------------------------------===//
+// Armv9.1 Architecture Extensions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Armv9.2 Architecture Extensions
+//===----------------------------------------------------------------------===//
+
+def FeatureBRBE : Extension<"brbe", "BRBE",
+ "Enable Branch Record Buffer Extension (FEAT_BRBE)">;
+
+def FeatureRME : SubtargetFeature<"rme", "HasRME",
+ "true", "Enable Realm Management Extension (FEAT_RME)">;
+
+def FeatureSME : Extension<"sme", "SME",
+ "Enable Scalable Matrix Extension (SME) (FEAT_SME)", [FeatureBF16, FeatureUseScalarIncVL]>;
+
+def FeatureSMEF64F64 : Extension<"sme-f64f64", "SMEF64F64",
+ "Enable Scalable Matrix Extension (SME) F64F64 instructions (FEAT_SME_F64F64)", [FeatureSME]>;
+
+def FeatureSMEI16I64 : Extension<"sme-i16i64", "SMEI16I64",
+ "Enable Scalable Matrix Extension (SME) I16I64 instructions (FEAT_SME_I16I64)", [FeatureSME]>;
+
+def FeatureSMEFA64 : Extension<"sme-fa64", "SMEFA64",
+ "Enable the full A64 instruction set in streaming SVE mode (FEAT_SME_FA64)", [FeatureSME, FeatureSVE2]>;
+
+//===----------------------------------------------------------------------===//
+// Armv9.3 Architecture Extensions
+//===----------------------------------------------------------------------===//
+
+def FeatureSME2 : Extension<"sme2", "SME2",
+ "Enable Scalable Matrix Extension 2 (SME2) instructions", [FeatureSME]>;
+
+def FeatureMEC : SubtargetFeature<"mec", "HasMEC",
+ "true", "Enable Memory Encryption Contexts Extension", [FeatureRME]>;
+
+//===----------------------------------------------------------------------===//
+// Armv9.4 Architecture Extensions
+//===----------------------------------------------------------------------===//
-let FMVDependencies = "+sve2p1,+sve2,+sve,+fullfp16,+fp-armv8,+neon" in
def FeatureSVE2p1: Extension<"sve2p1", "SVE2p1",
"Enable Scalable Vector Extension 2.1 instructions", [FeatureSVE2]>;
def FeatureB16B16 : Extension<"b16b16", "B16B16",
"Enable SVE2.1 or SME2.1 non-widening BFloat16 to BFloat16 instructions (FEAT_B16B16)", [FeatureBF16]>;
+def FeatureSMEF16F16 : Extension<"sme-f16f16", "SMEF16F16",
+ "Enable SME non-widening Float16 instructions (FEAT_SME_F16F16)", [FeatureSME2]>;
+
+def FeatureSME2p1 : Extension<"sme2p1", "SME2p1",
+ "Enable Scalable Matrix Extension 2.1 (FEAT_SME2p1) instructions", [FeatureSME2]>;
+
+def FeatureCHK : SubtargetFeature<"chk", "HasCHK",
+ "true", "Enable Armv8.0-A Check Feature Status Extension (FEAT_CHK)">;
+
+def FeatureGCS : Extension<"gcs", "GCS",
+ "Enable Armv9.4-A Guarded Call Stack Extension", [FeatureCHK]>;
+
+def FeatureITE : Extension<"ite", "ITE",
+ "Enable Armv9.4-A Instrumentation Extension FEAT_ITE", [FeatureETE,
+ FeatureTRBE]>;
+
+def FeatureLSE128 : Extension<"lse128", "LSE128",
+ "Enable Armv9.4-A 128-bit Atomic Instructions (FEAT_LSE128)",
+ [FeatureLSE]>;
+
+// FEAT_D128, FEAT_LVA3, FEAT_SYSREG128, and FEAT_SYSINSTR128 are mutually implicit.
+// Therefore group them all under a single feature flag, d128:
+def FeatureD128 : Extension<"d128", "D128",
+ "Enable Armv9.4-A 128-bit Page Table Descriptors, System Registers "
+ "and Instructions (FEAT_D128, FEAT_LVA3, FEAT_SYSREG128, FEAT_SYSINSTR128)",
+ [FeatureLSE128]>;
+
+//===----------------------------------------------------------------------===//
+// Armv9.5 Architecture Extensions
+//===----------------------------------------------------------------------===//
+
+def FeatureFAMINMAX: Extension<"faminmax", "FAMINMAX",
+ "Enable FAMIN and FAMAX instructions (FEAT_FAMINMAX)">;
+
+def FeatureLUT: Extension<"lut", "LUT",
+ "Enable Lookup Table instructions (FEAT_LUT)">;
+
+def FeatureFP8 : Extension<"fp8", "FP8",
+ "Enable FP8 instructions (FEAT_FP8)", [FeatureFAMINMAX, FeatureLUT, FeatureBF16]>;
+
+def FeatureFP8FMA : Extension<"fp8fma", "FP8FMA",
+ "Enable fp8 multiply-add instructions (FEAT_FP8FMA)", [FeatureFP8]>;
+
+def FeatureSSVE_FP8FMA : Extension<"ssve-fp8fma", "SSVE_FP8FMA",
+ "Enable SVE2 fp8 multiply-add instructions (FEAT_SSVE_FP8FMA)", [FeatureSME2, FeatureFP8]>;
+
+def FeatureFP8DOT4: Extension<"fp8dot4", "FP8DOT4",
+ "Enable fp8 4-way dot instructions (FEAT_FP8DOT4)", [FeatureFP8FMA]>;
+
+def FeatureFP8DOT2: Extension<"fp8dot2", "FP8DOT2",
+ "Enable fp8 2-way dot instructions (FEAT_FP8DOT2)", [FeatureFP8DOT4]>;
+
+def FeatureSSVE_FP8DOT4 : Extension<"ssve-fp8dot4", "SSVE_FP8DOT4",
+ "Enable SVE2 fp8 4-way dot product instructions (FEAT_SSVE_FP8DOT4)", [FeatureSSVE_FP8FMA]>;
+
+def FeatureSSVE_FP8DOT2 : Extension<"ssve-fp8dot2", "SSVE_FP8DOT2",
+ "Enable SVE2 fp8 2-way dot product instructions (FEAT_SSVE_FP8DOT2)", [FeatureSSVE_FP8DOT4]>;
+
+def FeatureSME_LUTv2 : Extension<"sme-lutv2", "SME_LUTv2",
+ "Enable Scalable Matrix Extension (SME) LUTv2 instructions (FEAT_SME_LUTv2)">;
+
+def FeatureSMEF8F32 : Extension<"sme-f8f32", "SMEF8F32",
+ "Enable Scalable Matrix Extension (SME) F8F32 instructions (FEAT_SME_F8F32)", [FeatureSME2, FeatureFP8]>;
+
+def FeatureSMEF8F16 : Extension<"sme-f8f16", "SMEF8F16",
+ "Enable Scalable Matrix Extension (SME) F8F16 instructions(FEAT_SME_F8F16)", [FeatureSMEF8F32]>;
+
+def FeatureCPA : Extension<"cpa", "CPA",
+ "Enable Armv9.5-A Checked Pointer Arithmetic (FEAT_CPA)">;
+
+def FeaturePAuthLR : Extension<"pauth-lr", "PAuthLR",
+ "Enable Armv9.5-A PAC enhancements (FEAT_PAuth_LR)">;
+
+def FeatureTLBIW : Extension<"tlbiw", "TLBIW",
+ "Enable ARMv9.5-A TLBI VMALL for Dirty State (FEAT_TLBIW)">;
+
+//===----------------------------------------------------------------------===//
+// Other Features
+//===----------------------------------------------------------------------===//
+
+def FeatureOutlineAtomics : SubtargetFeature<"outline-atomics", "OutlineAtomics", "true",
+ "Enable out of line atomics to support LSE instructions">;
+
+def FeatureFMV : SubtargetFeature<"fmv", "HasFMV", "true",
+ "Enable Function Multi Versioning support.">;
+
+// This flag is currently still labeled as Experimental, but when fully
+// implemented this should tell the compiler to use the zeroing pseudos to
+// benefit from the reverse instructions (e.g. SUB vs SUBR) if the inactive
+// lanes are known to be zero. The pseudos will then be expanded using the
+// MOVPRFX instruction to zero the inactive lanes. This feature should only be
+// enabled if MOVPRFX instructions are known to merge with the destructive
+// operations they prefix.
+//
+// This feature could similarly be extended to support cheap merging of _any_
+// value into the inactive lanes using the MOVPRFX instruction that uses
+// merging-predication.
+def FeatureExperimentalZeroingPseudos
+ : SubtargetFeature<"use-experimental-zeroing-pseudos",
+ "UseExperimentalZeroingPseudos", "true",
+ "Hint to the compiler that the MOVPRFX instruction is "
+ "merged with destructive operations",
+ []>;
+
+def FeatureNoSVEFPLD1R : SubtargetFeature<"no-sve-fp-ld1r",
+ "NoSVEFPLD1R", "true", "Avoid using LD1RX instructions for FP">;
+
def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
"Has zero-cycle register moves">;
@@ -409,85 +669,10 @@ def FeatureForce32BitJumpTables
: SubtargetFeature<"force-32bit-jump-tables", "Force32BitJumpTables", "true",
"Force jump table entries to be 32-bits wide except at MinSize">;
-def FeatureRCPC : Extension<"rcpc", "RCPC",
- "Enable support for RCPC extension (FEAT_LRCPC)", [],
- "FEAT_RCPC", "+rcpc", 230>;
-
def FeatureUseRSqrt : SubtargetFeature<
"use-reciprocal-square-root", "UseRSqrt", "true",
"Use the reciprocal square root approximation">;
-def FeatureDotProd : Extension<
- "dotprod", "DotProd",
- "Enable dot product support (FEAT_DotProd)", [FeatureNEON],
- "FEAT_DOTPROD", "+dotprod,+fp-armv8,+neon", 104>;
-
-def FeaturePAuth : Extension<
- "pauth", "PAuth",
- "Enable v8.3-A Pointer Authentication extension (FEAT_PAuth)">;
-
-let ArchExtKindSpelling = "AEK_JSCVT", MArchName = "jscvt" in
-def FeatureJS : Extension<
- "jsconv", "JS",
- "Enable v8.3-A JavaScript FP conversion instructions (FEAT_JSCVT)",
- [FeatureFPARMv8],
- "FEAT_JSCVT", "+fp-armv8,+neon,+jsconv", 210>;
-
-def FeatureCCIDX : SubtargetFeature<
- "ccidx", "HasCCIDX", "true",
- "Enable v8.3-A Extend of the CCSIDR number of sets (FEAT_CCIDX)">;
-
-let ArchExtKindSpelling = "AEK_FCMA", MArchName = "fcma" in
-def FeatureComplxNum : Extension<
- "complxnum", "ComplxNum",
- "Enable v8.3-A Floating-point complex number support (FEAT_FCMA)",
- [FeatureNEON],
- "FEAT_FCMA", "+fp-armv8,+neon,+complxnum", 220>;
-
-def FeatureNV : SubtargetFeature<
- "nv", "HasNV", "true",
- "Enable v8.4-A Nested Virtualization Enchancement (FEAT_NV, FEAT_NV2)">;
-
-def FeatureMPAM : SubtargetFeature<
- "mpam", "HasMPAM", "true",
- "Enable v8.4-A Memory system Partitioning and Monitoring extension (FEAT_MPAM)">;
-
-def FeatureDIT : Extension<
- "dit", "DIT",
- "Enable v8.4-A Data Independent Timing instructions (FEAT_DIT)", [],
- "FEAT_DIT", "+dit", 180>;
-
-def FeatureTRACEV8_4 : SubtargetFeature<
- "tracev8.4", "HasTRACEV8_4", "true",
- "Enable v8.4-A Trace extension (FEAT_TRF)">;
-
-def FeatureAM : SubtargetFeature<
- "am", "HasAM", "true",
- "Enable v8.4-A Activity Monitors extension (FEAT_AMUv1)">;
-
-def FeatureAMVS : SubtargetFeature<
- "amvs", "HasAMVS", "true",
- "Enable v8.6-A Activity Monitors Virtualization support (FEAT_AMUv1p1)",
- [FeatureAM]>;
-
-def FeatureSEL2 : SubtargetFeature<
- "sel2", "HasSEL2", "true",
- "Enable v8.4-A Secure Exception Level 2 extension (FEAT_SEL2)">;
-
-def FeatureTLB_RMI : SubtargetFeature<
- "tlb-rmi", "HasTLB_RMI", "true",
- "Enable v8.4-A TLB Range and Maintenance Instructions (FEAT_TLBIOS, FEAT_TLBIRANGE)">;
-
-def FeatureFlagM : Extension<
- "flagm", "FlagM",
- "Enable v8.4-A Flag Manipulation Instructions (FEAT_FlagM)", [],
- "FEAT_FLAGM", "+flagm", 20>;
-
-// 8.4 RCPC enchancements: LDAPR & STLR instructions with Immediate Offset
-def FeatureRCPC_IMMO : SubtargetFeature<"rcpc-immo", "HasRCPC_IMMO", "true",
- "Enable v8.4-A RCPC instructions with Immediate Offsets (FEAT_LRCPC2)",
- [FeatureRCPC]>;
-
def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates",
"NegativeImmediates", "false",
"Convert immediates and instructions "
@@ -518,186 +703,11 @@ def FeatureAggressiveFMA :
"true",
"Enable Aggressive FMA for floating-point.">;
-def FeatureAltFPCmp : SubtargetFeature<"altnzcv", "HasAlternativeNZCV", "true",
- "Enable alternative NZCV format for floating point comparisons (FEAT_FlagM2)">;
-
-def FeatureFRInt3264 : SubtargetFeature<"fptoint", "HasFRInt3264", "true",
- "Enable FRInt[32|64][Z|X] instructions that round a floating-point number to "
- "an integer (in FP format) forcing it to fit into a 32- or 64-bit int (FEAT_FRINTTS)" >;
-
-def FeatureSpecRestrict : SubtargetFeature<"specrestrict", "HasSpecRestrict",
- "true", "Enable architectural speculation restriction (FEAT_CSV2_2)">;
-
-def FeatureSB : Extension<"sb", "SB",
- "Enable v8.5 Speculation Barrier (FEAT_SB)", [],
- "FEAT_SB", "+sb", 470>;
-
-def FeatureSSBS : Extension<"ssbs", "SSBS",
- "Enable Speculative Store Bypass Safe bit (FEAT_SSBS, FEAT_SSBS2)", [],
- "FEAT_SSBS", "", 490>;
-
-def FeaturePredRes : Extension<"predres", "PredRes",
- "Enable v8.5a execution and data prediction invalidation instructions (FEAT_SPECRES)", [],
- "FEAT_PREDRES", "+predres", 480>;
-
-def FeatureCacheDeepPersist : SubtargetFeature<"ccdp", "CCDP", "true",
- "Enable v8.5 Cache Clean to Point of Deep Persistence (FEAT_DPB2)" >;
-
-let ArchExtKindSpelling = "AEK_NONE" in
-def FeatureBranchTargetId : Extension<"bti", "BTI",
- "Enable Branch Target Identification (FEAT_BTI)", [],
- "FEAT_BTI", "+bti", 510>;
-
-let ArchExtKindSpelling = "AEK_RAND", MArchName = "rng" in
-def FeatureRandGen : Extension<"rand", "RandGen",
- "Enable Random Number generation instructions (FEAT_RNG)", [],
- "FEAT_RNG", "+rand", 10>;
-
-// NOTE: "memtag" means FEAT_MTE + FEAT_MTE2 for -march or
-// __attribute((target(...))), but only FEAT_MTE for FMV.
-let MArchName = "memtag" in
-def FeatureMTE : Extension<"mte", "MTE",
- "Enable Memory Tagging Extension (FEAT_MTE, FEAT_MTE2)", [],
- "FEAT_MEMTAG", "", 440>;
-
-def FeatureTRBE : SubtargetFeature<"trbe", "TRBE", "true",
- "Enable Trace Buffer Extension (FEAT_TRBE)">;
-
-def FeatureETE : SubtargetFeature<"ete", "ETE", "true",
- "Enable Embedded Trace Extension (FEAT_ETE)",
- [FeatureTRBE]>;
-
-def FeatureTME : Extension<"tme", "TME",
- "Enable Transactional Memory Extension (FEAT_TME)" >;
-
def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals",
"AllowTaggedGlobals",
"true", "Use an instruction sequence for taking the address of a global "
"that allows a memory tag in the upper address bits">;
-let ArchExtKindSpelling = "AEK_I8MM" in
-def FeatureMatMulInt8 : Extension<"i8mm", "MatMulInt8",
- "Enable Matrix Multiply Int8 Extension (FEAT_I8MM)", [],
- "FEAT_I8MM", "+i8mm", 270>;
-
-let ArchExtKindSpelling = "AEK_F32MM" in
-def FeatureMatMulFP32 : Extension<"f32mm", "MatMulFP32",
- "Enable Matrix Multiply FP32 Extension (FEAT_F32MM)", [FeatureSVE],
- "FEAT_SVE_F32MM", "+sve,+f32mm,+fullfp16,+fp-armv8,+neon", 350>;
-
-let ArchExtKindSpelling = "AEK_F64MM" in
-def FeatureMatMulFP64 : Extension<"f64mm", "MatMulFP64",
- "Enable Matrix Multiply FP64 Extension (FEAT_F64MM)", [FeatureSVE],
- "FEAT_SVE_F64MM", "+sve,+f64mm,+fullfp16,+fp-armv8,+neon", 360>;
-
-def FeatureXS : SubtargetFeature<"xs", "HasXS",
- "true", "Enable Armv8.7-A limited-TLB-maintenance instruction (FEAT_XS)">;
-
-def FeatureWFxT : Extension<"wfxt", "WFxT",
- "Enable Armv8.7-A WFET and WFIT instruction (FEAT_WFxT)", [],
- "FEAT_WFXT", "+wfxt", 550>;
-
-def FeatureHCX : SubtargetFeature<
- "hcx", "HasHCX", "true", "Enable Armv8.7-A HCRX_EL2 system register (FEAT_HCX)">;
-
-def FeatureLS64 : Extension<"ls64", "LS64",
- "Enable Armv8.7-A LD64B/ST64B Accelerator Extension (FEAT_LS64, FEAT_LS64_V, FEAT_LS64_ACCDATA)", [],
- "FEAT_LS64", "", 520>;
-
-def FeatureHBC : Extension<"hbc", "HBC",
- "Enable Armv8.8-A Hinted Conditional Branches Extension (FEAT_HBC)">;
-
-def FeatureMOPS : Extension<"mops", "MOPS",
- "Enable Armv8.8-A memcpy and memset acceleration instructions (FEAT_MOPS)", [],
- "FEAT_MOPS", "+mops", 650>;
-
-def FeatureNMI : SubtargetFeature<"nmi", "HasNMI",
- "true", "Enable Armv8.8-A Non-maskable Interrupts (FEAT_NMI, FEAT_GICv3_NMI)">;
-
-def FeatureBRBE : Extension<"brbe", "BRBE",
- "Enable Branch Record Buffer Extension (FEAT_BRBE)">;
-
-def FeatureSPE_EEF : SubtargetFeature<"spe-eef", "HasSPE_EEF",
- "true", "Enable extra register in the Statistical Profiling Extension (FEAT_SPEv1p2)">;
-
-def FeatureFineGrainedTraps : SubtargetFeature<"fgt", "HasFineGrainedTraps",
- "true", "Enable fine grained virtualization traps extension (FEAT_FGT)">;
-
-def FeatureEnhancedCounterVirtualization :
- SubtargetFeature<"ecv", "HasEnhancedCounterVirtualization",
- "true", "Enable enhanced counter virtualization extension (FEAT_ECV)">;
-
-def FeatureRME : SubtargetFeature<"rme", "HasRME",
- "true", "Enable Realm Management Extension (FEAT_RME)">;
-
-def FeatureSME : Extension<"sme", "SME",
- "Enable Scalable Matrix Extension (SME) (FEAT_SME)", [FeatureBF16, FeatureUseScalarIncVL],
- "FEAT_SME", "+sme,+bf16", 430>;
-
-def FeatureSMEF64F64 : Extension<"sme-f64f64", "SMEF64F64",
- "Enable Scalable Matrix Extension (SME) F64F64 instructions (FEAT_SME_F64F64)", [FeatureSME],
- "FEAT_SME_F64", "+sme,+sme-f64f64,+bf16", 560>;
-
-def FeatureSMEI16I64 : Extension<"sme-i16i64", "SMEI16I64",
- "Enable Scalable Matrix Extension (SME) I16I64 instructions (FEAT_SME_I16I64)", [FeatureSME],
- "FEAT_SME_I64", "+sme,+sme-i16i64,+bf16", 570>;
-
-def FeatureSMEFA64 : Extension<"sme-fa64", "SMEFA64",
- "Enable the full A64 instruction set in streaming SVE mode (FEAT_SME_FA64)", [FeatureSME, FeatureSVE2]>;
-
-def FeatureSME2 : Extension<"sme2", "SME2",
- "Enable Scalable Matrix Extension 2 (SME2) instructions", [FeatureSME],
- "FEAT_SME2", "+sme2,+sme,+bf16", 580>;
-
-let FMVDependencies = "+sme2,+sme-f16f16" in
-def FeatureSMEF16F16 : Extension<"sme-f16f16", "SMEF16F16",
- "Enable SME non-widening Float16 instructions (FEAT_SME_F16F16)", [FeatureSME2]>;
-
-let FMVDependencies = "+sme2p1,+sme2,+sme,+bf16" in
-def FeatureSME2p1 : Extension<"sme2p1", "SME2p1",
- "Enable Scalable Matrix Extension 2.1 (FEAT_SME2p1) instructions", [FeatureSME2]>;
-
-def FeatureFAMINMAX: Extension<"faminmax", "FAMINMAX",
- "Enable FAMIN and FAMAX instructions (FEAT_FAMINMAX)">;
-
-def FeatureLUT: Extension<"lut", "LUT",
- "Enable Lookup Table instructions (FEAT_LUT)">;
-
-def FeatureFP8 : Extension<"fp8", "FP8",
- "Enable FP8 instructions (FEAT_FP8)", [FeatureFAMINMAX, FeatureLUT, FeatureBF16]>;
-
-def FeatureFP8FMA : Extension<"fp8fma", "FP8FMA",
- "Enable fp8 multiply-add instructions (FEAT_FP8FMA)", [FeatureFP8]>;
-
-let FMVDependencies = "+sme2" in
-def FeatureSSVE_FP8FMA : Extension<"ssve-fp8fma", "SSVE_FP8FMA",
- "Enable SVE2 fp8 multiply-add instructions (FEAT_SSVE_FP8FMA)", [FeatureSME2, FeatureFP8]>;
-
-def FeatureFP8DOT4: Extension<"fp8dot4", "FP8DOT4",
- "Enable fp8 4-way dot instructions (FEAT_FP8DOT4)", [FeatureFP8FMA]>;
-
-def FeatureFP8DOT2: Extension<"fp8dot2", "FP8DOT2",
- "Enable fp8 2-way dot instructions (FEAT_FP8DOT2)", [FeatureFP8DOT4]>;
-
-let FMVDependencies = "+sme2" in
-def FeatureSSVE_FP8DOT4 : Extension<"ssve-fp8dot4", "SSVE_FP8DOT4",
- "Enable SVE2 fp8 4-way dot product instructions (FEAT_SSVE_FP8DOT4)", [FeatureSSVE_FP8FMA]>;
-
-let FMVDependencies = "+sme2" in
-def FeatureSSVE_FP8DOT2 : Extension<"ssve-fp8dot2", "SSVE_FP8DOT2",
- "Enable SVE2 fp8 2-way dot product instructions (FEAT_SSVE_FP8DOT2)", [FeatureSSVE_FP8DOT4]>;
-
-def FeatureSME_LUTv2 : Extension<"sme-lutv2", "SME_LUTv2",
- "Enable Scalable Matrix Extension (SME) LUTv2 instructions (FEAT_SME_LUTv2)">;
-
-let FMVDependencies = "+sme2,+fp8" in
-def FeatureSMEF8F32 : Extension<"sme-f8f32", "SMEF8F32",
- "Enable Scalable Matrix Extension (SME) F8F32 instructions (FEAT_SME_F8F32)", [FeatureSME2, FeatureFP8]>;
-
-let FMVDependencies = "+fp8,+sme2" in
-def FeatureSMEF8F16 : Extension<"sme-f8f16", "SMEF8F16",
- "Enable Scalable Matrix Extension (SME) F8F16 instructions(FEAT_SME_F8F16)", [FeatureSMEF8F32]>;
-
def FeatureAppleA7SysReg : SubtargetFeature<"apple-a7-sysreg", "HasAppleA7SysReg", "true",
"Apple A7 (the CPU formerly known as Cyclone)">;
@@ -707,9 +717,6 @@ def FeatureEL2VMSA : SubtargetFeature<"el2vmsa", "HasEL2VMSA", "true",
def FeatureEL3 : SubtargetFeature<"el3", "HasEL3", "true",
"Enable Exception Level 3">;
-def FeatureCSSC : Extension<"cssc", "CSSC",
- "Enable Common Short Sequence Compression (CSSC) instructions (FEAT_CSSC)">;
-
def FeatureFixCortexA53_835769 : SubtargetFeature<"fix-cortex-a53-835769",
"FixCortexA53_835769", "true", "Mitigate Cortex-A53 Erratum 835769">;
@@ -718,49 +725,6 @@ def FeatureNoBTIAtReturnTwice : SubtargetFeature<"no-bti-at-return-twice",
"Don't place a BTI instruction "
"after a return-twice">;
-def FeatureCHK : SubtargetFeature<"chk", "HasCHK",
- "true", "Enable Armv8.0-A Check Feature Status Extension (FEAT_CHK)">;
-
-def FeatureGCS : Extension<"gcs", "GCS",
- "Enable Armv9.4-A Guarded Call Stack Extension", [FeatureCHK]>;
-
-def FeatureCLRBHB : SubtargetFeature<"clrbhb", "HasCLRBHB",
- "true", "Enable Clear BHB instruction (FEAT_CLRBHB)">;
-
-def FeaturePRFM_SLC : SubtargetFeature<"prfm-slc-target", "HasPRFM_SLC",
- "true", "Enable SLC target for PRFM instruction">;
-
-let MArchName = "predres2" in
-def FeatureSPECRES2 : Extension<"specres2", "SPECRES2",
- "Enable Speculation Restriction Instruction (FEAT_SPECRES2)",
- [FeaturePredRes]>;
-
-def FeatureMEC : SubtargetFeature<"mec", "HasMEC",
- "true", "Enable Memory Encryption Contexts Extension", [FeatureRME]>;
-
-def FeatureITE : Extension<"ite", "ITE",
- "Enable Armv9.4-A Instrumentation Extension FEAT_ITE", [FeatureETE,
- FeatureTRBE]>;
-
-def FeatureRCPC3 : Extension<"rcpc3", "RCPC3",
- "Enable Armv8.9-A RCPC instructions for A64 and Advanced SIMD and floating-point instruction set (FEAT_LRCPC3)",
- [FeatureRCPC_IMMO],
- "FEAT_RCPC3", "+rcpc,+rcpc3", 241>;
-
-def FeatureTHE : Extension<"the", "THE",
- "Enable Armv8.9-A Translation Hardening Extension (FEAT_THE)">;
-
-def FeatureLSE128 : Extension<"lse128", "LSE128",
- "Enable Armv9.4-A 128-bit Atomic Instructions (FEAT_LSE128)",
- [FeatureLSE]>;
-
-// FEAT_D128, FEAT_LVA3, FEAT_SYSREG128, and FEAT_SYSINSTR128 are mutually implicit.
-// Therefore group them all under a single feature flag, d128:
-def FeatureD128 : Extension<"d128", "D128",
- "Enable Armv9.4-A 128-bit Page Table Descriptors, System Registers "
- "and Instructions (FEAT_D128, FEAT_LVA3, FEAT_SYSREG128, FEAT_SYSINSTR128)",
- [FeatureLSE128]>;
-
def FeatureDisableLdp : SubtargetFeature<"disable-ldp", "HasDisableLdp",
"true", "Do not emit ldp">;
@@ -773,18 +737,6 @@ def FeatureLdpAlignedOnly : SubtargetFeature<"ldp-aligned-only", "HasLdpAlignedO
def FeatureStpAlignedOnly : SubtargetFeature<"stp-aligned-only", "HasStpAlignedOnly",
"true", "In order to emit stp, first check if the store will be aligned to 2 * element_size">;
-// AArch64 2023 Architecture Extensions (v9.5-A)
-
-def FeatureCPA : Extension<"cpa", "CPA",
- "Enable Armv9.5-A Checked Pointer Arithmetic (FEAT_CPA)">;
-
-def FeaturePAuthLR : Extension<"pauth-lr", "PAuthLR",
- "Enable Armv9.5-A PAC enhancements (FEAT_PAuth_LR)">;
-
-def FeatureTLBIW : Extension<"tlbiw", "TLBIW",
- "Enable ARMv9.5-A TLBI VMALL for Dirty State (FEAT_TLBIW)">;
-
-
//===----------------------------------------------------------------------===//
// Architectures.
//
@@ -847,7 +799,7 @@ def HasV8_9aOps : Architecture64<8, 9, "a", "v8.9a",
!listconcat(HasV8_8aOps.DefaultExts, [FeatureSPECRES2, FeatureCSSC,
FeatureRASv2])>;
def HasV9_0aOps : Architecture64<9, 0, "a", "v9a",
- [HasV8_5aOps, FeatureMEC, FeatureSVE2],
+ [HasV8_5aOps, FeatureMEC],
!listconcat(HasV8_5aOps.DefaultExts, [FeatureFullFP16, FeatureSVE,
FeatureSVE2])>;
def HasV9_1aOps : Architecture64<9, 1, "a", "v9.1a",
@@ -914,4 +866,4 @@ def FeatureHardenSlsNoComdat : SubtargetFeature<"harden-sls-nocomdat",
// Only intended to be used by disassemblers.
def FeatureAll
- : SubtargetFeature<"all", "IsAll", "true", "Enable all instructions", []>;
+ : SubtargetFeature<"all", "IsAll", "true", "Enable all instructions">;
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index cd532671f501..8216fa7db822 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -321,7 +321,7 @@ bool AArch64FrameLowering::homogeneousPrologEpilog(
return false;
auto *AFI = MF.getInfo<AArch64FunctionInfo>();
- if (AFI->hasSwiftAsyncContext())
+ if (AFI->hasSwiftAsyncContext() || AFI->hasStreamingModeChanges())
return false;
// If there are an odd number of GPRs before LR and FP in the CSRs list,
@@ -431,8 +431,16 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
uint64_t NumBytes = AFI->getLocalStackSize();
+ // If neither NEON or SVE are available, a COPY from one Q-reg to
+ // another requires a spill -> reload sequence. We can do that
+ // using a pre-decrementing store/post-decrementing load, but
+ // if we do so, we can't use the Red Zone.
+ bool LowerQRegCopyThroughMem = Subtarget.hasFPARMv8() &&
+ !Subtarget.isNeonAvailable() &&
+ !Subtarget.hasSVE();
+
return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize ||
- getSVEStackSize(MF));
+ getSVEStackSize(MF) || LowerQRegCopyThroughMem);
}
/// hasFP - Return true if the specified function should have a dedicated frame
@@ -550,6 +558,10 @@ void AArch64FrameLowering::emitCalleeSavedGPRLocations(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ SMEAttrs Attrs(MF.getFunction());
+ bool LocallyStreaming =
+ Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface();
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
if (CSI.empty())
@@ -561,14 +573,22 @@ void AArch64FrameLowering::emitCalleeSavedGPRLocations(
DebugLoc DL = MBB.findDebugLoc(MBBI);
for (const auto &Info : CSI) {
- if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)
+ unsigned FrameIdx = Info.getFrameIdx();
+ if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector)
continue;
assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
- unsigned DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true);
+ int64_t DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true);
+ int64_t Offset = MFI.getObjectOffset(FrameIdx) - getOffsetOfLocalArea();
+
+ // The location of VG will be emitted before each streaming-mode change in
+ // the function. Only locally-streaming functions require emitting the
+ // non-streaming VG location here.
+ if ((LocallyStreaming && FrameIdx == AFI->getStreamingVGIdx()) ||
+ (!LocallyStreaming &&
+ DwarfReg == TRI.getDwarfRegNum(AArch64::VG, true)))
+ continue;
- int64_t Offset =
- MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea();
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
@@ -691,6 +711,9 @@ static void emitCalleeSavedRestores(MachineBasicBlock &MBB,
!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
continue;
+ if (!Info.isRestored())
+ continue;
+
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore(
nullptr, TRI.getDwarfRegNum(Info.getReg(), true)));
BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
@@ -1013,7 +1036,10 @@ static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
MachineFunction *MF = MBB->getParent();
// If MBB is an entry block, use X9 as the scratch register
- if (&MF->front() == MBB)
+ // preserve_none functions may be using X9 to pass arguments,
+ // so prefer to pick an available register below.
+ if (&MF->front() == MBB &&
+ MF->getFunction().getCallingConv() != CallingConv::PreserveNone)
return AArch64::X9;
const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
@@ -1334,6 +1360,32 @@ static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
}
+bool requiresGetVGCall(MachineFunction &MF) {
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ return AFI->hasStreamingModeChanges() &&
+ !MF.getSubtarget<AArch64Subtarget>().hasSVE();
+}
+
+bool isVGInstruction(MachineBasicBlock::iterator MBBI) {
+ unsigned Opc = MBBI->getOpcode();
+ if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI ||
+ Opc == AArch64::UBFMXri)
+ return true;
+
+ if (requiresGetVGCall(*MBBI->getMF())) {
+ if (Opc == AArch64::ORRXrr)
+ return true;
+
+ if (Opc == AArch64::BL) {
+ auto Op1 = MBBI->getOperand(0);
+ return Op1.isSymbol() &&
+ (StringRef(Op1.getSymbolName()) == "__arm_get_current_vg");
+ }
+ }
+
+ return false;
+}
+
// Convert callee-save register save/restore instruction to do stack pointer
// decrement/increment to allocate/deallocate the callee-save stack area by
// converting store/load to use pre/post increment version.
@@ -1344,6 +1396,17 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup,
int CFAOffset = 0) {
unsigned NewOpc;
+
+ // If the function contains streaming mode changes, we expect instructions
+ // to calculate the value of VG before spilling. For locally-streaming
+ // functions, we need to do this for both the streaming and non-streaming
+ // vector length. Move past these instructions if necessary.
+ MachineFunction &MF = *MBB.getParent();
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ if (AFI->hasStreamingModeChanges())
+ while (isVGInstruction(MBBI))
+ ++MBBI;
+
switch (MBBI->getOpcode()) {
default:
llvm_unreachable("Unexpected callee-save save/restore opcode!");
@@ -1400,7 +1463,6 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
// If the first store isn't right where we want SP then we can't fold the
// update in so create a normal arithmetic instruction instead.
- MachineFunction &MF = *MBB.getParent();
if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
@@ -1652,6 +1714,12 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
LiveRegs.removeReg(AArch64::X19);
LiveRegs.removeReg(AArch64::FP);
LiveRegs.removeReg(AArch64::LR);
+
+ // X0 will be clobbered by a call to __arm_get_current_vg in the prologue.
+ // This is necessary to spill VG if required where SVE is unavailable, but
+ // X0 is preserved around this call.
+ if (requiresGetVGCall(MF))
+ LiveRegs.removeReg(AArch64::X0);
}
auto VerifyClobberOnExit = make_scope_exit([&]() {
@@ -1838,6 +1906,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// pointer bump above.
while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
!IsSVECalleeSave(MBBI)) {
+ // Move past instructions generated to calculate VG
+ if (AFI->hasStreamingModeChanges())
+ while (isVGInstruction(MBBI))
+ ++MBBI;
+
if (CombineSPBump)
fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
NeedsWinCFI, &HasWinCFI);
@@ -2760,7 +2833,7 @@ struct RegPairInfo {
unsigned Reg2 = AArch64::NoRegister;
int FrameIdx;
int Offset;
- enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
+ enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type;
RegPairInfo() = default;
@@ -2772,6 +2845,7 @@ struct RegPairInfo {
return 2;
case GPR:
case FPR64:
+ case VG:
return 8;
case ZPR:
case FPR128:
@@ -2847,6 +2921,8 @@ static void computeCalleeSaveRegisterPairs(
RPI.Type = RegPairInfo::ZPR;
else if (AArch64::PPRRegClass.contains(RPI.Reg1))
RPI.Type = RegPairInfo::PPR;
+ else if (RPI.Reg1 == AArch64::VG)
+ RPI.Type = RegPairInfo::VG;
else
llvm_unreachable("Unsupported register class.");
@@ -2879,6 +2955,8 @@ static void computeCalleeSaveRegisterPairs(
if (((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1))
RPI.Reg2 = NextReg;
break;
+ case RegPairInfo::VG:
+ break;
}
}
@@ -2995,6 +3073,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
bool NeedsWinCFI = needsWinCFI(MF);
DebugLoc DL;
SmallVector<RegPairInfo, 8> RegPairs;
@@ -3062,7 +3141,70 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
Size = 2;
Alignment = Align(2);
break;
+ case RegPairInfo::VG:
+ StrOpc = AArch64::STRXui;
+ Size = 8;
+ Alignment = Align(8);
+ break;
+ }
+
+ unsigned X0Scratch = AArch64::NoRegister;
+ if (Reg1 == AArch64::VG) {
+ // Find an available register to store value of VG to.
+ Reg1 = findScratchNonCalleeSaveRegister(&MBB);
+ assert(Reg1 != AArch64::NoRegister);
+ SMEAttrs Attrs(MF.getFunction());
+
+ if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface() &&
+ AFI->getStreamingVGIdx() == std::numeric_limits<int>::max()) {
+ // For locally-streaming functions, we need to store both the streaming
+ // & non-streaming VG. Spill the streaming value first.
+ BuildMI(MBB, MI, DL, TII.get(AArch64::RDSVLI_XI), Reg1)
+ .addImm(1)
+ .setMIFlag(MachineInstr::FrameSetup);
+ BuildMI(MBB, MI, DL, TII.get(AArch64::UBFMXri), Reg1)
+ .addReg(Reg1)
+ .addImm(3)
+ .addImm(63)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ AFI->setStreamingVGIdx(RPI.FrameIdx);
+ } else if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) {
+ BuildMI(MBB, MI, DL, TII.get(AArch64::CNTD_XPiI), Reg1)
+ .addImm(31)
+ .addImm(1)
+ .setMIFlag(MachineInstr::FrameSetup);
+ AFI->setVGIdx(RPI.FrameIdx);
+ } else {
+ const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
+ if (llvm::any_of(
+ MBB.liveins(),
+ [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
+ return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
+ AArch64::X0, LiveIn.PhysReg);
+ }))
+ X0Scratch = Reg1;
+
+ if (X0Scratch != AArch64::NoRegister)
+ BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), Reg1)
+ .addReg(AArch64::XZR)
+ .addReg(AArch64::X0, RegState::Undef)
+ .addReg(AArch64::X0, RegState::Implicit)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ const uint32_t *RegMask = TRI->getCallPreservedMask(
+ MF,
+ CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1);
+ BuildMI(MBB, MI, DL, TII.get(AArch64::BL))
+ .addExternalSymbol("__arm_get_current_vg")
+ .addRegMask(RegMask)
+ .addReg(AArch64::X0, RegState::ImplicitDefine)
+ .setMIFlag(MachineInstr::FrameSetup);
+ Reg1 = AArch64::X0;
+ AFI->setVGIdx(RPI.FrameIdx);
+ }
}
+
LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
dbgs() << ") -> fi#(" << RPI.FrameIdx;
@@ -3154,6 +3296,13 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
if (RPI.isPaired())
MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector);
}
+
+ if (X0Scratch != AArch64::NoRegister)
+ BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), AArch64::X0)
+ .addReg(AArch64::XZR)
+ .addReg(X0Scratch, RegState::Undef)
+ .addReg(X0Scratch, RegState::Implicit)
+ .setMIFlag(MachineInstr::FrameSetup);
}
return true;
}
@@ -3233,6 +3382,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
Size = 2;
Alignment = Align(2);
break;
+ case RegPairInfo::VG:
+ continue;
}
LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
@@ -3432,6 +3583,19 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
CSStackSize += RegSize;
}
+ // Increase the callee-saved stack size if the function has streaming mode
+ // changes, as we will need to spill the value of the VG register.
+ // For locally streaming functions, we spill both the streaming and
+ // non-streaming VG value.
+ const Function &F = MF.getFunction();
+ SMEAttrs Attrs(F);
+ if (AFI->hasStreamingModeChanges()) {
+ if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface())
+ CSStackSize += 16;
+ else
+ CSStackSize += 8;
+ }
+
// Save number of saved regs, so we can easily update CSStackSize later.
unsigned NumSavedRegs = SavedRegs.count();
@@ -3568,6 +3732,33 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
}
+ // Insert VG into the list of CSRs, immediately before LR if saved.
+ if (AFI->hasStreamingModeChanges()) {
+ std::vector<CalleeSavedInfo> VGSaves;
+ SMEAttrs Attrs(MF.getFunction());
+
+ auto VGInfo = CalleeSavedInfo(AArch64::VG);
+ VGInfo.setRestored(false);
+ VGSaves.push_back(VGInfo);
+
+ // Add VG again if the function is locally-streaming, as we will spill two
+ // values.
+ if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface())
+ VGSaves.push_back(VGInfo);
+
+ bool InsertBeforeLR = false;
+
+ for (unsigned I = 0; I < CSI.size(); I++)
+ if (CSI[I].getReg() == AArch64::LR) {
+ InsertBeforeLR = true;
+ CSI.insert(CSI.begin() + I, VGSaves.begin(), VGSaves.end());
+ break;
+ }
+
+ if (!InsertBeforeLR)
+ CSI.insert(CSI.end(), VGSaves.begin(), VGSaves.end());
+ }
+
for (auto &CS : CSI) {
Register Reg = CS.getReg();
const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
@@ -4183,12 +4374,58 @@ MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
}
} // namespace
+MachineBasicBlock::iterator emitVGSaveRestore(MachineBasicBlock::iterator II,
+ const AArch64FrameLowering *TFI) {
+ MachineInstr &MI = *II;
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineFunction *MF = MBB->getParent();
+
+ if (MI.getOpcode() != AArch64::VGSavePseudo &&
+ MI.getOpcode() != AArch64::VGRestorePseudo)
+ return II;
+
+ SMEAttrs FuncAttrs(MF->getFunction());
+ bool LocallyStreaming =
+ FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface();
+ const AArch64FunctionInfo *AFI = MF->getInfo<AArch64FunctionInfo>();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ const AArch64InstrInfo *TII =
+ MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+ int64_t VGFrameIdx =
+ LocallyStreaming ? AFI->getStreamingVGIdx() : AFI->getVGIdx();
+ assert(VGFrameIdx != std::numeric_limits<int>::max() &&
+ "Expected FrameIdx for VG");
+
+ unsigned CFIIndex;
+ if (MI.getOpcode() == AArch64::VGSavePseudo) {
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+ int64_t Offset =
+ MFI.getObjectOffset(VGFrameIdx) - TFI->getOffsetOfLocalArea();
+ CFIIndex = MF->addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, TRI->getDwarfRegNum(AArch64::VG, true), Offset));
+ } else
+ CFIIndex = MF->addFrameInst(MCCFIInstruction::createRestore(
+ nullptr, TRI->getDwarfRegNum(AArch64::VG, true)));
+
+ MachineInstr *UnwindInst = BuildMI(*MBB, II, II->getDebugLoc(),
+ TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ MI.eraseFromParent();
+ return UnwindInst->getIterator();
+}
+
void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
MachineFunction &MF, RegScavenger *RS = nullptr) const {
- if (StackTaggingMergeSetTag)
- for (auto &BB : MF)
- for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ for (auto &BB : MF)
+ for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();) {
+ if (AFI->hasStreamingModeChanges())
+ II = emitVGSaveRestore(II, this);
+ if (StackTaggingMergeSetTag)
II = tryMergeAdjacentSTG(II, this, RS);
+ }
}
/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
diff --git a/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
index b87421e5ee46..82066b48c84b 100644
--- a/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
+++ b/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
@@ -137,7 +137,9 @@ bool AArch64GenRegisterBankInfo::checkValueMapImpl(unsigned Idx,
unsigned Offset) {
unsigned PartialMapBaseIdx = Idx - PartialMappingIdx::PMI_Min;
const ValueMapping &Map =
- AArch64GenRegisterBankInfo::getValueMapping((PartialMappingIdx)FirstInBank, Size)[Offset];
+ AArch64GenRegisterBankInfo::getValueMapping(
+ (PartialMappingIdx)FirstInBank,
+ TypeSize::getFixed(Size))[Offset];
return Map.BreakDown == &PartMappings[PartialMapBaseIdx] &&
Map.NumBreakDowns == 1;
}
@@ -167,7 +169,7 @@ bool AArch64GenRegisterBankInfo::checkPartialMappingIdx(
}
unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx,
- unsigned Size) {
+ TypeSize Size) {
if (RBIdx == PMI_FirstGPR) {
if (Size <= 32)
return 0;
@@ -178,17 +180,20 @@ unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx,
return -1;
}
if (RBIdx == PMI_FirstFPR) {
- if (Size <= 16)
+ const unsigned MinSize = Size.getKnownMinValue();
+ assert((!Size.isScalable() || MinSize >= 128) &&
+ "Scalable vector types should have size of at least 128 bits");
+ if (MinSize <= 16)
return 0;
- if (Size <= 32)
+ if (MinSize <= 32)
return 1;
- if (Size <= 64)
+ if (MinSize <= 64)
return 2;
- if (Size <= 128)
+ if (MinSize <= 128)
return 3;
- if (Size <= 256)
+ if (MinSize <= 256)
return 4;
- if (Size <= 512)
+ if (MinSize <= 512)
return 5;
return -1;
}
@@ -197,7 +202,7 @@ unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx,
const RegisterBankInfo::ValueMapping *
AArch64GenRegisterBankInfo::getValueMapping(PartialMappingIdx RBIdx,
- unsigned Size) {
+ const TypeSize Size) {
assert(RBIdx != PartialMappingIdx::PMI_None && "No mapping needed for that");
unsigned BaseIdxOffset = getRegBankBaseIdxOffset(RBIdx, Size);
if (BaseIdxOffset == -1u)
@@ -221,7 +226,8 @@ const AArch64GenRegisterBankInfo::PartialMappingIdx
const RegisterBankInfo::ValueMapping *
AArch64GenRegisterBankInfo::getCopyMapping(unsigned DstBankID,
- unsigned SrcBankID, unsigned Size) {
+ unsigned SrcBankID,
+ const TypeSize Size) {
assert(DstBankID < AArch64::NumRegisterBanks && "Invalid bank ID");
assert(SrcBankID < AArch64::NumRegisterBanks && "Invalid bank ID");
PartialMappingIdx DstRBIdx = BankIDToCopyMapIdx[DstBankID];
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 248778f98f4c..544eec3ab9ce 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -395,7 +395,8 @@ public:
template <unsigned MaxIdx, unsigned Scale>
void SelectMultiVectorMove(SDNode *N, unsigned NumVecs, unsigned BaseReg,
unsigned Op);
-
+ void SelectMultiVectorMoveZ(SDNode *N, unsigned NumVecs, unsigned Op,
+ unsigned MaxIdx, unsigned Scale);
bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
/// SVE Reg+Imm addressing mode.
template <int64_t Min, int64_t Max>
@@ -2003,6 +2004,34 @@ void AArch64DAGToDAGISel::SelectMultiVectorMove(SDNode *N, unsigned NumVecs,
CurDAG->RemoveDeadNode(N);
}
+void AArch64DAGToDAGISel::SelectMultiVectorMoveZ(SDNode *N, unsigned NumVecs,
+ unsigned Op, unsigned MaxIdx,
+ unsigned Scale) {
+
+ SDValue SliceBase = N->getOperand(3);
+ SDValue Base, Offset;
+ if (!SelectSMETileSlice(SliceBase, MaxIdx, Base, Offset, Scale))
+ return;
+ // The correct Za tile number is computed in Machine Instruction
+ // See EmitZAInstr
+ // DAG cannot select Za tile as an output register with ZReg
+ SDLoc DL(N);
+ SDValue Ops[] = {/*TileNum*/ N->getOperand(2), Base, Offset,
+ /*Chain*/ N->getOperand(0)};
+ SDNode *Mov = CurDAG->getMachineNode(Op, DL, {MVT::Untyped, MVT::Other}, Ops);
+
+ EVT VT = N->getValueType(0);
+ for (unsigned I = 0; I < NumVecs; ++I)
+ ReplaceUses(SDValue(N, I),
+ CurDAG->getTargetExtractSubreg(AArch64::zsub0 + I, DL, VT,
+ SDValue(Mov, 0)));
+
+ // Copy chain
+ unsigned ChainIdx = NumVecs;
+ ReplaceUses(SDValue(N, ChainIdx), SDValue(Mov, 1));
+ CurDAG->RemoveDeadNode(N);
+}
+
void AArch64DAGToDAGISel::SelectUnaryMultiIntrinsic(SDNode *N,
unsigned NumOutVecs,
bool IsTupleInput,
@@ -4359,7 +4388,9 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) {
// N1 = SRL_PRED true, V, splat(imm) --> rotr amount
// N0 = SHL_PRED true, V, splat(bits-imm)
// V = (xor x, y)
- if (VT.isScalableVector() && Subtarget->hasSVE2orSME()) {
+ if (VT.isScalableVector() &&
+ (Subtarget->hasSVE2() ||
+ (Subtarget->hasSME() && Subtarget->isStreaming()))) {
if (N0.getOpcode() != AArch64ISD::SHL_PRED ||
N1.getOpcode() != AArch64ISD::SRL_PRED)
std::swap(N0, N1);
@@ -5243,6 +5274,74 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
AArch64::MOVA_VG4_4ZMXI);
return;
}
+ case Intrinsic::aarch64_sme_readz_horiz_x2: {
+ if (VT == MVT::nxv16i8) {
+ SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_H_B_PSEUDO, 14, 2);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ VT == MVT::nxv8bf16) {
+ SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_H_H_PSEUDO, 6, 2);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_H_S_PSEUDO, 2, 2);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_H_D_PSEUDO, 0, 2);
+ return;
+ }
+ break;
+ }
+ case Intrinsic::aarch64_sme_readz_vert_x2: {
+ if (VT == MVT::nxv16i8) {
+ SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_V_B_PSEUDO, 14, 2);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ VT == MVT::nxv8bf16) {
+ SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_V_H_PSEUDO, 6, 2);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_V_S_PSEUDO, 2, 2);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_V_D_PSEUDO, 0, 2);
+ return;
+ }
+ break;
+ }
+ case Intrinsic::aarch64_sme_readz_horiz_x4: {
+ if (VT == MVT::nxv16i8) {
+ SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_H_B_PSEUDO, 12, 4);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ VT == MVT::nxv8bf16) {
+ SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_H_H_PSEUDO, 4, 4);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_H_S_PSEUDO, 0, 4);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_H_D_PSEUDO, 0, 4);
+ return;
+ }
+ break;
+ }
+ case Intrinsic::aarch64_sme_readz_vert_x4: {
+ if (VT == MVT::nxv16i8) {
+ SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_V_B_PSEUDO, 12, 4);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ VT == MVT::nxv8bf16) {
+ SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_V_H_PSEUDO, 4, 4);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_V_S_PSEUDO, 0, 4);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_V_D_PSEUDO, 0, 4);
+ return;
+ }
+ break;
+ }
case Intrinsic::swift_async_context_addr: {
SDLoc DL(Node);
SDValue Chain = Node->getOperand(0);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 48bf648b0052..81132572e820 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -149,7 +149,7 @@ static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
// scalable vector types for all instruction, even if SVE is not yet supported
// with some instructions.
// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
-static cl::opt<bool> EnableSVEGISel(
+cl::opt<bool> EnableSVEGISel(
"aarch64-enable-gisel-sve", cl::Hidden,
cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
cl::init(false));
@@ -423,7 +423,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addQRType(MVT::v8bf16);
}
- if (Subtarget->hasSVEorSME()) {
+ if (Subtarget->isSVEorStreamingSVEAvailable()) {
// Add legal sve predicate types
addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
@@ -728,14 +728,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Promote);
}
- for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
- ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
- ISD::FTAN, ISD::FEXP, ISD::FEXP2,
- ISD::FEXP10, ISD::FLOG, ISD::FLOG2,
- ISD::FLOG10, ISD::STRICT_FREM, ISD::STRICT_FPOW,
- ISD::STRICT_FPOWI, ISD::STRICT_FCOS, ISD::STRICT_FSIN,
- ISD::STRICT_FEXP, ISD::STRICT_FEXP2, ISD::STRICT_FLOG,
- ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) {
+ for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
+ ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
+ ISD::FTAN, ISD::FEXP, ISD::FEXP2,
+ ISD::FEXP10, ISD::FLOG, ISD::FLOG2,
+ ISD::FLOG10, ISD::STRICT_FREM, ISD::STRICT_FPOW,
+ ISD::STRICT_FPOWI, ISD::STRICT_FCOS, ISD::STRICT_FSIN,
+ ISD::STRICT_FEXP, ISD::STRICT_FEXP2, ISD::STRICT_FLOG,
+ ISD::STRICT_FLOG2, ISD::STRICT_FLOG10, ISD::STRICT_FTAN}) {
setOperationAction(Op, MVT::f16, Promote);
setOperationAction(Op, MVT::v4f16, Expand);
setOperationAction(Op, MVT::v8f16, Expand);
@@ -1408,7 +1408,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// FIXME: Move lowering for more nodes here if those are common between
// SVE and SME.
- if (Subtarget->hasSVEorSME()) {
+ if (Subtarget->isSVEorStreamingSVEAvailable()) {
for (auto VT :
{MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
@@ -1418,7 +1418,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
}
- if (Subtarget->hasSVEorSME()) {
+ if (Subtarget->isSVEorStreamingSVEAvailable()) {
for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
setOperationAction(ISD::BITREVERSE, VT, Custom);
setOperationAction(ISD::BSWAP, VT, Custom);
@@ -1430,8 +1430,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, VT, Custom);
setOperationAction(ISD::FP_TO_UINT, VT, Custom);
setOperationAction(ISD::FP_TO_SINT, VT, Custom);
- setOperationAction(ISD::MGATHER, VT, Custom);
- setOperationAction(ISD::MSCATTER, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::MULHS, VT, Custom);
@@ -1486,7 +1484,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
if (!Subtarget->isLittleEndian())
setOperationAction(ISD::BITCAST, VT, Expand);
- if (Subtarget->hasSVE2orSME())
+ if (Subtarget->hasSVE2() ||
+ (Subtarget->hasSME() && Subtarget->isStreaming()))
// For SLI/SRI.
setOperationAction(ISD::OR, VT, Custom);
}
@@ -1528,14 +1527,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
}
- // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
- for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
- MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
- MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
+ // NEON doesn't support masked loads/stores, but SME and SVE do.
+ for (auto VT :
+ {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
+ MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
+ MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MSTORE, VT, Custom);
- setOperationAction(ISD::MGATHER, VT, Custom);
- setOperationAction(ISD::MSCATTER, VT, Custom);
}
// Firstly, exclude all scalable vector extending loads/truncating stores,
@@ -1576,8 +1574,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
MVT::nxv4f32, MVT::nxv2f64}) {
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
- setOperationAction(ISD::MGATHER, VT, Custom);
- setOperationAction(ISD::MSCATTER, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
@@ -1611,8 +1607,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
- if (Subtarget->isSVEAvailable())
- setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
@@ -1650,8 +1644,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
- setOperationAction(ISD::MGATHER, VT, Custom);
- setOperationAction(ISD::MSCATTER, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
@@ -1675,18 +1667,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v1i64, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
- if (Subtarget->isSVEAvailable()) {
- // NEON doesn't support across-vector reductions, but SVE does.
- for (auto VT :
- {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
- setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
- }
-
- // Histcnt is SVE2 only
- if (Subtarget->hasSVE2() && Subtarget->isSVEAvailable())
- setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::Other,
- Custom);
-
// NOTE: Currently this has to happen after computeRegisterProperties rather
// than the preferred option of combining it with the addRegisterClass call.
if (Subtarget->useSVEForFixedLengthVectors()) {
@@ -1762,6 +1742,31 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN, VT, Custom);
}
+ // Handle operations that are only available in non-streaming SVE mode.
+ if (Subtarget->isSVEAvailable()) {
+ for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
+ MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
+ MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
+ MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
+ MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
+ MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
+ MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
+ setOperationAction(ISD::MGATHER, VT, Custom);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+ }
+
+ for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
+ MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
+ MVT::v2f32, MVT::v4f32, MVT::v2f64})
+ setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
+
+ // Histcnt is SVE2 only
+ if (Subtarget->hasSVE2())
+ setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::Other,
+ Custom);
+ }
+
+
if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
// Only required for llvm.aarch64.mops.memset.tag
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
@@ -1781,6 +1786,20 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
IsStrictFPEnabled = true;
setMaxAtomicSizeInBitsSupported(128);
+ // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
+ // it, but it's just a wrapper around ldexp.
+ if (Subtarget->isTargetWindows()) {
+ for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
+ if (isOperationExpand(Op, MVT::f32))
+ setOperationAction(Op, MVT::f32, Promote);
+ }
+
+ // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
+ // isn't legal.
+ for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
+ if (isOperationExpand(Op, MVT::f16))
+ setOperationAction(Op, MVT::f16, Promote);
+
if (Subtarget->isWindowsArm64EC()) {
// FIXME: are there intrinsics we need to exclude from this?
for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
@@ -1933,7 +1952,7 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
}
bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
- if (!Subtarget->hasSVEorSME())
+ if (!Subtarget->isSVEorStreamingSVEAvailable())
return true;
// We can only use the BRKB + CNTP sequence with legal predicate types. We can
@@ -2492,7 +2511,11 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((AArch64ISD::NodeType)Opcode) {
case AArch64ISD::FIRST_NUMBER:
break;
+ MAKE_CASE(AArch64ISD::ALLOCATE_ZA_BUFFER)
+ MAKE_CASE(AArch64ISD::INIT_TPIDR2OBJ)
MAKE_CASE(AArch64ISD::COALESCER_BARRIER)
+ MAKE_CASE(AArch64ISD::VG_SAVE)
+ MAKE_CASE(AArch64ISD::VG_RESTORE)
MAKE_CASE(AArch64ISD::SMSTART)
MAKE_CASE(AArch64ISD::SMSTOP)
MAKE_CASE(AArch64ISD::RESTORE_ZA)
@@ -2953,18 +2976,25 @@ MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI,
MachineBasicBlock *
AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
MachineInstr &MI,
- MachineBasicBlock *BB, bool HasTile) const {
+ MachineBasicBlock *BB) const {
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
unsigned StartIdx = 0;
+ bool HasTile = BaseReg != AArch64::ZA;
+ bool HasZPROut = HasTile && MI.getOperand(0).isReg();
+ if (HasZPROut) {
+ MIB.add(MI.getOperand(0)); // Output ZPR
+ ++StartIdx;
+ }
if (HasTile) {
- MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
- MIB.addReg(BaseReg + MI.getOperand(0).getImm());
- StartIdx = 1;
- } else
+ MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
+ RegState::Define); // Output ZA Tile
+ MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile
+ StartIdx++;
+ } else {
MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
-
+ }
for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
MIB.add(MI.getOperand(I));
@@ -2989,6 +3019,80 @@ AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
return BB;
}
+MachineBasicBlock *
+AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
+ TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
+ if (TPIDR2.Uses > 0) {
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ // Store the buffer pointer to the TPIDR2 stack object.
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
+ .addReg(MI.getOperand(0).getReg())
+ .addFrameIndex(TPIDR2.FrameIndex)
+ .addImm(0);
+ // Set the reserved bytes (10-15) to zero
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
+ .addReg(AArch64::WZR)
+ .addFrameIndex(TPIDR2.FrameIndex)
+ .addImm(5);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
+ .addReg(AArch64::WZR)
+ .addFrameIndex(TPIDR2.FrameIndex)
+ .addImm(3);
+ } else
+ MFI.RemoveStackObject(TPIDR2.FrameIndex);
+
+ BB->remove_instr(&MI);
+ return BB;
+}
+
+MachineBasicBlock *
+AArch64TargetLowering::EmitAllocateZABuffer(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
+ // TODO This function grows the stack with a subtraction, which doesn't work
+ // on Windows. Some refactoring to share the functionality in
+ // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
+ // supports SME
+ assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() &&
+ "Lazy ZA save is not yet supported on Windows");
+
+ TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
+
+ if (TPIDR2.Uses > 0) {
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ // The SUBXrs below won't always be emitted in a form that accepts SP
+ // directly
+ Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
+ .addReg(AArch64::SP);
+
+ // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
+ auto Size = MI.getOperand(1).getReg();
+ auto Dest = MI.getOperand(0).getReg();
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
+ .addReg(Size)
+ .addReg(Size)
+ .addReg(SP);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
+ AArch64::SP)
+ .addReg(Dest);
+
+ // We have just allocated a variable sized object, tell this to PEI.
+ MFI.CreateVariableSizedObject(Align(16), nullptr);
+ }
+
+ BB->remove_instr(&MI);
+ return BB;
+}
+
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
@@ -2999,17 +3103,17 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
switch (SMEMatrixType) {
case (AArch64::SMEMatrixArray):
- return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false);
+ return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
case (AArch64::SMEMatrixTileB):
- return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true);
+ return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
case (AArch64::SMEMatrixTileH):
- return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true);
+ return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
case (AArch64::SMEMatrixTileS):
- return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true);
+ return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
case (AArch64::SMEMatrixTileD):
- return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true);
+ return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
case (AArch64::SMEMatrixTileQ):
- return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true);
+ return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
}
}
@@ -3019,7 +3123,10 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
MI.dump();
#endif
llvm_unreachable("Unexpected instruction for custom inserter!");
-
+ case AArch64::InitTPIDR2Obj:
+ return EmitInitTPIDR2Object(MI, BB);
+ case AArch64::AllocateZABuffer:
+ return EmitAllocateZABuffer(MI, BB);
case AArch64::F128CSEL:
return EmitF128CSEL(MI, BB);
case TargetOpcode::STATEPOINT:
@@ -3434,6 +3541,12 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
}
Opcode = AArch64ISD::FCCMP;
+ } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
+ APInt Imm = Const->getAPIntValue();
+ if (Imm.isNegative() && Imm.sgt(-32)) {
+ Opcode = AArch64ISD::CCMN;
+ RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
+ }
} else if (RHS.getOpcode() == ISD::SUB) {
SDValue SubOp0 = RHS.getOperand(0);
if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
@@ -6410,6 +6523,10 @@ SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
if (LoadNode->getMemoryVT() != MVT::v4i8)
return SDValue();
+ // Avoid generating unaligned loads.
+ if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
+ return SDValue();
+
unsigned ExtType;
if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
ExtType = ISD::SIGN_EXTEND;
@@ -6901,7 +7018,7 @@ bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
// NEON-sized vectors can be emulated using SVE instructions.
if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
- return Subtarget->hasSVEorSME();
+ return Subtarget->isSVEorStreamingSVEAvailable();
// Ensure NEON MVTs only belong to a single register class.
if (VT.getFixedSizeInBits() <= 128)
@@ -7027,47 +7144,6 @@ AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
}
}
-
-unsigned
-AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
- SelectionDAG &DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
- MachineFrameInfo &MFI = MF.getFrameInfo();
-
- // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
- SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
- DAG.getConstant(1, DL, MVT::i32));
- SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
- SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
- SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
- SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
- Chain = Buffer.getValue(1);
- MFI.CreateVariableSizedObject(Align(1), nullptr);
-
- // Allocate an additional TPIDR2 object on the stack (16 bytes)
- unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false);
-
- // Store the buffer pointer to the TPIDR2 stack object.
- MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
- SDValue Ptr = DAG.getFrameIndex(
- TPIDR2Obj,
- DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
- Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI);
-
- // Set the reserved bytes (10-15) to zero
- EVT PtrTy = Ptr.getValueType();
- SDValue ReservedPtr =
- DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(10, DL, PtrTy));
- Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr,
- MPI);
- ReservedPtr =
- DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(12, DL, PtrTy));
- Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr,
- MPI);
-
- return TPIDR2Obj;
-}
-
static bool isPassedInFPR(EVT VT) {
return VT.isFixedLengthVector() ||
(VT.isFloatingPoint() && !VT.isScalableVector());
@@ -7483,10 +7559,28 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
if (Subtarget->hasCustomCallingConv())
Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
- // Conservatively assume the function requires the lazy-save mechanism.
+ // Create a 16 Byte TPIDR2 object. The dynamic buffer
+ // will be expanded and stored in the static object later using a pseudonode.
if (SMEAttrs(MF.getFunction()).hasZAState()) {
- unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
- FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
+ TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
+ TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
+ SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
+ DAG.getConstant(1, DL, MVT::i32));
+
+ SDValue Buffer;
+ if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
+ Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
+ DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
+ } else {
+ SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
+ Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
+ DAG.getVTList(MVT::i64, MVT::Other),
+ {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
+ MFI.CreateVariableSizedObject(Align(16), nullptr);
+ }
+ Chain = DAG.getNode(
+ AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
+ {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)});
}
if (CallConv == CallingConv::PreserveNone) {
@@ -8172,9 +8266,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
if (RequiresLazySave) {
- unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
- MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
- SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
+ const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getStack(MF, TPIDR2.FrameIndex);
+ SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
+ TPIDR2.FrameIndex,
DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
SDValue NumZaSaveSlicesAddr =
DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
@@ -8514,6 +8610,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SDValue InGlue;
if (RequiresSMChange) {
+
+ Chain = DAG.getNode(AArch64ISD::VG_SAVE, DL,
+ DAG.getVTList(MVT::Other, MVT::Glue), Chain);
+ InGlue = Chain.getValue(1);
+
SDValue NewChain = changeStreamingMode(
DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
@@ -8691,6 +8792,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Result = changeStreamingMode(
DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
+ InGlue = Result.getValue(1);
+
+ Result =
+ DAG.getNode(AArch64ISD::VG_RESTORE, DL,
+ DAG.getVTList(MVT::Other, MVT::Glue), {Result, InGlue});
}
if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
@@ -8707,7 +8813,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
if (RequiresLazySave) {
// Conditionally restore the lazy save using a pseudo node.
- unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
+ TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
SDValue RegMask = DAG.getRegisterMask(
TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
@@ -8720,7 +8826,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// RESTORE_ZA pseudo.
SDValue Glue;
SDValue TPIDR2Block = DAG.getFrameIndex(
- FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
+ TPIDR2.FrameIndex,
+ DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
Result =
DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
@@ -8732,6 +8839,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
DAG.getConstant(0, DL, MVT::i64));
+ TPIDR2.Uses++;
}
if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
@@ -9735,9 +9843,7 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
EltSize *= 2;
NumElts /= 2;
MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
- Val = DAG.getNode(
- ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
- DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
+ Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
}
return Val;
@@ -14443,7 +14549,9 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
Op.getOperand(0), Op.getOperand(1));
case ISD::SRA:
case ISD::SRL:
- if (VT.isScalableVector() && Subtarget->hasSVE2orSME()) {
+ if (VT.isScalableVector() &&
+ (Subtarget->hasSVE2() ||
+ (Subtarget->hasSME() && Subtarget->isStreaming()))) {
SDValue RShOperand;
unsigned ShiftValue;
if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
@@ -14995,55 +15103,13 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
return SDValue();
}
-// When x and y are extended, lower:
-// avgfloor(x, y) -> (x + y) >> 1
-// avgceil(x, y) -> (x + y + 1) >> 1
-
-// Otherwise, lower to:
-// avgfloor(x, y) -> (x >> 1) + (y >> 1) + (x & y & 1)
-// avgceil(x, y) -> (x >> 1) + (y >> 1) + ((x || y) & 1)
SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
unsigned NewOp) const {
if (Subtarget->hasSVE2())
return LowerToPredicatedOp(Op, DAG, NewOp);
- SDLoc dl(Op);
- SDValue OpA = Op->getOperand(0);
- SDValue OpB = Op->getOperand(1);
- EVT VT = Op.getValueType();
- bool IsCeil =
- (Op->getOpcode() == ISD::AVGCEILS || Op->getOpcode() == ISD::AVGCEILU);
- bool IsSigned =
- (Op->getOpcode() == ISD::AVGFLOORS || Op->getOpcode() == ISD::AVGCEILS);
- unsigned ShiftOpc = IsSigned ? ISD::SRA : ISD::SRL;
-
- assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
-
- auto IsZeroExtended = [&DAG](SDValue &Node) {
- KnownBits Known = DAG.computeKnownBits(Node, 0);
- return Known.Zero.isSignBitSet();
- };
-
- auto IsSignExtended = [&DAG](SDValue &Node) {
- return (DAG.ComputeNumSignBits(Node, 0) > 1);
- };
-
- SDValue ConstantOne = DAG.getConstant(1, dl, VT);
- if ((!IsSigned && IsZeroExtended(OpA) && IsZeroExtended(OpB)) ||
- (IsSigned && IsSignExtended(OpA) && IsSignExtended(OpB))) {
- SDValue Add = DAG.getNode(ISD::ADD, dl, VT, OpA, OpB);
- if (IsCeil)
- Add = DAG.getNode(ISD::ADD, dl, VT, Add, ConstantOne);
- return DAG.getNode(ShiftOpc, dl, VT, Add, ConstantOne);
- }
-
- SDValue ShiftOpA = DAG.getNode(ShiftOpc, dl, VT, OpA, ConstantOne);
- SDValue ShiftOpB = DAG.getNode(ShiftOpc, dl, VT, OpB, ConstantOne);
-
- SDValue tmp = DAG.getNode(IsCeil ? ISD::OR : ISD::AND, dl, VT, OpA, OpB);
- tmp = DAG.getNode(ISD::AND, dl, VT, tmp, ConstantOne);
- SDValue Add = DAG.getNode(ISD::ADD, dl, VT, ShiftOpA, ShiftOpB);
- return DAG.getNode(ISD::ADD, dl, VT, Add, tmp);
+ // Default to expand.
+ return SDValue();
}
SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
@@ -15854,48 +15920,49 @@ bool AArch64TargetLowering::shouldSinkOperands(
return false;
}
-static bool createTblShuffleForZExt(ZExtInst *ZExt, FixedVectorType *DstTy,
- bool IsLittleEndian) {
- Value *Op = ZExt->getOperand(0);
- auto *SrcTy = cast<FixedVectorType>(Op->getType());
- auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
- auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
+static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
+ unsigned NumElts, bool IsLittleEndian,
+ SmallVectorImpl<int> &Mask) {
if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64)
return false;
assert(DstWidth % SrcWidth == 0 &&
- "TBL lowering is not supported for a ZExt instruction with this "
- "source & destination element type.");
- unsigned ZExtFactor = DstWidth / SrcWidth;
+ "TBL lowering is not supported for a conversion instruction with this "
+ "source and destination element type.");
+
+ unsigned Factor = DstWidth / SrcWidth;
+ unsigned MaskLen = NumElts * Factor;
+
+ Mask.clear();
+ Mask.resize(MaskLen, NumElts);
+
+ unsigned SrcIndex = 0;
+ for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
+ Mask[I] = SrcIndex++;
+
+ return true;
+}
+
+static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,
+ FixedVectorType *ZExtTy,
+ FixedVectorType *DstTy,
+ bool IsLittleEndian) {
+ auto *SrcTy = cast<FixedVectorType>(Op->getType());
unsigned NumElts = SrcTy->getNumElements();
- IRBuilder<> Builder(ZExt);
+ auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
+ auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
+
SmallVector<int> Mask;
- // Create a mask that selects <0,...,Op[i]> for each lane of the destination
- // vector to replace the original ZExt. This can later be lowered to a set of
- // tbl instructions.
- for (unsigned i = 0; i < NumElts * ZExtFactor; i++) {
- if (IsLittleEndian) {
- if (i % ZExtFactor == 0)
- Mask.push_back(i / ZExtFactor);
- else
- Mask.push_back(NumElts);
- } else {
- if ((i + 1) % ZExtFactor == 0)
- Mask.push_back((i - ZExtFactor + 1) / ZExtFactor);
- else
- Mask.push_back(NumElts);
- }
- }
+ if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
+ return nullptr;
auto *FirstEltZero = Builder.CreateInsertElement(
PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
Result = Builder.CreateBitCast(Result, DstTy);
- if (DstTy != ZExt->getType())
- Result = Builder.CreateZExt(Result, ZExt->getType());
- ZExt->replaceAllUsesWith(Result);
- ZExt->eraseFromParent();
- return true;
+ if (DstTy != ZExtTy)
+ Result = Builder.CreateZExt(Result, ZExtTy);
+ return Result;
}
static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
@@ -16060,21 +16127,30 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
DstTy = TruncDstType;
}
-
- return createTblShuffleForZExt(ZExt, DstTy, Subtarget->isLittleEndian());
+ IRBuilder<> Builder(ZExt);
+ Value *Result = createTblShuffleForZExt(
+ Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
+ DstTy, Subtarget->isLittleEndian());
+ if (!Result)
+ return false;
+ ZExt->replaceAllUsesWith(Result);
+ ZExt->eraseFromParent();
+ return true;
}
auto *UIToFP = dyn_cast<UIToFPInst>(I);
if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
DstTy->getElementType()->isFloatTy()) {
IRBuilder<> Builder(I);
- auto *ZExt = cast<ZExtInst>(
- Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy)));
+ Value *ZExt = createTblShuffleForZExt(
+ Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
+ FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
+ if (!ZExt)
+ return false;
auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
I->replaceAllUsesWith(UI);
I->eraseFromParent();
- return createTblShuffleForZExt(ZExt, cast<FixedVectorType>(ZExt->getType()),
- Subtarget->isLittleEndian());
+ return true;
}
// Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
@@ -16149,15 +16225,13 @@ bool AArch64TargetLowering::isLegalInterleavedAccessType(
UseScalable = false;
- if (!VecTy->isScalableTy() && !Subtarget->isNeonAvailable() &&
- !Subtarget->useSVEForFixedLengthVectors())
- return false;
-
- if (VecTy->isScalableTy() && !Subtarget->hasSVEorSME())
+ if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
+ (!Subtarget->useSVEForFixedLengthVectors() ||
+ !getSVEPredPatternFromNumElements(MinElts)))
return false;
- // Ensure that the predicate for this number of elements is available.
- if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(MinElts))
+ if (isa<ScalableVectorType>(VecTy) &&
+ !Subtarget->isSVEorStreamingSVEAvailable())
return false;
// Ensure the number of vector elements is greater than 1.
@@ -17720,6 +17794,47 @@ static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
}
+// Transform vector add(zext i8 to i32, zext i8 to i32)
+// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
+// This allows extra uses of saddl/uaddl at the lower vector widths, and less
+// extends.
+static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
+ (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
+ N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
+ (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
+ N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
+ N->getOperand(0).getOperand(0).getValueType() !=
+ N->getOperand(1).getOperand(0).getValueType())
+ return SDValue();
+
+ if (N->getOpcode() == ISD::MUL &&
+ N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0).getOperand(0);
+ SDValue N1 = N->getOperand(1).getOperand(0);
+ EVT InVT = N0.getValueType();
+
+ EVT S1 = InVT.getScalarType();
+ EVT S2 = VT.getScalarType();
+ if ((S2 == MVT::i32 && S1 == MVT::i8) ||
+ (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
+ SDLoc DL(N);
+ EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
+ S2.getHalfSizedIntegerVT(*DAG.getContext()),
+ VT.getVectorElementCount());
+ SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
+ SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
+ SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
+ return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
+ : (unsigned)ISD::SIGN_EXTEND,
+ DL, VT, NewOp);
+ }
+ return SDValue();
+}
+
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
@@ -17728,6 +17843,8 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
return Ext;
if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
return Ext;
+ if (SDValue Ext = performVectorExtCombine(N, DAG))
+ return Ext;
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -18124,9 +18241,11 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
if (!VT.isVector())
return SDValue();
- // The combining code works for NEON, SVE2 and SME.
- if (TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable()) ||
- (VT.isScalableVector() && !Subtarget.hasSVE2()))
+ if (VT.isScalableVector() && !Subtarget.hasSVE2())
+ return SDValue();
+
+ if (VT.isFixedLengthVector() &&
+ (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT)))
return SDValue();
SDValue N0 = N->getOperand(0);
@@ -19604,41 +19723,6 @@ static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
}
-// Transform vector add(zext i8 to i32, zext i8 to i32)
-// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
-// This allows extra uses of saddl/uaddl at the lower vector widths, and less
-// extends.
-static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) {
- EVT VT = N->getValueType(0);
- if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
- (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
- N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
- (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
- N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
- N->getOperand(0).getOperand(0).getValueType() !=
- N->getOperand(1).getOperand(0).getValueType())
- return SDValue();
-
- SDValue N0 = N->getOperand(0).getOperand(0);
- SDValue N1 = N->getOperand(1).getOperand(0);
- EVT InVT = N0.getValueType();
-
- EVT S1 = InVT.getScalarType();
- EVT S2 = VT.getScalarType();
- if ((S2 == MVT::i32 && S1 == MVT::i8) ||
- (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
- SDLoc DL(N);
- EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
- S2.getHalfSizedIntegerVT(*DAG.getContext()),
- VT.getVectorElementCount());
- SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
- SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
- SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
- return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
- }
- return SDValue();
-}
-
static SDValue performBuildVectorCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -20260,7 +20344,7 @@ static SDValue performAddSubCombine(SDNode *N,
return Val;
if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
return Val;
- if (SDValue Val = performVectorAddSubExtCombine(N, DCI.DAG))
+ if (SDValue Val = performVectorExtCombine(N, DCI.DAG))
return Val;
if (SDValue Val = performAddCombineForShiftedOperands(N, DCI.DAG))
return Val;
@@ -22283,7 +22367,8 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
SmallVector<SDValue, 16> MaskConstants;
- if (VecVT == MVT::v16i8) {
+ if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
+ VecVT == MVT::v16i8) {
// v16i8 is a special case, as we have 16 entries but only 8 positional bits
// per entry. We split it into two halves, apply the mask, zip the halves to
// create 8x 16-bit values, and the perform the vector reduce.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 48a4ea91c278..5200b24d1388 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -70,6 +70,9 @@ enum NodeType : unsigned {
COALESCER_BARRIER,
+ VG_SAVE,
+ VG_RESTORE,
+
SMSTART,
SMSTOP,
RESTORE_ZA,
@@ -454,6 +457,8 @@ enum NodeType : unsigned {
// SME
RDSVL,
REVD_MERGE_PASSTHRU,
+ ALLOCATE_ZA_BUFFER,
+ INIT_TPIDR2OBJ,
// Asserts that a function argument (i32) is zero-extended to i8 by
// the caller
@@ -650,11 +655,14 @@ public:
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const;
MachineBasicBlock *EmitZAInstr(unsigned Opc, unsigned BaseReg,
- MachineInstr &MI, MachineBasicBlock *BB,
- bool HasTile) const;
+ MachineInstr &MI, MachineBasicBlock *BB) const;
MachineBasicBlock *EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB,
unsigned Opcode, bool Op0IsDef) const;
MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitInitTPIDR2Object(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitAllocateZABuffer(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
@@ -1034,9 +1042,6 @@ private:
bool shouldExpandBuildVectorWithShuffles(EVT, unsigned) const override;
- unsigned allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
- SelectionDAG &DAG) const;
-
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 1f437d0ed6f8..e1ecc5a57dd2 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -7602,13 +7602,12 @@ multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm,
}
let mayRaiseFPException = 1, Uses = [FPCR] in
-multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm,
- Predicate pred = HasNEON> {
- let Predicates = [pred] in {
+multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm> {
+ let Predicates = [HasNEONandIsStreamingSafe] in {
def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,[]>;
def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,[]>;
}
- let Predicates = [pred, HasFullFP16] in {
+ let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
def v1f16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,[]>;
}
}
@@ -7616,11 +7615,13 @@ multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm,
let mayRaiseFPException = 1, Uses = [FPCR] in
multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
+ let Predicates = [HasNEONandIsStreamingSafe] in {
def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,
[(set FPR64:$Rd, (OpNode (f64 FPR64:$Rn)))]>;
def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,
[(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>;
- let Predicates = [HasNEON, HasFullFP16] in {
+ }
+ let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
def v1i16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,
[(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn)))]>;
}
@@ -7880,7 +7881,7 @@ class SIMDMovAlias<string asm, string size, Instruction inst,
multiclass SMov {
// SMOV with vector index of 0 are legal in Scalable Matrix Extension (SME)
// streaming mode.
- let Predicates = [HasNEONorSME] in {
+ let Predicates = [HasNEONandIsStreamingSafe] in {
def vi8to32_idx0 : SIMDSMov<0, ".b", GPR32, VectorIndex0> {
let Inst{20-16} = 0b00001;
}
@@ -7927,7 +7928,7 @@ multiclass SMov {
multiclass UMov {
// UMOV with vector index of 0 are legal in Scalable Matrix Extension (SME)
// streaming mode.
- let Predicates = [HasNEONorSME] in {
+ let Predicates = [HasNEONandIsStreamingSafe] in {
def vi8_idx0 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndex0> {
let Inst{20-16} = 0b00001;
}
@@ -11887,79 +11888,79 @@ multiclass LDOPregister<bits<3> opc, string op, bits<1> Acq, bits<1> Rel,
// complex DAG for DstRHS.
let Predicates = [HasLSE] in
multiclass LDOPregister_patterns_ord_dag<string inst, string suffix, string op,
- string size, dag SrcRHS, dag DstRHS> {
- def : Pat<(!cast<PatFrag>(op#"_"#size#"_monotonic") GPR64sp:$Rn, SrcRHS),
+ ValueType vt, dag SrcRHS, dag DstRHS> {
+ def : Pat<(!cast<PatFrag>(op#"_"#vt#"_monotonic") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # suffix) DstRHS, GPR64sp:$Rn)>;
- def : Pat<(!cast<PatFrag>(op#"_"#size#"_acquire") GPR64sp:$Rn, SrcRHS),
+ def : Pat<(!cast<PatFrag>(op#"_"#vt#"_acquire") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # "A" # suffix) DstRHS, GPR64sp:$Rn)>;
- def : Pat<(!cast<PatFrag>(op#"_"#size#"_release") GPR64sp:$Rn, SrcRHS),
+ def : Pat<(!cast<PatFrag>(op#"_"#vt#"_release") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # "L" # suffix) DstRHS, GPR64sp:$Rn)>;
- def : Pat<(!cast<PatFrag>(op#"_"#size#"_acq_rel") GPR64sp:$Rn, SrcRHS),
+ def : Pat<(!cast<PatFrag>(op#"_"#vt#"_acq_rel") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # "AL" # suffix) DstRHS, GPR64sp:$Rn)>;
- def : Pat<(!cast<PatFrag>(op#"_"#size#"_seq_cst") GPR64sp:$Rn, SrcRHS),
+ def : Pat<(!cast<PatFrag>(op#"_"#vt#"_seq_cst") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # "AL" # suffix) DstRHS, GPR64sp:$Rn)>;
}
multiclass LDOPregister_patterns_ord<string inst, string suffix, string op,
- string size, dag RHS> {
- defm : LDOPregister_patterns_ord_dag<inst, suffix, op, size, RHS, RHS>;
+ ValueType vt, dag RHS> {
+ defm : LDOPregister_patterns_ord_dag<inst, suffix, op, vt, RHS, RHS>;
}
multiclass LDOPregister_patterns_ord_mod<string inst, string suffix, string op,
- string size, dag LHS, dag RHS> {
- defm : LDOPregister_patterns_ord_dag<inst, suffix, op, size, LHS, RHS>;
+ ValueType vt, dag LHS, dag RHS> {
+ defm : LDOPregister_patterns_ord_dag<inst, suffix, op, vt, LHS, RHS>;
}
multiclass LDOPregister_patterns<string inst, string op> {
- defm : LDOPregister_patterns_ord<inst, "X", op, "64", (i64 GPR64:$Rm)>;
- defm : LDOPregister_patterns_ord<inst, "W", op, "32", (i32 GPR32:$Rm)>;
- defm : LDOPregister_patterns_ord<inst, "H", op, "16", (i32 GPR32:$Rm)>;
- defm : LDOPregister_patterns_ord<inst, "B", op, "8", (i32 GPR32:$Rm)>;
+ defm : LDOPregister_patterns_ord<inst, "X", op, i64, (i64 GPR64:$Rm)>;
+ defm : LDOPregister_patterns_ord<inst, "W", op, i32, (i32 GPR32:$Rm)>;
+ defm : LDOPregister_patterns_ord<inst, "H", op, i16, (i32 GPR32:$Rm)>;
+ defm : LDOPregister_patterns_ord<inst, "B", op, i8, (i32 GPR32:$Rm)>;
}
multiclass LDOPregister_patterns_mod<string inst, string op, string mod> {
- defm : LDOPregister_patterns_ord_mod<inst, "X", op, "64",
+ defm : LDOPregister_patterns_ord_mod<inst, "X", op, i64,
(i64 GPR64:$Rm),
(i64 (!cast<Instruction>(mod#Xrr) XZR, GPR64:$Rm))>;
- defm : LDOPregister_patterns_ord_mod<inst, "W", op, "32",
+ defm : LDOPregister_patterns_ord_mod<inst, "W", op, i32,
(i32 GPR32:$Rm),
(i32 (!cast<Instruction>(mod#Wrr) WZR, GPR32:$Rm))>;
- defm : LDOPregister_patterns_ord_mod<inst, "H", op, "16",
+ defm : LDOPregister_patterns_ord_mod<inst, "H", op, i16,
(i32 GPR32:$Rm),
(i32 (!cast<Instruction>(mod#Wrr) WZR, GPR32:$Rm))>;
- defm : LDOPregister_patterns_ord_mod<inst, "B", op, "8",
+ defm : LDOPregister_patterns_ord_mod<inst, "B", op, i8,
(i32 GPR32:$Rm),
(i32 (!cast<Instruction>(mod#Wrr) WZR, GPR32:$Rm))>;
}
let Predicates = [HasLSE] in
multiclass CASregister_patterns_ord_dag<string inst, string suffix, string op,
- string size, dag OLD, dag NEW> {
- def : Pat<(!cast<PatFrag>(op#"_"#size#"_monotonic") GPR64sp:$Rn, OLD, NEW),
+ ValueType vt, dag OLD, dag NEW> {
+ def : Pat<(!cast<PatFrag>(op#"_"#vt#"_monotonic") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # suffix) OLD, NEW, GPR64sp:$Rn)>;
- def : Pat<(!cast<PatFrag>(op#"_"#size#"_acquire") GPR64sp:$Rn, OLD, NEW),
+ def : Pat<(!cast<PatFrag>(op#"_"#vt#"_acquire") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # "A" # suffix) OLD, NEW, GPR64sp:$Rn)>;
- def : Pat<(!cast<PatFrag>(op#"_"#size#"_release") GPR64sp:$Rn, OLD, NEW),
+ def : Pat<(!cast<PatFrag>(op#"_"#vt#"_release") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # "L" # suffix) OLD, NEW, GPR64sp:$Rn)>;
- def : Pat<(!cast<PatFrag>(op#"_"#size#"_acq_rel") GPR64sp:$Rn, OLD, NEW),
+ def : Pat<(!cast<PatFrag>(op#"_"#vt#"_acq_rel") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # "AL" # suffix) OLD, NEW, GPR64sp:$Rn)>;
- def : Pat<(!cast<PatFrag>(op#"_"#size#"_seq_cst") GPR64sp:$Rn, OLD, NEW),
+ def : Pat<(!cast<PatFrag>(op#"_"#vt#"_seq_cst") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # "AL" # suffix) OLD, NEW, GPR64sp:$Rn)>;
}
multiclass CASregister_patterns_ord<string inst, string suffix, string op,
- string size, dag OLD, dag NEW> {
- defm : CASregister_patterns_ord_dag<inst, suffix, op, size, OLD, NEW>;
+ ValueType vt, dag OLD, dag NEW> {
+ defm : CASregister_patterns_ord_dag<inst, suffix, op, vt, OLD, NEW>;
}
multiclass CASregister_patterns<string inst, string op> {
- defm : CASregister_patterns_ord<inst, "X", op, "64",
+ defm : CASregister_patterns_ord<inst, "X", op, i64,
(i64 GPR64:$Rold), (i64 GPR64:$Rnew)>;
- defm : CASregister_patterns_ord<inst, "W", op, "32",
+ defm : CASregister_patterns_ord<inst, "W", op, i32,
(i32 GPR32:$Rold), (i32 GPR32:$Rnew)>;
- defm : CASregister_patterns_ord<inst, "H", op, "16",
+ defm : CASregister_patterns_ord<inst, "H", op, i16,
(i32 GPR32:$Rold), (i32 GPR32:$Rnew)>;
- defm : CASregister_patterns_ord<inst, "B", op, "8",
+ defm : CASregister_patterns_ord<inst, "B", op, i8,
(i32 GPR32:$Rold), (i32 GPR32:$Rnew)>;
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index 58ca52f37b63..2d2b2bee99ec 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -346,16 +346,16 @@ let Predicates = [HasNEON] in {
}
let Predicates = [HasNoLSE] in {
-def : Pat<(atomic_cmp_swap_8 GPR64:$addr, GPR32:$desired, GPR32:$new),
+def : Pat<(atomic_cmp_swap_i8 GPR64:$addr, GPR32:$desired, GPR32:$new),
(CMP_SWAP_8 GPR64:$addr, GPR32:$desired, GPR32:$new)>;
-def : Pat<(atomic_cmp_swap_16 GPR64:$addr, GPR32:$desired, GPR32:$new),
+def : Pat<(atomic_cmp_swap_i16 GPR64:$addr, GPR32:$desired, GPR32:$new),
(CMP_SWAP_16 GPR64:$addr, GPR32:$desired, GPR32:$new)>;
-def : Pat<(atomic_cmp_swap_32 GPR64:$addr, GPR32:$desired, GPR32:$new),
+def : Pat<(atomic_cmp_swap_i32 GPR64:$addr, GPR32:$desired, GPR32:$new),
(CMP_SWAP_32 GPR64:$addr, GPR32:$desired, GPR32:$new)>;
-def : Pat<(atomic_cmp_swap_64 GPR64:$addr, GPR64:$desired, GPR64:$new),
+def : Pat<(atomic_cmp_swap_i64 GPR64:$addr, GPR64:$desired, GPR64:$new),
(CMP_SWAP_64 GPR64:$addr, GPR64:$desired, GPR64:$new)>;
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 7d540efe2b41..ee397db3fba6 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -17,6 +17,7 @@
#include "AArch64PointerAuth.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
@@ -1354,48 +1355,52 @@ static bool areCFlagsAccessedBetweenInstrs(
return false;
}
-/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
-/// operation which could set the flags in an identical manner
-bool AArch64InstrInfo::optimizePTestInstr(
- MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
- const MachineRegisterInfo *MRI) const {
- auto *Mask = MRI->getUniqueVRegDef(MaskReg);
- auto *Pred = MRI->getUniqueVRegDef(PredReg);
- auto NewOp = Pred->getOpcode();
- bool OpChanged = false;
-
+std::optional<unsigned>
+AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
+ MachineInstr *Pred,
+ const MachineRegisterInfo *MRI) const {
unsigned MaskOpcode = Mask->getOpcode();
unsigned PredOpcode = Pred->getOpcode();
bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
bool PredIsWhileLike = isWhileOpcode(PredOpcode);
- if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike) &&
- getElementSizeForOpcode(MaskOpcode) ==
- getElementSizeForOpcode(PredOpcode) &&
- Mask->getOperand(1).getImm() == 31) {
+ if (PredIsWhileLike) {
+ // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
+ // instruction and the condition is "any" since WHILcc does an implicit
+ // PTEST(ALL, PG) check and PG is always a subset of ALL.
+ if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
+ return PredOpcode;
+
// For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
// redundant since WHILE performs an implicit PTEST with an all active
- // mask. Must be an all active predicate of matching element size.
+ // mask.
+ if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
+ getElementSizeForOpcode(MaskOpcode) ==
+ getElementSizeForOpcode(PredOpcode))
+ return PredOpcode;
+
+ return {};
+ }
+
+ if (PredIsPTestLike) {
+ // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
+ // instruction that sets the flags as PTEST would and the condition is
+ // "any" since PG is always a subset of the governing predicate of the
+ // ptest-like instruction.
+ if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
+ return PredOpcode;
// For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
- // PTEST_LIKE instruction uses the same all active mask and the element
- // size matches. If the PTEST has a condition of any then it is always
- // redundant.
- if (PredIsPTestLike) {
+ // the element size matches and either the PTEST_LIKE instruction uses
+ // the same all active mask or the condition is "any".
+ if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
+ getElementSizeForOpcode(MaskOpcode) ==
+ getElementSizeForOpcode(PredOpcode)) {
auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
- if (Mask != PTestLikeMask && PTest->getOpcode() != AArch64::PTEST_PP_ANY)
- return false;
+ if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
+ return PredOpcode;
}
- // Fallthough to simply remove the PTEST.
- } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike) &&
- PTest->getOpcode() == AArch64::PTEST_PP_ANY) {
- // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
- // instruction that sets the flags as PTEST would. This is only valid when
- // the condition is any.
-
- // Fallthough to simply remove the PTEST.
- } else if (PredIsPTestLike) {
// For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
// flags are set based on the same mask 'PG', but PTEST_LIKE must operate
// on 8-bit predicates like the PTEST. Otherwise, for instructions like
@@ -1420,56 +1425,67 @@ bool AArch64InstrInfo::optimizePTestInstr(
// identical regardless of element size.
auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
- if ((Mask != PTestLikeMask) ||
- (PredElementSize != AArch64::ElementSizeB &&
- PTest->getOpcode() != AArch64::PTEST_PP_ANY))
- return false;
+ if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
+ PTest->getOpcode() == AArch64::PTEST_PP_ANY))
+ return PredOpcode;
- // Fallthough to simply remove the PTEST.
- } else {
- // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
- // opcode so the PTEST becomes redundant.
- switch (PredOpcode) {
- case AArch64::AND_PPzPP:
- case AArch64::BIC_PPzPP:
- case AArch64::EOR_PPzPP:
- case AArch64::NAND_PPzPP:
- case AArch64::NOR_PPzPP:
- case AArch64::ORN_PPzPP:
- case AArch64::ORR_PPzPP:
- case AArch64::BRKA_PPzP:
- case AArch64::BRKPA_PPzPP:
- case AArch64::BRKB_PPzP:
- case AArch64::BRKPB_PPzPP:
- case AArch64::RDFFR_PPz: {
- // Check to see if our mask is the same. If not the resulting flag bits
- // may be different and we can't remove the ptest.
- auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
- if (Mask != PredMask)
- return false;
- break;
- }
- case AArch64::BRKN_PPzP: {
- // BRKN uses an all active implicit mask to set flags unlike the other
- // flag-setting instructions.
- // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
- if ((MaskOpcode != AArch64::PTRUE_B) ||
- (Mask->getOperand(1).getImm() != 31))
- return false;
- break;
- }
- case AArch64::PTRUE_B:
- // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
- break;
- default:
- // Bail out if we don't recognize the input
- return false;
- }
+ return {};
+ }
- NewOp = convertToFlagSettingOpc(PredOpcode);
- OpChanged = true;
+ // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
+ // opcode so the PTEST becomes redundant.
+ switch (PredOpcode) {
+ case AArch64::AND_PPzPP:
+ case AArch64::BIC_PPzPP:
+ case AArch64::EOR_PPzPP:
+ case AArch64::NAND_PPzPP:
+ case AArch64::NOR_PPzPP:
+ case AArch64::ORN_PPzPP:
+ case AArch64::ORR_PPzPP:
+ case AArch64::BRKA_PPzP:
+ case AArch64::BRKPA_PPzPP:
+ case AArch64::BRKB_PPzP:
+ case AArch64::BRKPB_PPzPP:
+ case AArch64::RDFFR_PPz: {
+ // Check to see if our mask is the same. If not the resulting flag bits
+ // may be different and we can't remove the ptest.
+ auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
+ if (Mask != PredMask)
+ return {};
+ break;
+ }
+ case AArch64::BRKN_PPzP: {
+ // BRKN uses an all active implicit mask to set flags unlike the other
+ // flag-setting instructions.
+ // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
+ if ((MaskOpcode != AArch64::PTRUE_B) ||
+ (Mask->getOperand(1).getImm() != 31))
+ return {};
+ break;
+ }
+ case AArch64::PTRUE_B:
+ // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
+ break;
+ default:
+ // Bail out if we don't recognize the input
+ return {};
}
+ return convertToFlagSettingOpc(PredOpcode);
+}
+
+/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
+/// operation which could set the flags in an identical manner
+bool AArch64InstrInfo::optimizePTestInstr(
+ MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
+ const MachineRegisterInfo *MRI) const {
+ auto *Mask = MRI->getUniqueVRegDef(MaskReg);
+ auto *Pred = MRI->getUniqueVRegDef(PredReg);
+ unsigned PredOpcode = Pred->getOpcode();
+ auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
+ if (!NewOp)
+ return false;
+
const TargetRegisterInfo *TRI = &getRegisterInfo();
// If another instruction between Pred and PTest accesses flags, don't remove
@@ -1481,9 +1497,9 @@ bool AArch64InstrInfo::optimizePTestInstr(
// as they are prior to PTEST. Sometimes this requires the tested PTEST
// operand to be replaced with an equivalent instruction that also sets the
// flags.
- Pred->setDesc(get(NewOp));
PTest->eraseFromParent();
- if (OpChanged) {
+ if (*NewOp != PredOpcode) {
+ Pred->setDesc(get(*NewOp));
bool succeeded = UpdateOperandRegClass(*Pred);
(void)succeeded;
assert(succeeded && "Operands have incompatible register classes!");
@@ -4481,7 +4497,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copy a Predicate register by ORRing with itself.
if (AArch64::PPRRegClass.contains(DestReg) &&
AArch64::PPRRegClass.contains(SrcReg)) {
- assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+ "Unexpected SVE register.");
BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
.addReg(SrcReg) // Pg
.addReg(SrcReg)
@@ -4494,8 +4511,6 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
if (DestIsPNR || SrcIsPNR) {
- assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) &&
- "Unexpected predicate-as-counter register.");
auto ToPPR = [](MCRegister R) -> MCRegister {
return (R - AArch64::PN0) + AArch64::P0;
};
@@ -4516,7 +4531,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copy a Z register by ORRing with itself.
if (AArch64::ZPRRegClass.contains(DestReg) &&
AArch64::ZPRRegClass.contains(SrcReg)) {
- assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+ "Unexpected SVE register.");
BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
.addReg(SrcReg)
.addReg(SrcReg, getKillRegState(KillSrc));
@@ -4528,7 +4544,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
(AArch64::ZPR2RegClass.contains(SrcReg) ||
AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
- assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+ "Unexpected SVE register.");
static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
Indices);
@@ -4538,7 +4555,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copy a Z register triple by copying the individual sub-registers.
if (AArch64::ZPR3RegClass.contains(DestReg) &&
AArch64::ZPR3RegClass.contains(SrcReg)) {
- assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+ "Unexpected SVE register.");
static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
AArch64::zsub2};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
@@ -4551,7 +4569,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
(AArch64::ZPR4RegClass.contains(SrcReg) ||
AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
- assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+ "Unexpected SVE register.");
static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
AArch64::zsub2, AArch64::zsub3};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
@@ -4656,7 +4675,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (AArch64::FPR128RegClass.contains(DestReg) &&
AArch64::FPR128RegClass.contains(SrcReg)) {
- if (Subtarget.hasSVEorSME() && !Subtarget.isNeonAvailable())
+ if (Subtarget.isSVEorStreamingSVEAvailable() &&
+ !Subtarget.isNeonAvailable())
BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
.addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
.addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
@@ -4814,14 +4834,12 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
Opc = AArch64::STRBui;
break;
case 2: {
- bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
if (AArch64::FPR16RegClass.hasSubClassEq(RC))
Opc = AArch64::STRHui;
- else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
- assert(Subtarget.hasSVEorSME() &&
+ else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
+ AArch64::PPRRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register store without SVE store instructions");
- assert((!IsPNR || Subtarget.hasSVE2p1() || Subtarget.hasSME2()) &&
- "Unexpected register store without SVE2p1 or SME2");
Opc = AArch64::STR_PXI;
StackID = TargetStackID::ScalableVector;
}
@@ -4870,7 +4888,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
AArch64::sube64, AArch64::subo64, FI, MMO);
return;
} else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
- assert(Subtarget.hasSVEorSME() &&
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register store without SVE store instructions");
Opc = AArch64::STR_ZXI;
StackID = TargetStackID::ScalableVector;
@@ -4894,7 +4912,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
Offset = false;
} else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
- assert(Subtarget.hasSVEorSME() &&
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register store without SVE store instructions");
Opc = AArch64::STR_ZZXI;
StackID = TargetStackID::ScalableVector;
@@ -4906,7 +4924,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
Opc = AArch64::ST1Threev2d;
Offset = false;
} else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
- assert(Subtarget.hasSVEorSME() &&
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register store without SVE store instructions");
Opc = AArch64::STR_ZZZXI;
StackID = TargetStackID::ScalableVector;
@@ -4919,7 +4937,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
Offset = false;
} else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
- assert(Subtarget.hasSVEorSME() &&
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register store without SVE store instructions");
Opc = AArch64::STR_ZZZZXI;
StackID = TargetStackID::ScalableVector;
@@ -4992,10 +5010,8 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
if (AArch64::FPR16RegClass.hasSubClassEq(RC))
Opc = AArch64::LDRHui;
else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
- assert(Subtarget.hasSVEorSME() &&
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register load without SVE load instructions");
- assert((!IsPNR || Subtarget.hasSVE2p1() || Subtarget.hasSME2()) &&
- "Unexpected register load without SVE2p1 or SME2");
if (IsPNR)
PNRReg = DestReg;
Opc = AArch64::LDR_PXI;
@@ -5046,7 +5062,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
AArch64::subo64, FI, MMO);
return;
} else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
- assert(Subtarget.hasSVEorSME() &&
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register load without SVE load instructions");
Opc = AArch64::LDR_ZXI;
StackID = TargetStackID::ScalableVector;
@@ -5070,7 +5086,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
Offset = false;
} else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
- assert(Subtarget.hasSVEorSME() &&
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register load without SVE load instructions");
Opc = AArch64::LDR_ZZXI;
StackID = TargetStackID::ScalableVector;
@@ -5082,7 +5098,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
Opc = AArch64::LD1Threev2d;
Offset = false;
} else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
- assert(Subtarget.hasSVEorSME() &&
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register load without SVE load instructions");
Opc = AArch64::LDR_ZZZXI;
StackID = TargetStackID::ScalableVector;
@@ -5095,7 +5111,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
Offset = false;
} else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
- assert(Subtarget.hasSVEorSME() &&
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register load without SVE load instructions");
Opc = AArch64::LDR_ZZZZXI;
StackID = TargetStackID::ScalableVector;
@@ -8555,6 +8571,8 @@ AArch64InstrInfo::getOutliningCandidateInfo(
NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
RepeatedSequenceLocs = CandidatesWithoutStackFixups;
FrameID = MachineOutlinerNoLRSave;
+ if (RepeatedSequenceLocs.size() < 2)
+ return std::nullopt;
} else {
SetCandidateCallInfo(MachineOutlinerDefault, 12);
@@ -8700,6 +8718,13 @@ bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
if (!AFI || AFI->hasRedZone().value_or(true))
return false;
+ // FIXME: Determine whether it is safe to outline from functions which contain
+ // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
+ // outlined together and ensure it is safe to outline with async unwind info,
+ // required for saving & restoring VG around calls.
+ if (AFI->hasStreamingModeChanges())
+ return false;
+
// FIXME: Teach the outliner to generate/handle Windows unwind info.
if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
return false;
@@ -9582,18 +9607,49 @@ AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
namespace {
class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
- MachineInstr *PredBranch;
+ MachineFunction *MF;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo &MRI;
+
+ /// The block of the loop
+ MachineBasicBlock *LoopBB;
+ /// The conditional branch of the loop
+ MachineInstr *CondBranch;
+ /// The compare instruction for loop control
+ MachineInstr *Comp;
+ /// The number of the operand of the loop counter value in Comp
+ unsigned CompCounterOprNum;
+ /// The instruction that updates the loop counter value
+ MachineInstr *Update;
+ /// The number of the operand of the loop counter value in Update
+ unsigned UpdateCounterOprNum;
+ /// The initial value of the loop counter
+ Register Init;
+ /// True iff Update is a predecessor of Comp
+ bool IsUpdatePriorComp;
+
+ /// The normalized condition used by createTripCountGreaterCondition()
SmallVector<MachineOperand, 4> Cond;
public:
- AArch64PipelinerLoopInfo(MachineInstr *PredBranch,
+ AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
+ MachineInstr *Comp, unsigned CompCounterOprNum,
+ MachineInstr *Update, unsigned UpdateCounterOprNum,
+ Register Init, bool IsUpdatePriorComp,
const SmallVectorImpl<MachineOperand> &Cond)
- : PredBranch(PredBranch), Cond(Cond.begin(), Cond.end()) {}
+ : MF(Comp->getParent()->getParent()),
+ TII(MF->getSubtarget().getInstrInfo()),
+ TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
+ LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
+ CompCounterOprNum(CompCounterOprNum), Update(Update),
+ UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
+ IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
// Make the instructions for loop control be placed in stage 0.
- // The predecessors of PredBranch are considered by the caller.
- return MI == PredBranch;
+ // The predecessors of Comp are considered by the caller.
+ return MI == Comp;
}
std::optional<bool> createTripCountGreaterCondition(
@@ -9606,31 +9662,277 @@ public:
return {};
}
+ void createRemainingIterationsGreaterCondition(
+ int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
+ DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
+
void setPreheader(MachineBasicBlock *NewPreheader) override {}
void adjustTripCount(int TripCountAdjust) override {}
void disposed() override {}
+ bool isMVEExpanderSupported() override { return true; }
};
} // namespace
-static bool isCompareAndBranch(unsigned Opcode) {
- switch (Opcode) {
- case AArch64::CBZW:
- case AArch64::CBZX:
- case AArch64::CBNZW:
- case AArch64::CBNZX:
- case AArch64::TBZW:
- case AArch64::TBZX:
- case AArch64::TBNZW:
- case AArch64::TBNZX:
- return true;
+/// Clone an instruction from MI. The register of ReplaceOprNum-th operand
+/// is replaced by ReplaceReg. The output register is newly created.
+/// The other operands are unchanged from MI.
+static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
+ Register ReplaceReg, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertTo) {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
+ const TargetRegisterInfo *TRI =
+ MBB.getParent()->getSubtarget().getRegisterInfo();
+ MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
+ Register Result = 0;
+ for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
+ if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
+ Result = MRI.createVirtualRegister(
+ MRI.getRegClass(NewMI->getOperand(0).getReg()));
+ NewMI->getOperand(I).setReg(Result);
+ } else if (I == ReplaceOprNum) {
+ MRI.constrainRegClass(
+ ReplaceReg,
+ TII->getRegClass(NewMI->getDesc(), I, TRI, *MBB.getParent()));
+ NewMI->getOperand(I).setReg(ReplaceReg);
+ }
}
- return false;
+ MBB.insert(InsertTo, NewMI);
+ return Result;
+}
+
+void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
+ int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
+ DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {
+ // Create and accumulate conditions for next TC iterations.
+ // Example:
+ // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
+ // # iteration of the kernel
+ //
+ // # insert the following instructions
+ // cond = CSINCXr 0, 0, C, implicit $nzcv
+ // counter = ADDXri counter, 1 # clone from this->Update
+ // SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
+ // cond = CSINCXr cond, cond, C, implicit $nzcv
+ // ... (repeat TC times)
+ // SUBSXri cond, 0, implicit-def $nzcv
+
+ assert(CondBranch->getOpcode() == AArch64::Bcc);
+ // CondCode to exit the loop
+ AArch64CC::CondCode CC =
+ (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
+ if (CondBranch->getOperand(1).getMBB() == LoopBB)
+ CC = AArch64CC::getInvertedCondCode(CC);
+
+ // Accumulate conditions to exit the loop
+ Register AccCond = AArch64::XZR;
+
+ // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
+ auto AccumulateCond = [&](Register CurCond,
+ AArch64CC::CondCode CC) -> Register {
+ Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
+ BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
+ .addReg(NewCond, RegState::Define)
+ .addReg(CurCond)
+ .addReg(CurCond)
+ .addImm(AArch64CC::getInvertedCondCode(CC));
+ return NewCond;
+ };
+
+ if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
+ // Update and Comp for I==0 are already exists in MBB
+ // (MBB is an unrolled kernel)
+ Register Counter;
+ for (int I = 0; I <= TC; ++I) {
+ Register NextCounter;
+ if (I != 0)
+ NextCounter =
+ cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
+
+ AccCond = AccumulateCond(AccCond, CC);
+
+ if (I != TC) {
+ if (I == 0) {
+ if (Update != Comp && IsUpdatePriorComp) {
+ Counter =
+ LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
+ NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
+ MBB.end());
+ } else {
+ // can use already calculated value
+ NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
+ }
+ } else if (Update != Comp) {
+ NextCounter =
+ cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
+ }
+ }
+ Counter = NextCounter;
+ }
+ } else {
+ Register Counter;
+ if (LastStage0Insts.empty()) {
+ // use initial counter value (testing if the trip count is sufficient to
+ // be executed by pipelined code)
+ Counter = Init;
+ if (IsUpdatePriorComp)
+ Counter =
+ cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
+ } else {
+ // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
+ Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
+ }
+
+ for (int I = 0; I <= TC; ++I) {
+ Register NextCounter;
+ NextCounter =
+ cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
+ AccCond = AccumulateCond(AccCond, CC);
+ if (I != TC && Update != Comp)
+ NextCounter =
+ cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
+ Counter = NextCounter;
+ }
+ }
+
+ // If AccCond == 0, the remainder is greater than TC.
+ BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
+ .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
+ .addReg(AccCond)
+ .addImm(0)
+ .addImm(0);
+ Cond.clear();
+ Cond.push_back(MachineOperand::CreateImm(AArch64CC::EQ));
+}
+
+static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
+ Register &RegMBB, Register &RegOther) {
+ assert(Phi.getNumOperands() == 5);
+ if (Phi.getOperand(2).getMBB() == MBB) {
+ RegMBB = Phi.getOperand(1).getReg();
+ RegOther = Phi.getOperand(3).getReg();
+ } else {
+ assert(Phi.getOperand(4).getMBB() == MBB);
+ RegMBB = Phi.getOperand(3).getReg();
+ RegOther = Phi.getOperand(1).getReg();
+ }
+}
+
+static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {
+ if (!Reg.isVirtual())
+ return false;
+ const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ return MRI.getVRegDef(Reg)->getParent() != BB;
+}
+
+/// If Reg is an induction variable, return true and set some parameters
+static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
+ MachineInstr *&UpdateInst,
+ unsigned &UpdateCounterOprNum, Register &InitReg,
+ bool &IsUpdatePriorComp) {
+ // Example:
+ //
+ // Preheader:
+ // InitReg = ...
+ // LoopBB:
+ // Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
+ // Reg = COPY Reg0 ; COPY is ignored.
+ // Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
+ // ; Reg is the value calculated in the previous
+ // ; iteration, so IsUpdatePriorComp == false.
+
+ if (LoopBB->pred_size() != 2)
+ return false;
+ if (!Reg.isVirtual())
+ return false;
+ const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
+ UpdateInst = nullptr;
+ UpdateCounterOprNum = 0;
+ InitReg = 0;
+ IsUpdatePriorComp = true;
+ Register CurReg = Reg;
+ while (true) {
+ MachineInstr *Def = MRI.getVRegDef(CurReg);
+ if (Def->getParent() != LoopBB)
+ return false;
+ if (Def->isCopy()) {
+ // Ignore copy instructions unless they contain subregisters
+ if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
+ return false;
+ CurReg = Def->getOperand(1).getReg();
+ } else if (Def->isPHI()) {
+ if (InitReg != 0)
+ return false;
+ if (!UpdateInst)
+ IsUpdatePriorComp = false;
+ extractPhiReg(*Def, LoopBB, CurReg, InitReg);
+ } else {
+ if (UpdateInst)
+ return false;
+ switch (Def->getOpcode()) {
+ case AArch64::ADDSXri:
+ case AArch64::ADDSWri:
+ case AArch64::SUBSXri:
+ case AArch64::SUBSWri:
+ case AArch64::ADDXri:
+ case AArch64::ADDWri:
+ case AArch64::SUBXri:
+ case AArch64::SUBWri:
+ UpdateInst = Def;
+ UpdateCounterOprNum = 1;
+ break;
+ case AArch64::ADDSXrr:
+ case AArch64::ADDSWrr:
+ case AArch64::SUBSXrr:
+ case AArch64::SUBSWrr:
+ case AArch64::ADDXrr:
+ case AArch64::ADDWrr:
+ case AArch64::SUBXrr:
+ case AArch64::SUBWrr:
+ UpdateInst = Def;
+ if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
+ UpdateCounterOprNum = 1;
+ else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
+ UpdateCounterOprNum = 2;
+ else
+ return false;
+ break;
+ default:
+ return false;
+ }
+ CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
+ }
+
+ if (!CurReg.isVirtual())
+ return false;
+ if (Reg == CurReg)
+ break;
+ }
+
+ if (!UpdateInst)
+ return false;
+
+ return true;
}
std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
+ // Accept loops that meet the following conditions
+ // * The conditional branch is BCC
+ // * The compare instruction is ADDS/SUBS/WHILEXX
+ // * One operand of the compare is an induction variable and the other is a
+ // loop invariant value
+ // * The induction variable is incremented/decremented by a single instruction
+ // * Does not contain CALL or instructions which have unmodeled side effects
+
+ for (MachineInstr &MI : *LoopBB)
+ if (MI.isCall() || MI.hasUnmodeledSideEffects())
+ // This instruction may use NZCV, which interferes with the instruction to
+ // be inserted for loop control.
+ return nullptr;
+
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
@@ -9641,48 +9943,76 @@ AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
return nullptr;
// Must be conditional branch
- if (FBB == nullptr)
+ if (TBB != LoopBB && FBB == nullptr)
return nullptr;
assert((TBB == LoopBB || FBB == LoopBB) &&
"The Loop must be a single-basic-block loop");
+ MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
+ const TargetRegisterInfo &TRI = getRegisterInfo();
+
+ if (CondBranch->getOpcode() != AArch64::Bcc)
+ return nullptr;
+
// Normalization for createTripCountGreaterCondition()
if (TBB == LoopBB)
reverseBranchCondition(Cond);
- MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
- const TargetRegisterInfo &TRI = getRegisterInfo();
-
- // Find the immediate predecessor of the conditional branch
- MachineInstr *PredBranch = nullptr;
- if (CondBranch->getOpcode() == AArch64::Bcc) {
- for (MachineInstr &MI : reverse(*LoopBB)) {
- if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
- PredBranch = &MI;
+ MachineInstr *Comp = nullptr;
+ unsigned CompCounterOprNum = 0;
+ for (MachineInstr &MI : reverse(*LoopBB)) {
+ if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
+ // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
+ // operands is a loop invariant value
+
+ switch (MI.getOpcode()) {
+ case AArch64::SUBSXri:
+ case AArch64::SUBSWri:
+ case AArch64::ADDSXri:
+ case AArch64::ADDSWri:
+ Comp = &MI;
+ CompCounterOprNum = 1;
break;
+ case AArch64::ADDSWrr:
+ case AArch64::ADDSXrr:
+ case AArch64::SUBSWrr:
+ case AArch64::SUBSXrr:
+ Comp = &MI;
+ break;
+ default:
+ if (isWhileOpcode(MI.getOpcode())) {
+ Comp = &MI;
+ break;
+ }
+ return nullptr;
}
- }
- if (!PredBranch)
- return nullptr;
- } else if (isCompareAndBranch(CondBranch->getOpcode())) {
- const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
- Register Reg = CondBranch->getOperand(0).getReg();
- if (!Reg.isVirtual())
- return nullptr;
- PredBranch = MRI.getVRegDef(Reg);
- // MachinePipeliner does not expect that the immediate predecessor is a Phi
- if (PredBranch->isPHI())
- return nullptr;
+ if (CompCounterOprNum == 0) {
+ if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
+ CompCounterOprNum = 2;
+ else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
+ CompCounterOprNum = 1;
+ else
+ return nullptr;
+ }
+ break;
+ }
+ }
+ if (!Comp)
+ return nullptr;
- if (PredBranch->getParent() != LoopBB)
- return nullptr;
- } else {
+ MachineInstr *Update = nullptr;
+ Register Init;
+ bool IsUpdatePriorComp;
+ unsigned UpdateCounterOprNum;
+ if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
+ Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
return nullptr;
- }
- return std::make_unique<AArch64PipelinerLoopInfo>(PredBranch, Cond);
+ return std::make_unique<AArch64PipelinerLoopInfo>(
+ LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
+ Init, IsUpdatePriorComp, Cond);
}
#define GET_INSTRINFO_HELPERS
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index f434799c3982..792e0c3063b1 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -572,6 +572,9 @@ private:
bool optimizePTestInstr(MachineInstr *PTest, unsigned MaskReg,
unsigned PredReg,
const MachineRegisterInfo *MRI) const;
+ std::optional<unsigned>
+ canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
+ MachineInstr *Pred, const MachineRegisterInfo *MRI) const;
};
struct UsedNZCV {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 91e5bc3caa10..f3aac3b46d17 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -107,7 +107,7 @@ def HasRCPC_IMMO : Predicate<"Subtarget->hasRCPC_IMMO()">,
def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
AssemblerPredicateWithAll<(all_of FeatureFPARMv8), "fp-armv8">;
-def HasNEON : Predicate<"Subtarget->hasNEON()">,
+def HasNEON : Predicate<"Subtarget->isNeonAvailable()">,
AssemblerPredicateWithAll<(all_of FeatureNEON), "neon">;
def HasSM4 : Predicate<"Subtarget->hasSM4()">,
AssemblerPredicateWithAll<(all_of FeatureSM4), "sm4">;
@@ -141,35 +141,41 @@ def HasSPE : Predicate<"Subtarget->hasSPE()">,
def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">,
AssemblerPredicateWithAll<(all_of FeatureFuseAES),
"fuse-aes">;
-def HasSVE : Predicate<"Subtarget->hasSVE()">,
+def HasSVE : Predicate<"Subtarget->isSVEAvailable()">,
AssemblerPredicateWithAll<(all_of FeatureSVE), "sve">;
-def HasSVE2 : Predicate<"Subtarget->hasSVE2()">,
+def HasSVE2 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2()">,
AssemblerPredicateWithAll<(all_of FeatureSVE2), "sve2">;
-def HasSVE2p1 : Predicate<"Subtarget->hasSVE2p1()">,
+def HasSVE2p1 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2p1()">,
AssemblerPredicateWithAll<(all_of FeatureSVE2p1), "sve2p1">;
-def HasSVE2AES : Predicate<"Subtarget->hasSVE2AES()">,
+def HasSVE2AES : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2AES()">,
AssemblerPredicateWithAll<(all_of FeatureSVE2AES), "sve2-aes">;
-def HasSVE2SM4 : Predicate<"Subtarget->hasSVE2SM4()">,
+def HasSVE2SM4 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2SM4()">,
AssemblerPredicateWithAll<(all_of FeatureSVE2SM4), "sve2-sm4">;
-def HasSVE2SHA3 : Predicate<"Subtarget->hasSVE2SHA3()">,
+def HasSVE2SHA3 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2SHA3()">,
AssemblerPredicateWithAll<(all_of FeatureSVE2SHA3), "sve2-sha3">;
-def HasSVE2BitPerm : Predicate<"Subtarget->hasSVE2BitPerm()">,
+def HasSVE2BitPerm : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2BitPerm()">,
AssemblerPredicateWithAll<(all_of FeatureSVE2BitPerm), "sve2-bitperm">;
def HasB16B16 : Predicate<"Subtarget->hasB16B16()">,
AssemblerPredicateWithAll<(all_of FeatureB16B16), "b16b16">;
-def HasSME : Predicate<"Subtarget->hasSME()">,
+def HasSMEandIsNonStreamingSafe
+ : Predicate<"Subtarget->hasSME()">,
AssemblerPredicateWithAll<(all_of FeatureSME), "sme">;
-def HasSMEF64F64 : Predicate<"Subtarget->hasSMEF64F64()">,
+def HasSME : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME()">,
+ AssemblerPredicateWithAll<(all_of FeatureSME), "sme">;
+def HasSMEF64F64 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF64F64()">,
AssemblerPredicateWithAll<(all_of FeatureSMEF64F64), "sme-f64f64">;
-def HasSMEF16F16 : Predicate<"Subtarget->hasSMEF16F16()">,
+def HasSMEF16F16 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF16F16()">,
AssemblerPredicateWithAll<(all_of FeatureSMEF16F16), "sme-f16f16">;
-def HasSMEFA64 : Predicate<"Subtarget->hasSMEFA64()">,
+def HasSMEFA64 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEFA64()">,
AssemblerPredicateWithAll<(all_of FeatureSMEFA64), "sme-fa64">;
-def HasSMEI16I64 : Predicate<"Subtarget->hasSMEI16I64()">,
+def HasSMEI16I64 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEI16I64()">,
AssemblerPredicateWithAll<(all_of FeatureSMEI16I64), "sme-i16i64">;
-def HasSME2 : Predicate<"Subtarget->hasSME2()">,
+def HasSME2andIsNonStreamingSafe
+ : Predicate<"Subtarget->hasSME2()">,
+ AssemblerPredicateWithAll<(all_of FeatureSME2), "sme2">;
+def HasSME2 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME2()">,
AssemblerPredicateWithAll<(all_of FeatureSME2), "sme2">;
-def HasSME2p1 : Predicate<"Subtarget->hasSME2p1()">,
+def HasSME2p1 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME2p1()">,
AssemblerPredicateWithAll<(all_of FeatureSME2p1), "sme2p1">;
def HasFP8 : Predicate<"Subtarget->hasFP8()">,
AssemblerPredicateWithAll<(all_of FeatureFP8), "fp8">;
@@ -198,48 +204,47 @@ def HasSSVE_FP8DOT4 : Predicate<"Subtarget->hasSSVE_FP8DOT4() || "
"ssve-fp8dot4 or (sve2 and fp8dot4)">;
def HasLUT : Predicate<"Subtarget->hasLUT()">,
AssemblerPredicateWithAll<(all_of FeatureLUT), "lut">;
-def HasSME_LUTv2 : Predicate<"Subtarget->hasSME_LUTv2()">,
+def HasSME_LUTv2 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME_LUTv2()">,
AssemblerPredicateWithAll<(all_of FeatureSME_LUTv2), "sme-lutv2">;
-def HasSMEF8F16 : Predicate<"Subtarget->hasSMEF8F16()">,
+def HasSMEF8F16 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8F16()">,
AssemblerPredicateWithAll<(all_of FeatureSMEF8F16), "sme-f8f16">;
-def HasSMEF8F32 : Predicate<"Subtarget->hasSMEF8F32()">,
+def HasSMEF8F32 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8F32()">,
AssemblerPredicateWithAll<(all_of FeatureSMEF8F32), "sme-f8f32">;
// A subset of SVE(2) instructions are legal in Streaming SVE execution mode,
// they should be enabled if either has been specified.
def HasSVEorSME
- : Predicate<"Subtarget->hasSVEorSME()">,
+ : Predicate<"Subtarget->hasSVE() || (Subtarget->isStreaming() && Subtarget->hasSME())">,
AssemblerPredicateWithAll<(any_of FeatureSVE, FeatureSME),
"sve or sme">;
def HasSVE2orSME
- : Predicate<"Subtarget->hasSVE2() || Subtarget->hasSME()">,
+ : Predicate<"Subtarget->hasSVE2() || (Subtarget->isStreaming() && Subtarget->hasSME())">,
AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSME),
"sve2 or sme">;
def HasSVE2orSME2
- : Predicate<"Subtarget->hasSVE2() || Subtarget->hasSME2()">,
+ : Predicate<"Subtarget->hasSVE2() || (Subtarget->isStreaming() && Subtarget->hasSME2())">,
AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSME2),
"sve2 or sme2">;
def HasSVE2p1_or_HasSME
- : Predicate<"Subtarget->hasSVE2p1() || Subtarget->hasSME()">,
+ : Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME())">,
AssemblerPredicateWithAll<(any_of FeatureSME, FeatureSVE2p1), "sme or sve2p1">;
def HasSVE2p1_or_HasSME2
- : Predicate<"Subtarget->hasSVE2p1() || Subtarget->hasSME2()">,
+ : Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME2())">,
AssemblerPredicateWithAll<(any_of FeatureSME2, FeatureSVE2p1), "sme2 or sve2p1">;
def HasSVE2p1_or_HasSME2p1
- : Predicate<"Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()">,
+ : Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME2p1())">,
AssemblerPredicateWithAll<(any_of FeatureSME2p1, FeatureSVE2p1), "sme2p1 or sve2p1">;
def HasSMEF16F16orSMEF8F16
- : Predicate<"Subtarget->hasSMEF16F16() || Subtarget->hasSMEF8F16()">,
+ : Predicate<"Subtarget->isStreaming() && (Subtarget->hasSMEF16F16() || Subtarget->hasSMEF8F16())">,
AssemblerPredicateWithAll<(any_of FeatureSMEF16F16, FeatureSMEF8F16),
"sme-f16f16 or sme-f8f16">;
// A subset of NEON instructions are legal in Streaming SVE execution mode,
-// they should be enabled if either has been specified.
-def HasNEONorSME
- : Predicate<"Subtarget->hasNEON() || Subtarget->hasSME()">,
- AssemblerPredicateWithAll<(any_of FeatureNEON, FeatureSME),
- "neon or sme">;
+// so don't need the additional check for 'isNeonAvailable'.
+def HasNEONandIsStreamingSafe
+ : Predicate<"Subtarget->hasNEON()">,
+ AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">;
def HasRCPC : Predicate<"Subtarget->hasRCPC()">,
AssemblerPredicateWithAll<(all_of FeatureRCPC), "rcpc">;
def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">,
@@ -323,8 +328,6 @@ def NoUseScalarIncVL : Predicate<"!Subtarget->useScalarIncVL()">;
def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">;
-def IsNeonAvailable : Predicate<"Subtarget->isNeonAvailable()">;
-
def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
SDTCisInt<1>]>>;
@@ -1350,7 +1353,7 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot
VectorIndexS:$idx)>;
}
-let Predicates = [HasNEONorSME, HasBF16] in {
+let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in {
def BFCVT : BF16ToSinglePrecision<"bfcvt">;
// Round FP32 to BF16.
def : Pat<(bf16 (any_fpround (f32 FPR32:$Rn))), (BFCVT $Rn)>;
@@ -5789,9 +5792,9 @@ defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt",
defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
-defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONorSME>;
-defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONorSME>;
-defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONorSME>;
+defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONandIsStreamingSafe>;
+defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONandIsStreamingSafe>;
+defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONandIsStreamingSafe>;
defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>;
defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
@@ -5820,7 +5823,7 @@ let Predicates = [HasRDM] in {
defm : FMULScalarFromIndexedLane0Patterns<"FMULX", "16", "32", "64",
int_aarch64_neon_fmulx,
- [HasNEONorSME]>;
+ [HasNEONandIsStreamingSafe]>;
let Predicates = [HasNEON] in {
def : InstAlias<"cmls $dst, $src1, $src2",
@@ -5894,9 +5897,9 @@ defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">;
def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">;
defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">;
-defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe", HasNEONorSME>;
-defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx", HasNEONorSME>;
-defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte", HasNEONorSME>;
+defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe">;
+defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">;
+defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">;
defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg",
UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>;
@@ -5915,7 +5918,7 @@ def : Pat<(v1i64 (AArch64vashr (v1i64 V64:$Rn), (i32 63))),
(CMLTv1i64rz V64:$Rn)>;
// Round FP64 to BF16.
-let Predicates = [HasNEONorSME, HasBF16] in
+let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in
def : Pat<(bf16 (any_fpround (f64 FPR64:$Rn))),
(BFCVT (FCVTXNv1i64 $Rn))>;
@@ -6016,7 +6019,7 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
// Some float -> int -> float conversion patterns for which we want to keep the
// int values in FP registers using the corresponding NEON instructions to
// avoid more costly int <-> fp register transfers.
-let Predicates = [HasNEON] in {
+let Predicates = [HasNEONandIsStreamingSafe] in {
def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))),
(SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>;
def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))),
@@ -6026,7 +6029,7 @@ def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint f64:$Rn)))),
def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint f32:$Rn)))),
(UCVTFv1i32 (i32 (FCVTZUv1i32 f32:$Rn)))>;
-let Predicates = [HasFullFP16] in {
+let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))),
(SCVTFv1i16 (f16 (FCVTZSv1f16 f16:$Rn)))>;
def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
@@ -6118,7 +6121,7 @@ def : Pat <(f64 (uint_to_fp (i32
(LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>;
// 64-bits -> double are handled in target specific dag combine:
// performIntToFpCombine.
-} // let Predicates = [HasNEON]
+} // let Predicates = [HasNEONandIsStreamingSafe]
//===----------------------------------------------------------------------===//
// Advanced SIMD three different-sized vector instructions.
@@ -8379,7 +8382,7 @@ def : Ld1Lane64IdxOpPat<extloadi8, VectorIndexH, v4i16, i32, LD1i8, VectorIndexH
// Same as above, but the first element is populated using
// scalar_to_vector + insert_subvector instead of insert_vector_elt.
-let Predicates = [IsNeonAvailable] in {
+let Predicates = [HasNEON] in {
class Ld1Lane128FirstElm<ValueType ResultTy, ValueType VecTy,
SDPatternOperator ExtLoad, Instruction LD1>
: Pat<(ResultTy (scalar_to_vector (i32 (ExtLoad GPR64sp:$Rn)))),
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index c3d64f5a0a96..957d7bc79b18 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -196,12 +196,14 @@ bool AArch64FunctionInfo::needsAsyncDwarfUnwindInfo(
const MachineFunction &MF) const {
if (!NeedsAsyncDwarfUnwindInfo) {
const Function &F = MF.getFunction();
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
// The check got "minsize" is because epilogue unwind info is not emitted
// (yet) for homogeneous epilogues, outlined functions, and functions
// outlined from.
- NeedsAsyncDwarfUnwindInfo = needsDwarfUnwindInfo(MF) &&
- F.getUWTableKind() == UWTableKind::Async &&
- !F.hasMinSize();
+ NeedsAsyncDwarfUnwindInfo =
+ needsDwarfUnwindInfo(MF) &&
+ ((F.getUWTableKind() == UWTableKind::Async && !F.hasMinSize()) ||
+ AFI->hasStreamingModeChanges());
}
return *NeedsAsyncDwarfUnwindInfo;
}
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index df09fc5592ed..001521d1101e 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
+#include "AArch64Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
@@ -35,6 +36,11 @@ struct AArch64FunctionInfo;
class AArch64Subtarget;
class MachineInstr;
+struct TPIDR2Object {
+ int FrameIndex = std::numeric_limits<int>::max();
+ unsigned Uses = 0;
+};
+
/// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and
/// contains private AArch64-specific information for each MachineFunction.
class AArch64FunctionInfo final : public MachineFunctionInfo {
@@ -195,7 +201,7 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
bool IsSVECC = false;
/// The frame-index for the TPIDR2 object used for lazy saves.
- Register LazySaveTPIDR2Obj = 0;
+ TPIDR2Object TPIDR2;
/// Whether this function changes streaming mode within the function.
bool HasStreamingModeChanges = false;
@@ -216,6 +222,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
// The PTRUE is used for the LD/ST of ZReg pairs in save and restore.
unsigned PredicateRegForFillSpill = 0;
+ // The stack slots where VG values are stored to.
+ int64_t VGIdx = std::numeric_limits<int>::max();
+ int64_t StreamingVGIdx = std::numeric_limits<int>::max();
+
public:
AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI);
@@ -234,11 +244,16 @@ public:
Register getPStateSMReg() const { return PStateSMReg; };
void setPStateSMReg(Register Reg) { PStateSMReg = Reg; };
+ int64_t getVGIdx() const { return VGIdx; };
+ void setVGIdx(unsigned Idx) { VGIdx = Idx; };
+
+ int64_t getStreamingVGIdx() const { return StreamingVGIdx; };
+ void setStreamingVGIdx(unsigned FrameIdx) { StreamingVGIdx = FrameIdx; };
+
bool isSVECC() const { return IsSVECC; };
void setIsSVECC(bool s) { IsSVECC = s; };
- unsigned getLazySaveTPIDR2Obj() const { return LazySaveTPIDR2Obj; }
- void setLazySaveTPIDR2Obj(unsigned Reg) { LazySaveTPIDR2Obj = Reg; }
+ TPIDR2Object &getTPIDR2Obj() { return TPIDR2; }
void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI);
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index a759efcd9441..53b46ff42b72 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -189,6 +189,17 @@ def TuneA720AE : SubtargetFeature<"a720ae", "ARMProcFamily", "CortexA720",
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
+def TuneA725 : SubtargetFeature<"cortex-a725", "ARMProcFamily",
+ "CortexA725",
+ "Cortex-A725 ARM processors", [
+ FeatureFuseAES,
+ FeaturePostRAScheduler,
+ FeatureCmpBccFusion,
+ FeatureALULSLFast,
+ FeatureFuseAdrpAdd,
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
+
def TuneR82 : SubtargetFeature<"cortex-r82", "ARMProcFamily",
"CortexR82",
"Cortex-R82 ARM processors", [
@@ -238,6 +249,15 @@ def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4",
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
+def TuneX925 : SubtargetFeature<"cortex-x925", "ARMProcFamily",
+ "CortexX925", "Cortex-X925 ARM processors",[
+ FeatureALULSLFast,
+ FeatureFuseAdrpAdd,
+ FeatureFuseAES,
+ FeaturePostRAScheduler,
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
+
def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX",
"Fujitsu A64FX processors", [
FeaturePostRAScheduler,
@@ -378,6 +398,22 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
FeatureZCRegMove,
FeatureZCZeroing]>;
+def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
+ "Apple M4", [
+ FeatureAlternateSExtLoadCVTF32Pattern,
+ FeatureArithmeticBccFusion,
+ FeatureArithmeticCbzFusion,
+ FeatureDisableLatencySchedHeuristic,
+ FeatureFuseAddress,
+ FeatureFuseAES,
+ FeatureFuseArithmeticLogic,
+ FeatureFuseCCSelect,
+ FeatureFuseCryptoEOR,
+ FeatureFuseLiterals,
+ FeatureZCRegMove,
+ FeatureZCZeroing
+ ]>;
+
def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
"Samsung Exynos-M3 processors",
[FeatureExynosCheapAsMoveHandling,
@@ -620,7 +656,8 @@ def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B",
def TuneOryon : SubtargetFeature<"oryon-1", "ARMProcFamily",
"Oryon",
"Nuvia Inc Oryon processors", [
- FeatureCrypto,
+ FeatureSHA2,
+ FeatureAES,
FeatureFPARMv8,
FeatureNEON,
FeatureFuseAES,
@@ -640,187 +677,240 @@ def TuneOryon : SubtargetFeature<"oryon-1", "ARMProcFamily",
HasV8_6aOps]>;
def ProcessorFeatures {
- list<SubtargetFeature> A53 = [HasV8_0aOps, FeatureCRC, FeatureCrypto,
+ list<SubtargetFeature> A53 = [HasV8_0aOps, FeatureCRC, FeatureSHA2, FeatureAES,
FeatureFPARMv8, FeatureNEON, FeaturePerfMon];
- list<SubtargetFeature> A55 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
+ list<SubtargetFeature> A55 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON, FeatureFullFP16, FeatureDotProd,
FeatureRCPC, FeaturePerfMon];
list<SubtargetFeature> A510 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon,
FeatureMatMulInt8, FeatureBF16, FeatureAM,
FeatureMTE, FeatureETE, FeatureSVE2BitPerm,
- FeatureFP16FML];
+ FeatureFP16FML,
+ FeatureSB, FeaturePAuth, FeatureSSBS, FeatureSVE, FeatureSVE2];
list<SubtargetFeature> A520 = [HasV9_2aOps, FeaturePerfMon, FeatureAM,
FeatureMTE, FeatureETE, FeatureSVE2BitPerm,
- FeatureFP16FML];
+ FeatureFP16FML,
+ FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes,
+ FeatureSVE, FeatureSVE2];
list<SubtargetFeature> A520AE = [HasV9_2aOps, FeaturePerfMon, FeatureAM,
FeatureMTE, FeatureETE, FeatureSVE2BitPerm,
- FeatureFP16FML];
- list<SubtargetFeature> A65 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
+ FeatureFP16FML,
+ FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes,
+ FeatureSVE, FeatureSVE2];
+ list<SubtargetFeature> A65 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON, FeatureFullFP16, FeatureDotProd,
FeatureRCPC, FeatureSSBS, FeatureRAS,
FeaturePerfMon];
- list<SubtargetFeature> A76 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
+ list<SubtargetFeature> A76 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON, FeatureFullFP16, FeatureDotProd,
FeatureRCPC, FeatureSSBS, FeaturePerfMon];
- list<SubtargetFeature> A77 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
+ list<SubtargetFeature> A77 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON, FeatureFullFP16, FeatureDotProd,
FeatureRCPC, FeaturePerfMon, FeatureSSBS];
- list<SubtargetFeature> A78 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
+ list<SubtargetFeature> A78 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON, FeatureFullFP16, FeatureDotProd,
FeatureRCPC, FeaturePerfMon, FeatureSPE,
FeatureSSBS];
- list<SubtargetFeature> A78AE = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
+ list<SubtargetFeature> A78AE = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON, FeatureFullFP16, FeatureDotProd,
FeatureRCPC, FeaturePerfMon, FeatureSPE,
FeatureSSBS];
- list<SubtargetFeature> A78C = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
+ list<SubtargetFeature> A78C = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON, FeatureFullFP16, FeatureDotProd,
FeatureFlagM, FeaturePAuth,
FeaturePerfMon, FeatureRCPC, FeatureSPE,
FeatureSSBS];
list<SubtargetFeature> A710 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon,
FeatureETE, FeatureMTE, FeatureFP16FML,
- FeatureSVE2BitPerm, FeatureBF16, FeatureMatMulInt8];
+ FeatureSVE2BitPerm, FeatureBF16, FeatureMatMulInt8,
+ FeaturePAuth, FeatureFlagM, FeatureSB, FeatureSVE, FeatureSVE2];
list<SubtargetFeature> A715 = [HasV9_0aOps, FeatureNEON, FeatureMTE,
FeatureFP16FML, FeatureSVE, FeatureTRBE,
FeatureSVE2BitPerm, FeatureBF16, FeatureETE,
- FeaturePerfMon, FeatureMatMulInt8, FeatureSPE];
+ FeaturePerfMon, FeatureMatMulInt8, FeatureSPE,
+ FeatureSB, FeatureSSBS, FeatureFullFP16, FeaturePAuth, FeaturePredRes, FeatureFlagM,
+ FeatureSVE2];
list<SubtargetFeature> A720 = [HasV9_2aOps, FeatureMTE, FeatureFP16FML,
FeatureTRBE, FeatureSVE2BitPerm, FeatureETE,
- FeaturePerfMon, FeatureSPE, FeatureSPE_EEF];
+ FeaturePerfMon, FeatureSPE, FeatureSPE_EEF,
+ FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes,
+ FeatureSVE, FeatureSVE2];
list<SubtargetFeature> A720AE = [HasV9_2aOps, FeatureMTE, FeatureFP16FML,
FeatureTRBE, FeatureSVE2BitPerm, FeatureETE,
- FeaturePerfMon, FeatureSPE, FeatureSPE_EEF];
+ FeaturePerfMon, FeatureSPE, FeatureSPE_EEF,
+ FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes,
+ FeatureSVE, FeatureSVE2];
+ list<SubtargetFeature> A725 = [HasV9_2aOps, FeatureMTE, FeatureFP16FML,
+ FeatureETE, FeaturePerfMon, FeatureSPE,
+ FeatureSVE2BitPerm, FeatureSPE_EEF, FeatureTRBE,
+ FeatureFlagM, FeaturePredRes, FeatureSB, FeatureSSBS,
+ FeatureSVE, FeatureSVE2];
list<SubtargetFeature> R82 = [HasV8_0rOps, FeaturePerfMon, FeatureFullFP16,
FeatureFP16FML, FeatureSSBS, FeaturePredRes,
FeatureSB, FeatureRDM, FeatureDotProd,
FeatureComplxNum, FeatureJS,
- FeatureCacheDeepPersist];
+ FeatureCacheDeepPersist,
+ FeatureLSE, FeatureFlagM];
list<SubtargetFeature> R82AE = [HasV8_0rOps, FeaturePerfMon, FeatureFullFP16,
FeatureFP16FML, FeatureSSBS, FeaturePredRes,
FeatureSB, FeatureRDM, FeatureDotProd,
FeatureComplxNum, FeatureJS,
- FeatureCacheDeepPersist];
- list<SubtargetFeature> X1 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
+ FeatureCacheDeepPersist,
+ FeatureLSE, FeatureFlagM];
+ list<SubtargetFeature> X1 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON, FeatureRCPC, FeaturePerfMon,
FeatureSPE, FeatureFullFP16, FeatureDotProd,
FeatureSSBS];
- list<SubtargetFeature> X1C = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
+ list<SubtargetFeature> X1C = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON, FeatureRCPC_IMMO, FeaturePerfMon,
FeatureSPE, FeatureFullFP16, FeatureDotProd,
FeaturePAuth, FeatureSSBS, FeatureFlagM,
- FeatureLSE2];
+ FeatureLSE2,
+ FeatureRCPC];
list<SubtargetFeature> X2 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon,
FeatureMatMulInt8, FeatureBF16, FeatureAM,
FeatureMTE, FeatureETE, FeatureSVE2BitPerm,
- FeatureFP16FML];
+ FeatureFP16FML,
+ FeaturePAuth, FeatureSSBS, FeatureSB, FeatureSVE, FeatureSVE2, FeatureFlagM];
list<SubtargetFeature> X3 = [HasV9_0aOps, FeatureSVE, FeatureNEON,
FeaturePerfMon, FeatureETE, FeatureTRBE,
FeatureSPE, FeatureBF16, FeatureMatMulInt8,
FeatureMTE, FeatureSVE2BitPerm, FeatureFullFP16,
- FeatureFP16FML];
+ FeatureFP16FML,
+ FeatureSB, FeaturePAuth, FeaturePredRes, FeatureFlagM, FeatureSSBS,
+ FeatureSVE2];
list<SubtargetFeature> X4 = [HasV9_2aOps,
FeaturePerfMon, FeatureETE, FeatureTRBE,
FeatureSPE, FeatureMTE, FeatureSVE2BitPerm,
- FeatureFP16FML, FeatureSPE_EEF];
+ FeatureFP16FML, FeatureSPE_EEF,
+ FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes,
+ FeatureSVE, FeatureSVE2];
+ list<SubtargetFeature> X925 = [HasV9_2aOps, FeatureMTE, FeatureFP16FML,
+ FeatureETE, FeaturePerfMon, FeatureSPE,
+ FeatureSVE2BitPerm, FeatureSPE_EEF, FeatureTRBE,
+ FeatureFlagM, FeaturePredRes, FeatureSB, FeatureSSBS,
+ FeatureSVE, FeatureSVE2];
list<SubtargetFeature> A64FX = [HasV8_2aOps, FeatureFPARMv8, FeatureNEON,
FeatureSHA2, FeaturePerfMon, FeatureFullFP16,
- FeatureSVE, FeatureComplxNum];
- list<SubtargetFeature> Carmel = [HasV8_2aOps, FeatureNEON, FeatureCrypto,
+ FeatureSVE, FeatureComplxNum,
+ FeatureAES];
+ list<SubtargetFeature> Carmel = [HasV8_2aOps, FeatureNEON, FeatureSHA2, FeatureAES,
FeatureFullFP16];
- list<SubtargetFeature> AppleA7 = [HasV8_0aOps, FeatureCrypto, FeatureFPARMv8,
+ list<SubtargetFeature> AppleA7 = [HasV8_0aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON,FeaturePerfMon, FeatureAppleA7SysReg];
- list<SubtargetFeature> AppleA10 = [HasV8_0aOps, FeatureCrypto, FeatureFPARMv8,
+ list<SubtargetFeature> AppleA10 = [HasV8_0aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON, FeaturePerfMon, FeatureCRC,
FeatureRDM, FeaturePAN, FeatureLOR, FeatureVH];
- list<SubtargetFeature> AppleA11 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
+ list<SubtargetFeature> AppleA11 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON, FeaturePerfMon, FeatureFullFP16];
- list<SubtargetFeature> AppleA12 = [HasV8_3aOps, FeatureCrypto, FeatureFPARMv8,
+ list<SubtargetFeature> AppleA12 = [HasV8_3aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON, FeaturePerfMon, FeatureFullFP16];
- list<SubtargetFeature> AppleA13 = [HasV8_4aOps, FeatureCrypto, FeatureFPARMv8,
+ list<SubtargetFeature> AppleA13 = [HasV8_4aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON, FeaturePerfMon, FeatureFullFP16,
FeatureFP16FML, FeatureSHA3];
- list<SubtargetFeature> AppleA14 = [HasV8_4aOps, FeatureCrypto, FeatureFPARMv8,
- FeatureNEON, FeaturePerfMon, FeatureFRInt3264,
- FeatureSpecRestrict, FeatureSSBS, FeatureSB,
- FeaturePredRes, FeatureCacheDeepPersist,
+ list<SubtargetFeature> AppleA14 = [HasV8_4aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
+ FeatureNEON, FeaturePerfMon,
FeatureFullFP16, FeatureFP16FML, FeatureSHA3,
- FeatureAltFPCmp];
- list<SubtargetFeature> AppleA15 = [HasV8_6aOps, FeatureCrypto, FeatureFPARMv8,
+ // ArmV8.5-a extensions, excluding BTI:
+ FeatureAltFPCmp, FeatureFRInt3264,
+ FeatureSpecRestrict, FeatureSSBS, FeatureSB,
+ FeaturePredRes, FeatureCacheDeepPersist];
+ list<SubtargetFeature> AppleA15 = [HasV8_6aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON, FeaturePerfMon, FeatureSHA3,
FeatureFullFP16, FeatureFP16FML];
- list<SubtargetFeature> AppleA16 = [HasV8_6aOps, FeatureCrypto, FeatureFPARMv8,
+ list<SubtargetFeature> AppleA16 = [HasV8_6aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON, FeaturePerfMon, FeatureSHA3,
FeatureFullFP16, FeatureFP16FML,
FeatureHCX];
- list<SubtargetFeature> AppleA17 = [HasV8_6aOps, FeatureCrypto, FeatureFPARMv8,
+ list<SubtargetFeature> AppleA17 = [HasV8_6aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON, FeaturePerfMon, FeatureSHA3,
FeatureFullFP16, FeatureFP16FML,
FeatureHCX];
- list<SubtargetFeature> ExynosM3 = [HasV8_0aOps, FeatureCRC, FeatureCrypto,
+ // Technically apple-m4 is ARMv9.2a, but a quirk of LLVM defines v9.0 as
+ // requiring SVE, which is optional according to the Arm ARM and not
+ // supported by the core. ARMv8.7a is the next closest choice.
+ list<SubtargetFeature> AppleM4 = [HasV8_7aOps, FeatureSHA2, FeatureFPARMv8,
+ FeatureNEON, FeaturePerfMon, FeatureSHA3,
+ FeatureFullFP16, FeatureFP16FML,
+ FeatureAES, FeatureBF16,
+ FeatureSME, FeatureSME2,
+ FeatureSMEF64F64, FeatureSMEI16I64];
+ list<SubtargetFeature> ExynosM3 = [HasV8_0aOps, FeatureCRC, FeatureSHA2, FeatureAES,
FeaturePerfMon];
- list<SubtargetFeature> ExynosM4 = [HasV8_2aOps, FeatureCrypto, FeatureDotProd,
+ list<SubtargetFeature> ExynosM4 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureDotProd,
FeatureFullFP16, FeaturePerfMon];
- list<SubtargetFeature> Falkor = [HasV8_0aOps, FeatureCRC, FeatureCrypto,
+ list<SubtargetFeature> Falkor = [HasV8_0aOps, FeatureCRC, FeatureSHA2, FeatureAES,
FeatureFPARMv8, FeatureNEON, FeaturePerfMon,
FeatureRDM];
- list<SubtargetFeature> NeoverseE1 = [HasV8_2aOps, FeatureCrypto, FeatureDotProd,
+ list<SubtargetFeature> NeoverseE1 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureDotProd,
FeatureFPARMv8, FeatureFullFP16, FeatureNEON,
FeatureRCPC, FeatureSSBS, FeaturePerfMon];
- list<SubtargetFeature> NeoverseN1 = [HasV8_2aOps, FeatureCrypto, FeatureDotProd,
+ list<SubtargetFeature> NeoverseN1 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureDotProd,
FeatureFPARMv8, FeatureFullFP16, FeatureNEON,
FeatureRCPC, FeatureSPE, FeatureSSBS,
FeaturePerfMon];
list<SubtargetFeature> NeoverseN2 = [HasV9_0aOps, FeatureBF16, FeatureETE, FeatureFP16FML,
FeatureMatMulInt8, FeatureMTE, FeatureSVE2,
FeatureSVE2BitPerm, FeatureTRBE,
- FeaturePerfMon];
+ FeaturePerfMon,
+ FeatureDotProd, FeatureFullFP16, FeatureSB, FeatureSSBS, FeatureSVE];
list<SubtargetFeature> NeoverseN3 = [HasV9_2aOps, FeatureETE, FeatureFP16FML,
FeatureFullFP16, FeatureMTE, FeaturePerfMon,
FeatureRandGen, FeatureSPE, FeatureSPE_EEF,
- FeatureSVE2BitPerm];
+ FeatureSVE2BitPerm,
+ FeatureSSBS, FeatureSB, FeaturePredRes, FeaturePAuth, FeatureFlagM,
+ FeatureSVE, FeatureSVE2];
list<SubtargetFeature> Neoverse512TVB = [HasV8_4aOps, FeatureBF16, FeatureCacheDeepPersist,
- FeatureCrypto, FeatureFPARMv8, FeatureFP16FML,
+ FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureFP16FML,
FeatureFullFP16, FeatureMatMulInt8, FeatureNEON,
FeaturePerfMon, FeatureRandGen, FeatureSPE,
- FeatureSSBS, FeatureSVE];
+ FeatureSSBS, FeatureSVE,
+ FeatureSHA3, FeatureSM4, FeatureDotProd];
list<SubtargetFeature> NeoverseV1 = [HasV8_4aOps, FeatureBF16, FeatureCacheDeepPersist,
- FeatureCrypto, FeatureFPARMv8, FeatureFP16FML,
+ FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureFP16FML,
FeatureFullFP16, FeatureMatMulInt8, FeatureNEON,
FeaturePerfMon, FeatureRandGen, FeatureSPE,
- FeatureSSBS, FeatureSVE];
+ FeatureSSBS, FeatureSVE,
+ FeatureSHA3, FeatureSM4, FeatureDotProd];
list<SubtargetFeature> NeoverseV2 = [HasV9_0aOps, FeatureBF16, FeatureSPE,
FeaturePerfMon, FeatureETE, FeatureMatMulInt8,
FeatureNEON, FeatureSVE2BitPerm, FeatureFP16FML,
- FeatureMTE, FeatureRandGen];
+ FeatureMTE, FeatureRandGen,
+ FeatureSVE, FeatureSVE2, FeatureSSBS, FeatureFullFP16, FeatureDotProd];
list<SubtargetFeature> NeoverseV3 = [HasV9_2aOps, FeatureETE, FeatureFP16FML,
FeatureFullFP16, FeatureLS64, FeatureMTE,
FeaturePerfMon, FeatureRandGen, FeatureSPE,
- FeatureSPE_EEF, FeatureSVE2BitPerm, FeatureBRBE];
+ FeatureSPE_EEF, FeatureSVE2BitPerm, FeatureBRBE,
+ FeatureSSBS, FeatureSB, FeaturePredRes, FeaturePAuth, FeatureFlagM,
+ FeatureSVE, FeatureSVE2];
list<SubtargetFeature> NeoverseV3AE = [HasV9_2aOps, FeatureETE, FeatureFP16FML,
FeatureFullFP16, FeatureLS64, FeatureMTE,
FeaturePerfMon, FeatureRandGen, FeatureSPE,
- FeatureSPE_EEF, FeatureSVE2BitPerm, FeatureBRBE];
- list<SubtargetFeature> Saphira = [HasV8_4aOps, FeatureCrypto, FeatureFPARMv8,
+ FeatureSPE_EEF, FeatureSVE2BitPerm, FeatureBRBE,
+ FeatureSSBS, FeatureSB, FeaturePredRes, FeaturePAuth, FeatureFlagM,
+ FeatureSVE, FeatureSVE2];
+ list<SubtargetFeature> Saphira = [HasV8_4aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON, FeatureSPE, FeaturePerfMon];
- list<SubtargetFeature> ThunderX = [HasV8_0aOps, FeatureCRC, FeatureCrypto,
+ list<SubtargetFeature> ThunderX = [HasV8_0aOps, FeatureCRC, FeatureSHA2, FeatureAES,
FeatureFPARMv8, FeaturePerfMon, FeatureNEON];
- list<SubtargetFeature> ThunderX2T99 = [HasV8_1aOps, FeatureCRC, FeatureCrypto,
+ list<SubtargetFeature> ThunderX2T99 = [HasV8_1aOps, FeatureCRC, FeatureSHA2, FeatureAES,
FeatureFPARMv8, FeatureNEON, FeatureLSE];
- list<SubtargetFeature> ThunderX3T110 = [HasV8_3aOps, FeatureCRC, FeatureCrypto,
+ list<SubtargetFeature> ThunderX3T110 = [HasV8_3aOps, FeatureCRC, FeatureSHA2, FeatureAES,
FeatureFPARMv8, FeatureNEON, FeatureLSE,
FeaturePAuth, FeaturePerfMon];
- list<SubtargetFeature> TSV110 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
+ list<SubtargetFeature> TSV110 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON, FeaturePerfMon, FeatureSPE,
FeatureFullFP16, FeatureFP16FML, FeatureDotProd,
FeatureJS, FeatureComplxNum];
list<SubtargetFeature> Ampere1 = [HasV8_6aOps, FeatureNEON, FeaturePerfMon,
FeatureSSBS, FeatureRandGen, FeatureSB,
- FeatureSHA2, FeatureSHA3, FeatureAES];
+ FeatureSHA2, FeatureSHA3, FeatureAES,
+ FeatureFullFP16];
list<SubtargetFeature> Ampere1A = [HasV8_6aOps, FeatureNEON, FeaturePerfMon,
FeatureMTE, FeatureSSBS, FeatureRandGen,
FeatureSB, FeatureSM4, FeatureSHA2,
- FeatureSHA3, FeatureAES];
+ FeatureSHA3, FeatureAES,
+ FeatureFullFP16];
list<SubtargetFeature> Ampere1B = [HasV8_7aOps, FeatureNEON, FeaturePerfMon,
FeatureMTE, FeatureSSBS, FeatureRandGen,
FeatureSB, FeatureSM4, FeatureSHA2,
@@ -828,9 +918,10 @@ def ProcessorFeatures {
FeatureWFxT, FeatureFullFP16];
list<SubtargetFeature> Oryon = [HasV8_6aOps, FeatureNEON, FeaturePerfMon,
- FeatureCrypto, FeatureRandGen,
+ FeatureRandGen,
FeaturePAuth, FeatureSM4, FeatureSHA2,
- FeatureSHA3, FeatureAES];
+ FeatureSHA3, FeatureAES,
+ FeatureSPE];
// ETE and TRBE are future architecture extensions. We temporarily enable them
// by default for users targeting generic AArch64. The extensions do not
@@ -890,6 +981,8 @@ def : ProcessorModel<"cortex-a720", NeoverseN2Model, ProcessorFeatures.A720,
[TuneA720]>;
def : ProcessorModel<"cortex-a720ae", NeoverseN2Model, ProcessorFeatures.A720AE,
[TuneA720AE]>;
+def : ProcessorModel<"cortex-a725", NeoverseN2Model, ProcessorFeatures.A725,
+ [TuneA725]>;
def : ProcessorModel<"cortex-r82", CortexA55Model, ProcessorFeatures.R82,
[TuneR82]>;
def : ProcessorModel<"cortex-r82ae", CortexA55Model, ProcessorFeatures.R82AE,
@@ -904,6 +997,8 @@ def : ProcessorModel<"cortex-x3", NeoverseN2Model, ProcessorFeatures.X3,
[TuneX3]>;
def : ProcessorModel<"cortex-x4", NeoverseN2Model, ProcessorFeatures.X4,
[TuneX4]>;
+def : ProcessorModel<"cortex-x925", NeoverseV2Model, ProcessorFeatures.X925,
+ [TuneX925]>;
def : ProcessorModel<"neoverse-e1", CortexA53Model,
ProcessorFeatures.NeoverseE1, [TuneNeoverseE1]>;
def : ProcessorModel<"neoverse-n1", NeoverseN1Model,
@@ -952,50 +1047,60 @@ def : ProcessorModel<"thunderx3t110", ThunderX3T110Model,
def : ProcessorModel<"tsv110", TSV110Model, ProcessorFeatures.TSV110,
[TuneTSV110]>;
+
+// Apple CPUs
+
// Support cyclone as an alias for apple-a7 so we can still LTO old bitcode.
def : ProcessorModel<"cyclone", CycloneModel, ProcessorFeatures.AppleA7,
[TuneAppleA7]>;
-
-// iPhone and iPad CPUs
def : ProcessorModel<"apple-a7", CycloneModel, ProcessorFeatures.AppleA7,
[TuneAppleA7]>;
def : ProcessorModel<"apple-a8", CycloneModel, ProcessorFeatures.AppleA7,
[TuneAppleA7]>;
def : ProcessorModel<"apple-a9", CycloneModel, ProcessorFeatures.AppleA7,
[TuneAppleA7]>;
+
def : ProcessorModel<"apple-a10", CycloneModel, ProcessorFeatures.AppleA10,
[TuneAppleA10]>;
+
def : ProcessorModel<"apple-a11", CycloneModel, ProcessorFeatures.AppleA11,
[TuneAppleA11]>;
+
def : ProcessorModel<"apple-a12", CycloneModel, ProcessorFeatures.AppleA12,
[TuneAppleA12]>;
+def : ProcessorModel<"apple-s4", CycloneModel, ProcessorFeatures.AppleA12,
+ [TuneAppleA12]>;
+def : ProcessorModel<"apple-s5", CycloneModel, ProcessorFeatures.AppleA12,
+ [TuneAppleA12]>;
+
def : ProcessorModel<"apple-a13", CycloneModel, ProcessorFeatures.AppleA13,
[TuneAppleA13]>;
+
def : ProcessorModel<"apple-a14", CycloneModel, ProcessorFeatures.AppleA14,
[TuneAppleA14]>;
-def : ProcessorModel<"apple-a15", CycloneModel, ProcessorFeatures.AppleA15,
- [TuneAppleA15]>;
-def : ProcessorModel<"apple-a16", CycloneModel, ProcessorFeatures.AppleA16,
- [TuneAppleA16]>;
-def : ProcessorModel<"apple-a17", CycloneModel, ProcessorFeatures.AppleA17,
- [TuneAppleA17]>;
-// Mac CPUs
def : ProcessorModel<"apple-m1", CycloneModel, ProcessorFeatures.AppleA14,
[TuneAppleA14]>;
+
+def : ProcessorModel<"apple-a15", CycloneModel, ProcessorFeatures.AppleA15,
+ [TuneAppleA15]>;
def : ProcessorModel<"apple-m2", CycloneModel, ProcessorFeatures.AppleA15,
[TuneAppleA15]>;
+
+def : ProcessorModel<"apple-a16", CycloneModel, ProcessorFeatures.AppleA16,
+ [TuneAppleA16]>;
def : ProcessorModel<"apple-m3", CycloneModel, ProcessorFeatures.AppleA16,
[TuneAppleA16]>;
-// watch CPUs.
-def : ProcessorModel<"apple-s4", CycloneModel, ProcessorFeatures.AppleA12,
- [TuneAppleA12]>;
-def : ProcessorModel<"apple-s5", CycloneModel, ProcessorFeatures.AppleA12,
- [TuneAppleA12]>;
+def : ProcessorModel<"apple-a17", CycloneModel, ProcessorFeatures.AppleA17,
+ [TuneAppleA17]>;
+
+def : ProcessorModel<"apple-m4", CycloneModel, ProcessorFeatures.AppleM4,
+ [TuneAppleM4]>;
// Alias for the latest Apple processor model supported by LLVM.
-def : ProcessorModel<"apple-latest", CycloneModel, ProcessorFeatures.AppleA16,
- [TuneAppleA16]>;
+def : ProcessorModel<"apple-latest", CycloneModel, ProcessorFeatures.AppleM4,
+ [TuneAppleM4]>;
+
// Fujitsu A64FX
def : ProcessorModel<"a64fx", A64FXModel, ProcessorFeatures.A64FX,
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 2b70c4715bf9..054eca8ad752 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -31,6 +31,27 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2,
def AArch64CoalescerBarrier
: SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>,
+ [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>,
+ [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AArch64AllocateZABuffer : SDNode<"AArch64ISD::ALLOCATE_ZA_BUFFER", SDTypeProfile<1, 1,
+ [SDTCisInt<0>, SDTCisInt<1>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in {
+ def AllocateZABuffer : Pseudo<(outs GPR64sp:$dst), (ins GPR64:$size), []>, Sched<[WriteI]> {}
+}
+def : Pat<(i64 (AArch64AllocateZABuffer GPR64:$size)),
+ (AllocateZABuffer $size)>;
+
+def AArch64InitTPIDR2Obj : SDNode<"AArch64ISD::INIT_TPIDR2OBJ", SDTypeProfile<0, 1,
+ [SDTCisInt<0>]>, [SDNPHasChain, SDNPMayStore]>;
+let usesCustomInserter = 1 in {
+ def InitTPIDR2Obj : Pseudo<(outs), (ins GPR64:$buffer), [(AArch64InitTPIDR2Obj GPR64:$buffer)]>, Sched<[WriteI]> {}
+}
+
//===----------------------------------------------------------------------===//
// Instruction naming conventions.
//===----------------------------------------------------------------------===//
@@ -50,15 +71,17 @@ def AArch64CoalescerBarrier
def SDT_AArch64RDSVL : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]>;
def AArch64rdsvl : SDNode<"AArch64ISD::RDSVL", SDT_AArch64RDSVL>;
-let Predicates = [HasSME] in {
+let Predicates = [HasSMEandIsNonStreamingSafe] in {
def RDSVLI_XI : sve_int_read_vl_a<0b0, 0b11111, "rdsvl", /*streaming_sve=*/0b1>;
def ADDSPL_XXI : sve_int_arith_vl<0b1, "addspl", /*streaming_sve=*/0b1>;
def ADDSVL_XXI : sve_int_arith_vl<0b0, "addsvl", /*streaming_sve=*/0b1>;
+def : Pat<(AArch64rdsvl (i32 simm6_32b:$imm)), (RDSVLI_XI simm6_32b:$imm)>;
+}
+
+let Predicates = [HasSME] in {
defm ADDHA_MPPZ_S : sme_add_vector_to_tile_u32<0b0, "addha", int_aarch64_sme_addha>;
defm ADDVA_MPPZ_S : sme_add_vector_to_tile_u32<0b1, "addva", int_aarch64_sme_addva>;
-
-def : Pat<(AArch64rdsvl (i32 simm6_32b:$imm)), (RDSVLI_XI simm6_32b:$imm)>;
}
let Predicates = [HasSMEI16I64] in {
@@ -117,18 +140,20 @@ defm LD1_MXIPXX : sme_mem_ld_ss<"ld1">;
defm ST1_MXIPXX : sme_mem_st_ss<"st1">;
//===----------------------------------------------------------------------===//
-// Spill + fill
+// Move instructions
//===----------------------------------------------------------------------===//
-defm LDR_ZA : sme_fill<"ldr">;
-defm STR_ZA : sme_spill<"str">;
+defm INSERT_MXIPZ : sme_vector_to_tile<"mova">;
+defm EXTRACT_ZPMXI : sme_tile_to_vector<"mova">;
+} // End let Predicates = [HasSME]
+let Predicates = [HasSMEandIsNonStreamingSafe] in {
//===----------------------------------------------------------------------===//
-// Move instructions
+// Spill + fill
//===----------------------------------------------------------------------===//
-defm INSERT_MXIPZ : sme_vector_to_tile<"mova">;
-defm EXTRACT_ZPMXI : sme_tile_to_vector<"mova">;
+defm LDR_ZA : sme_fill<"ldr">;
+defm STR_ZA : sme_spill<"str">;
//===----------------------------------------------------------------------===//
// Zero instruction
@@ -164,7 +189,7 @@ def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val),
def : Pat<(i64 (int_aarch64_sme_get_tpidr2)),
(MRS 0xde85)>;
-} // End let Predicates = [HasSME]
+} // End let Predicates = [HasSMEandIsNonStreamingSafe]
multiclass CoalescerBarrierPseudo<RegisterClass rc, list<ValueType> vts> {
def NAME : Pseudo<(outs rc:$dst), (ins rc:$src), []>, Sched<[]> {
@@ -221,6 +246,15 @@ def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 /*AArch64SME::Always*/0)),
(MSRpstatesvcrImm1 svcr_op:$pstate, 0b0)>;
+// Pseudo to insert cfi_offset/cfi_restore instructions. Used to save or restore
+// the streaming value of VG around streaming-mode changes in locally-streaming
+// functions.
+def VGSavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
+def : Pat<(AArch64VGSave), (VGSavePseudo)>;
+
+def VGRestorePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
+def : Pat<(AArch64VGRestore), (VGRestorePseudo)>;
+
//===----------------------------------------------------------------------===//
// SME2 Instructions
//===----------------------------------------------------------------------===//
@@ -550,11 +584,6 @@ defm SMOPS_MPPZZ_HtoS : sme2_int_mopx_tile<"smops", 0b001, int_aarch64_sme_smops
defm UMOPA_MPPZZ_HtoS : sme2_int_mopx_tile<"umopa", 0b100, int_aarch64_sme_umopa_za32>;
defm UMOPS_MPPZZ_HtoS : sme2_int_mopx_tile<"umops", 0b101, int_aarch64_sme_umops_za32>;
-defm ZERO_T : sme2_zero_zt<"zero", 0b0001>;
-
-defm LDR_TX : sme2_spill_fill_vector<"ldr", 0b01111100, AArch64_restore_zt>;
-defm STR_TX : sme2_spill_fill_vector<"str", 0b11111100, AArch64_save_zt>;
-
def MOVT_XTI : sme2_movt_zt_to_scalar<"movt", 0b0011111>;
def MOVT_TIX : sme2_movt_scalar_to_zt<"movt", 0b0011111>;
@@ -680,7 +709,15 @@ def STNT1D_2Z_STRIDED : sme2_st_vector_vg2_multi_scalar_scalar<0b11, 0b1,
def STNT1D_4Z_STRIDED : sme2_st_vector_vg4_multi_scalar_scalar<0b11, 0b1, ZZZZ_d_strided, GPR64shifted64, "stnt1d">;
defm STNT1D_2Z_STRIDED_IMM : sme2_st_vector_vg2_multi_scalar_immediate<0b11, 0b1, ZZ_d_strided, simm4s2, "stnt1d">;
defm STNT1D_4Z_STRIDED_IMM : sme2_st_vector_vg4_multi_scalar_immediate<0b11, 0b1, ZZZZ_d_strided, simm4s4, "stnt1d">;
-}
+} // End let Predicates = [HasSME2]
+
+
+let Predicates = [HasSME2andIsNonStreamingSafe] in {
+defm ZERO_T : sme2_zero_zt<"zero", 0b0001>;
+
+defm LDR_TX : sme2_spill_fill_vector<"ldr", 0b01111100, AArch64_restore_zt>;
+defm STR_TX : sme2_spill_fill_vector<"str", 0b11111100, AArch64_save_zt>;
+} // End let Predicates = [HasSME2andIsNonStreamingSafe]
let Predicates = [HasSME2, HasSMEI16I64] in {
defm ADD_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg2_single<"add", 0b1011010, MatrixOp64, ZZ_d, ZPR4b64, nxv2i64, int_aarch64_sme_add_write_single_za_vg1x2>;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index bd5de628d852..a3c41f2e052c 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3351,7 +3351,7 @@ let Predicates = [HasSVEorSME] in {
(EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
// Extract element from vector with immediate index that's within the bottom 128-bits.
- let Predicates = [IsNeonAvailable], AddedComplexity = 1 in {
+ let Predicates = [HasNEON], AddedComplexity = 1 in {
def : Pat<(i32 (vector_extract nxv16i8:$vec, VectorIndexB:$index)),
(UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
def : Pat<(i32 (vector_extract nxv8i16:$vec, VectorIndexH:$index)),
@@ -3360,9 +3360,9 @@ let Predicates = [HasSVEorSME] in {
(UMOVvi32 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>;
def : Pat<(i64 (vector_extract nxv2i64:$vec, VectorIndexD:$index)),
(UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index)>;
- } // End IsNeonAvailable
+ } // End HasNEON
- let Predicates = [IsNeonAvailable] in {
+ let Predicates = [HasNEON] in {
def : Pat<(sext_inreg (vector_extract nxv16i8:$vec, VectorIndexB:$index), i8),
(SMOVvi8to32 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
def : Pat<(sext_inreg (anyext (i32 (vector_extract nxv16i8:$vec, VectorIndexB:$index))), i8),
@@ -3375,7 +3375,7 @@ let Predicates = [HasSVEorSME] in {
def : Pat<(sext (i32 (vector_extract nxv4i32:$vec, VectorIndexS:$index))),
(SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>;
- } // End IsNeonAvailable
+ } // End HasNEON
// Extract first element from vector.
let AddedComplexity = 2 in {
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 93ea729e2550..1fad1d5ca6d7 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -153,9 +153,11 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
case CortexA710:
case CortexA715:
case CortexA720:
+ case CortexA725:
case CortexX2:
case CortexX3:
case CortexX4:
+ case CortexX925:
PrefFunctionAlignment = Align(16);
VScaleForTuning = 1;
PrefLoopAlignment = Align(32);
@@ -180,6 +182,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
case AppleA15:
case AppleA16:
case AppleA17:
+ case AppleM4:
CacheLineSize = 64;
PrefetchDistance = 280;
MinPrefetchStride = 2048;
@@ -189,6 +192,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
case AppleA15:
case AppleA16:
case AppleA17:
+ case AppleM4:
MaxInterleaveFactor = 4;
break;
default:
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 7ef7a89b5749..5faba09aa67b 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -185,6 +185,12 @@ public:
(hasSMEFA64() || (!isStreaming() && !isStreamingCompatible()));
}
+ /// Returns true if the target has access to either the full range of SVE instructions,
+ /// or the streaming-compatible subset of SVE instructions.
+ bool isSVEorStreamingSVEAvailable() const {
+ return hasSVE() || (hasSME() && isStreaming());
+ }
+
unsigned getMinVectorRegisterBitWidth() const {
// Don't assume any minimum vector size when PSTATE.SM may not be 0, because
// we don't yet support streaming-compatible codegen support that we trust
@@ -355,30 +361,27 @@ public:
void mirFileLoaded(MachineFunction &MF) const override;
- bool hasSVEorSME() const { return hasSVE() || hasSME(); }
- bool hasSVE2orSME() const { return hasSVE2() || hasSME(); }
-
// Return the known range for the bit length of SVE data registers. A value
// of 0 means nothing is known about that particular limit beyong what's
// implied by the architecture.
unsigned getMaxSVEVectorSizeInBits() const {
- assert(hasSVEorSME() &&
+ assert(isSVEorStreamingSVEAvailable() &&
"Tried to get SVE vector length without SVE support!");
return MaxSVEVectorSizeInBits;
}
unsigned getMinSVEVectorSizeInBits() const {
- assert(hasSVEorSME() &&
+ assert(isSVEorStreamingSVEAvailable() &&
"Tried to get SVE vector length without SVE support!");
return MinSVEVectorSizeInBits;
}
bool useSVEForFixedLengthVectors() const {
- if (!isNeonAvailable())
- return hasSVEorSME();
+ if (!isSVEorStreamingSVEAvailable())
+ return false;
// Prefer NEON unless larger SVE registers are available.
- return hasSVEorSME() && getMinSVEVectorSizeInBits() >= 256;
+ return !isNeonAvailable() || getMinSVEVectorSizeInBits() >= 256;
}
bool useSVEForFixedLengthVectors(EVT VT) const {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 7de9071476e7..37ce07d4a09d 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -187,6 +187,11 @@ static cl::opt<unsigned> SVEVectorBitsMinOpt(
"with zero meaning no minimum size is assumed."),
cl::init(0), cl::Hidden);
+static cl::opt<bool> ForceStreaming(
+ "force-streaming",
+ cl::desc("Force the use of streaming code for all functions"),
+ cl::init(false), cl::Hidden);
+
static cl::opt<bool> ForceStreamingCompatible(
"force-streaming-compatible",
cl::desc("Force the use of streaming-compatible code for all functions"),
@@ -412,11 +417,11 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
StringRef FS = FSAttr.isValid() ? FSAttr.getValueAsString() : TargetFS;
bool HasMinSize = F.hasMinSize();
- bool IsStreaming = F.hasFnAttribute("aarch64_pstate_sm_enabled") ||
+ bool IsStreaming = ForceStreaming ||
+ F.hasFnAttribute("aarch64_pstate_sm_enabled") ||
F.hasFnAttribute("aarch64_pstate_sm_body");
- bool IsStreamingCompatible =
- F.hasFnAttribute("aarch64_pstate_sm_compatible") ||
- ForceStreamingCompatible;
+ bool IsStreamingCompatible = ForceStreamingCompatible ||
+ F.hasFnAttribute("aarch64_pstate_sm_compatible");
unsigned MinSVEVectorSize = 0;
unsigned MaxSVEVectorSize = 0;
@@ -549,8 +554,7 @@ public:
} // end anonymous namespace
-void AArch64TargetMachine::registerPassBuilderCallbacks(
- PassBuilder &PB, bool PopulateClassToPassNames) {
+void AArch64TargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PB.registerLateLoopOptimizationsEPCallback(
[=](LoopPassManager &LPM, OptimizationLevel Level) {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h
index e396d9204716..1a470ca87127 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h
@@ -43,8 +43,7 @@ public:
// Pass Pipeline Configuration
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
- void registerPassBuilderCallbacks(PassBuilder &PB,
- bool PopulateClassToPassNames) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB) override;
TargetTransformInfo getTargetTransformInfo(const Function &F) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 9f5756fc7e40..9a0eb45b875d 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -985,6 +985,33 @@ static bool isAllActivePredicate(Value *Pred) {
m_ConstantInt<AArch64SVEPredPattern::all>()));
}
+// Simplify unary operation where predicate has all inactive lanes by replacing
+// instruction with zeroed object
+static std::optional<Instruction *>
+instCombineSVENoActiveUnaryZero(InstCombiner &IC, IntrinsicInst &II) {
+ if (match(II.getOperand(0), m_ZeroInt())) {
+ Constant *Node;
+ Type *RetTy = II.getType();
+ if (RetTy->isStructTy()) {
+ auto StructT = cast<StructType>(RetTy);
+ auto VecT = StructT->getElementType(0);
+ SmallVector<llvm::Constant *, 4> ZerVec;
+ for (unsigned i = 0; i < StructT->getNumElements(); i++) {
+ ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0)
+ : ConstantInt::get(VecT, 0));
+ }
+ Node = ConstantStruct::get(StructT, ZerVec);
+ } else if (RetTy->isFPOrFPVectorTy())
+ Node = ConstantFP::get(RetTy, 0.0);
+ else
+ Node = ConstantInt::get(II.getType(), 0);
+
+ IC.replaceInstUsesWith(II, Node);
+ return IC.eraseInstFromFunction(II);
+ }
+ return std::nullopt;
+}
+
static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
IntrinsicInst &II) {
// svsel(ptrue, x, y) => x
@@ -1398,6 +1425,10 @@ instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
Value *PtrOp = II.getOperand(1);
Type *VecTy = II.getType();
+ // Replace by zero constant when all lanes are inactive
+ if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
+ return II_NA;
+
if (isAllActivePredicate(Pred)) {
LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
Load->copyMetadata(II);
@@ -1745,6 +1776,10 @@ instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
Type *Ty = II.getType();
Value *PassThru = ConstantAggregateZero::get(Ty);
+ // Replace by zero constant when all lanes are inactive
+ if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
+ return II_NA;
+
// Contiguous gather => masked load.
// (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
// => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
@@ -1971,6 +2006,41 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
switch (IID) {
default:
break;
+
+ case Intrinsic::aarch64_sve_ld1_gather:
+ case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
+ case Intrinsic::aarch64_sve_ld1_gather_sxtw:
+ case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
+ case Intrinsic::aarch64_sve_ld1_gather_uxtw:
+ case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
+ case Intrinsic::aarch64_sve_ld1q_gather_index:
+ case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
+ case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
+ case Intrinsic::aarch64_sve_ld1ro:
+ case Intrinsic::aarch64_sve_ld1rq:
+ case Intrinsic::aarch64_sve_ld1udq:
+ case Intrinsic::aarch64_sve_ld1uwq:
+ case Intrinsic::aarch64_sve_ld2_sret:
+ case Intrinsic::aarch64_sve_ld2q_sret:
+ case Intrinsic::aarch64_sve_ld3_sret:
+ case Intrinsic::aarch64_sve_ld3q_sret:
+ case Intrinsic::aarch64_sve_ld4_sret:
+ case Intrinsic::aarch64_sve_ld4q_sret:
+ case Intrinsic::aarch64_sve_ldff1:
+ case Intrinsic::aarch64_sve_ldff1_gather:
+ case Intrinsic::aarch64_sve_ldff1_gather_index:
+ case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
+ case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
+ case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
+ case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
+ case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
+ case Intrinsic::aarch64_sve_ldnf1:
+ case Intrinsic::aarch64_sve_ldnt1:
+ case Intrinsic::aarch64_sve_ldnt1_gather:
+ case Intrinsic::aarch64_sve_ldnt1_gather_index:
+ case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
+ case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
+ return instCombineSVENoActiveUnaryZero(IC, II);
case Intrinsic::aarch64_neon_fmaxnm:
case Intrinsic::aarch64_neon_fminnm:
return instCombineMaxMinNM(IC, II);
@@ -2162,19 +2232,20 @@ AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
case TargetTransformInfo::RGK_Scalar:
return TypeSize::getFixed(64);
case TargetTransformInfo::RGK_FixedWidthVector:
- if (!ST->isNeonAvailable() && !EnableFixedwidthAutovecInStreamingMode)
- return TypeSize::getFixed(0);
-
- if (ST->hasSVE())
+ if (ST->useSVEForFixedLengthVectors() &&
+ (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
return TypeSize::getFixed(
std::max(ST->getMinSVEVectorSizeInBits(), 128u));
-
- return TypeSize::getFixed(ST->hasNEON() ? 128 : 0);
+ else if (ST->isNeonAvailable())
+ return TypeSize::getFixed(128);
+ else
+ return TypeSize::getFixed(0);
case TargetTransformInfo::RGK_ScalableVector:
- if (!ST->isSVEAvailable() && !EnableScalableAutovecInStreamingMode)
+ if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
+ EnableScalableAutovecInStreamingMode))
+ return TypeSize::getScalable(128);
+ else
return TypeSize::getScalable(0);
-
- return TypeSize::getScalable(ST->hasSVE() ? 128 : 0);
}
llvm_unreachable("Unsupported register kind");
}
@@ -2690,7 +2761,8 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
return AdjustCost(Entry->Cost);
if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
- CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() &&
+ CCH == TTI::CastContextHint::Masked &&
+ ST->isSVEorStreamingSVEAvailable() &&
TLI->getTypeAction(Src->getContext(), SrcTy) ==
TargetLowering::TypePromoteInteger &&
TLI->getTypeAction(Dst->getContext(), DstTy) ==
@@ -2711,8 +2783,8 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
// The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
// but we also want to include the TTI::CastContextHint::Masked case too.
if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
- CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() &&
- TLI->isTypeLegal(DstTy))
+ CCH == TTI::CastContextHint::Masked &&
+ ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
CCH = TTI::CastContextHint::Normal;
return AdjustCost(
@@ -3187,11 +3259,16 @@ AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
if (!LT.first.isValid())
return InstructionCost::getInvalid();
+ // Return an invalid cost for element types that we are unable to lower.
+ auto *VT = cast<VectorType>(Src);
+ if (VT->getElementType()->isIntegerTy(1))
+ return InstructionCost::getInvalid();
+
// The code-generator is currently not able to handle scalable vectors
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
// it. This change will be removed when code-generation for these types is
// sufficiently reliable.
- if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
+ if (VT->getElementCount() == ElementCount::getScalable(1))
return InstructionCost::getInvalid();
return LT.first;
@@ -3212,16 +3289,17 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
if (!LT.first.isValid())
return InstructionCost::getInvalid();
+ // Return an invalid cost for element types that we are unable to lower.
if (!LT.second.isVector() ||
- !isElementTypeLegalForScalableVector(VT->getElementType()))
+ !isElementTypeLegalForScalableVector(VT->getElementType()) ||
+ VT->getElementType()->isIntegerTy(1))
return InstructionCost::getInvalid();
// The code-generator is currently not able to handle scalable vectors
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
// it. This change will be removed when code-generation for these types is
// sufficiently reliable.
- if (cast<VectorType>(DataTy)->getElementCount() ==
- ElementCount::getScalable(1))
+ if (VT->getElementCount() == ElementCount::getScalable(1))
return InstructionCost::getInvalid();
ElementCount LegalVF = LT.second.getVectorElementCount();
@@ -3259,8 +3337,12 @@ InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
// it. This change will be removed when code-generation for these types is
// sufficiently reliable.
+ // We also only support full register predicate loads and stores.
if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
- if (VTy->getElementCount() == ElementCount::getScalable(1))
+ if (VTy->getElementCount() == ElementCount::getScalable(1) ||
+ (VTy->getElementType()->isIntegerTy(1) &&
+ !VTy->getElementCount().isKnownMultipleOf(
+ ElementCount::getScalable(16))))
return InstructionCost::getInvalid();
// TODO: consider latency as well for TCK_SizeAndLatency.
@@ -4234,4 +4316,4 @@ bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
-} \ No newline at end of file
+}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index feec1a4289c3..1180225ce009 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -248,7 +248,7 @@ public:
if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy())
return true;
- if (Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||
+ if (Ty->isIntegerTy(1) || Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||
Ty->isIntegerTy(32) || Ty->isIntegerTy(64))
return true;
@@ -276,7 +276,7 @@ public:
}
bool isLegalMaskedGatherScatter(Type *DataType) const {
- if (!ST->hasSVE() || !ST->isNeonAvailable())
+ if (!ST->isSVEAvailable())
return false;
// For fixed vectors, scalarize if not using SVE for them.
@@ -373,9 +373,11 @@ public:
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI);
- bool supportsScalableVectors() const { return ST->hasSVE(); }
+ bool supportsScalableVectors() const {
+ return ST->isSVEorStreamingSVEAvailable();
+ }
- bool enableScalableVectorization() const { return ST->hasSVE(); }
+ bool enableScalableVectorization() const { return ST->isSVEAvailable(); }
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc,
ElementCount VF) const;
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 57e4f6d298d8..5e17ed40df8a 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -8094,8 +8094,8 @@ ParseStatus AArch64AsmParser::tryParseImmRange(OperandVector &Operands) {
if (getParser().parseExpression(ImmL))
return ParseStatus::NoMatch;
- unsigned ImmFVal = dyn_cast<MCConstantExpr>(ImmF)->getValue();
- unsigned ImmLVal = dyn_cast<MCConstantExpr>(ImmL)->getValue();
+ unsigned ImmFVal = cast<MCConstantExpr>(ImmF)->getValue();
+ unsigned ImmLVal = cast<MCConstantExpr>(ImmL)->getValue();
Operands.push_back(
AArch64Operand::CreateImmRange(ImmFVal, ImmLVal, S, E, getContext()));
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 270474f80767..1fb50a089ea3 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -53,6 +53,8 @@
using namespace llvm;
using namespace AArch64GISelUtils;
+extern cl::opt<bool> EnableSVEGISel;
+
AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI)
: CallLowering(&TLI) {}
@@ -404,14 +406,13 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
ExtendOp = TargetOpcode::G_ZEXT;
LLT NewLLT(NewVT);
- LLT OldLLT(MVT::getVT(CurArgInfo.Ty));
+ LLT OldLLT = getLLTForType(*CurArgInfo.Ty, DL);
CurArgInfo.Ty = EVT(NewVT).getTypeForEVT(Ctx);
// Instead of an extend, we might have a vector type which needs
// padding with more elements, e.g. <2 x half> -> <4 x half>.
if (NewVT.isVector()) {
if (OldLLT.isVector()) {
if (NewLLT.getNumElements() > OldLLT.getNumElements()) {
-
CurVReg =
MIRBuilder.buildPadVectorWithUndefElements(NewLLT, CurVReg)
.getReg(0);
@@ -525,10 +526,10 @@ static void handleMustTailForwardedRegisters(MachineIRBuilder &MIRBuilder,
bool AArch64CallLowering::fallBackToDAGISel(const MachineFunction &MF) const {
auto &F = MF.getFunction();
- if (F.getReturnType()->isScalableTy() ||
- llvm::any_of(F.args(), [](const Argument &A) {
- return A.getType()->isScalableTy();
- }))
+ if (!EnableSVEGISel && (F.getReturnType()->isScalableTy() ||
+ llvm::any_of(F.args(), [](const Argument &A) {
+ return A.getType()->isScalableTy();
+ })))
return true;
const auto &ST = MF.getSubtarget<AArch64Subtarget>();
if (!ST.hasNEON() || !ST.hasFPARMv8()) {
@@ -1022,7 +1023,7 @@ static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
if (!IsTailCall) {
if (!PAI)
- return IsIndirect ? getBLRCallOpcode(CallerF) : AArch64::BL;
+ return IsIndirect ? getBLRCallOpcode(CallerF) : (unsigned)AArch64::BL;
assert(IsIndirect && "Direct call should not be authenticated");
assert((PAI->Key == AArch64PACKey::IA || PAI->Key == AArch64PACKey::IB) &&
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 4a7c82b393c1..0357a7206c47 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -597,8 +597,14 @@ getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
/// Given a register bank, and size in bits, return the smallest register class
/// that can represent that combination.
static const TargetRegisterClass *
-getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits,
+getMinClassForRegBank(const RegisterBank &RB, TypeSize SizeInBits,
bool GetAllRegSet = false) {
+ if (SizeInBits.isScalable()) {
+ assert(RB.getID() == AArch64::FPRRegBankID &&
+ "Expected FPR regbank for scalable type size");
+ return &AArch64::ZPRRegClass;
+ }
+
unsigned RegBankID = RB.getID();
if (RegBankID == AArch64::GPRRegBankID) {
@@ -939,8 +945,9 @@ getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
Register SrcReg = I.getOperand(1).getReg();
const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
- unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
- unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
+
+ TypeSize DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
+ TypeSize SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
// Special casing for cross-bank copies of s1s. We can technically represent
// a 1-bit value with any size of register. The minimum size for a GPR is 32
@@ -951,7 +958,7 @@ getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
// register bank. Or make a new helper that carries along some constraint
// information.
if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1))
- SrcSize = DstSize = 32;
+ SrcSize = DstSize = TypeSize::getFixed(32);
return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
getMinClassForRegBank(DstRegBank, DstSize, true)};
@@ -1016,8 +1023,8 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
return false;
}
- unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
- unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
+ const TypeSize SrcSize = TRI.getRegSizeInBits(*SrcRC);
+ const TypeSize DstSize = TRI.getRegSizeInBits(*DstRC);
unsigned SubReg;
// If the source bank doesn't support a subregister copy small enough,
@@ -4660,7 +4667,6 @@ MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
Register LHS, Register RHS, CmpInst::Predicate CC,
AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC,
MachineIRBuilder &MIB) const {
- // TODO: emit CMN as an optimization.
auto &MRI = *MIB.getMRI();
LLT OpTy = MRI.getType(LHS);
unsigned CCmpOpc;
@@ -4668,10 +4674,12 @@ MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
if (CmpInst::isIntPredicate(CC)) {
assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
C = getIConstantVRegValWithLookThrough(RHS, MRI);
- if (C && C->Value.ult(32))
+ if (!C || C->Value.sgt(31) || C->Value.slt(-31))
+ CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
+ else if (C->Value.ule(31))
CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi;
else
- CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
+ CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMNWi : AArch64::CCMNXi;
} else {
assert(OpTy.getSizeInBits() == 16 || OpTy.getSizeInBits() == 32 ||
OpTy.getSizeInBits() == 64);
@@ -4696,6 +4704,8 @@ MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
MIB.buildInstr(CCmpOpc, {}, {LHS});
if (CCmpOpc == AArch64::CCMPWi || CCmpOpc == AArch64::CCMPXi)
CCmp.addImm(C->Value.getZExtValue());
+ else if (CCmpOpc == AArch64::CCMNWi || CCmpOpc == AArch64::CCMNXi)
+ CCmp.addImm(C->Value.abs().getZExtValue());
else
CCmp.addReg(RHS);
CCmp.addImm(NZCV).addImm(Predicate);
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 42cd43c3afa3..fef0b722efe4 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -661,7 +661,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
// Conversions
getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
- .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
+ .legalFor({{s32, s32},
+ {s64, s32},
+ {s32, s64},
+ {s64, s64},
+ {v2s64, v2s64},
+ {v4s32, v4s32},
+ {v2s32, v2s32}})
.legalIf([=](const LegalityQuery &Query) {
return HasFP16 &&
(Query.Types[1] == s16 || Query.Types[1] == v4s16 ||
@@ -669,26 +675,38 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
(Query.Types[0] == s32 || Query.Types[0] == s64 ||
Query.Types[0] == v4s16 || Query.Types[0] == v8s16);
})
- .widenScalarToNextPow2(0)
- .clampScalar(0, s32, s64)
- .widenScalarToNextPow2(1)
- .clampScalarOrElt(1, MinFPScalar, s64)
+ .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
+ .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
+ // The range of a fp16 value fits into an i17, so we can lower the width
+ // to i64.
+ .narrowScalarIf(
+ [=](const LegalityQuery &Query) {
+ return Query.Types[1] == s16 && Query.Types[0].getSizeInBits() > 64;
+ },
+ changeTo(0, s64))
.moreElementsToNextPow2(0)
+ .widenScalarOrEltToNextPow2OrMinSize(0)
+ .minScalar(0, s32)
+ .widenScalarOrEltToNextPow2OrMinSize(1, /*MinSize=*/HasFP16 ? 16 : 32)
.widenScalarIf(
[=](const LegalityQuery &Query) {
- return Query.Types[0].getScalarSizeInBits() >
- Query.Types[1].getScalarSizeInBits();
+ return Query.Types[0].getScalarSizeInBits() <= 64 &&
+ Query.Types[0].getScalarSizeInBits() >
+ Query.Types[1].getScalarSizeInBits();
},
LegalizeMutations::changeElementSizeTo(1, 0))
.widenScalarIf(
[=](const LegalityQuery &Query) {
- return Query.Types[0].getScalarSizeInBits() <
- Query.Types[1].getScalarSizeInBits();
+ return Query.Types[1].getScalarSizeInBits() <= 64 &&
+ Query.Types[0].getScalarSizeInBits() <
+ Query.Types[1].getScalarSizeInBits();
},
LegalizeMutations::changeElementSizeTo(0, 1))
.clampNumElements(0, v4s16, v8s16)
.clampNumElements(0, v2s32, v4s32)
- .clampMaxNumElements(0, s64, 2);
+ .clampMaxNumElements(0, s64, 2)
+ .libcallFor(
+ {{s32, s128}, {s64, s128}, {s128, s128}, {s128, s32}, {s128, s64}});
getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
.legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp
index 17dd8f2314a2..0ba3a543d114 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp
@@ -165,6 +165,10 @@ bool AArch64O0PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
/*LegalizerInfo*/ nullptr, /*EnableOpt*/ false,
F.hasOptSize(), F.hasMinSize());
+ // Disable fixed-point iteration in the Combiner. This improves compile-time
+ // at the cost of possibly missing optimizations. See PR#94291 for details.
+ CInfo.MaxIterations = 1;
+
AArch64O0PreLegalizerCombinerImpl Impl(MF, CInfo, &TPC, *KB,
/*CSEInfo*/ nullptr, RuleConfig, ST);
return Impl.combineMachineInstrs();
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
index 0c7be9f42c57..f71fe323a6d3 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -524,8 +524,8 @@ void AArch64PostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<GISelKnownBitsAnalysis>();
AU.addPreserved<GISelKnownBitsAnalysis>();
if (!IsOptNone) {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.addRequired<GISelCSEAnalysisWrapperPass>();
AU.addPreserved<GISelCSEAnalysisWrapperPass>();
}
@@ -557,7 +557,8 @@ bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
MachineDominatorTree *MDT =
- IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+ IsOptNone ? nullptr
+ : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
GISelCSEAnalysisWrapper &Wrapper =
getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 77b8cbe5793c..4a1977ba1a00 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -19,6 +19,7 @@
///
//===----------------------------------------------------------------------===//
+#include "AArch64ExpandImm.h"
#include "AArch64GlobalISelUtils.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64Subtarget.h"
@@ -563,7 +564,8 @@ tryAdjustICmpImmAndPred(Register RHS, CmpInst::Predicate P,
auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS, MRI);
if (!ValAndVReg)
return std::nullopt;
- uint64_t C = ValAndVReg->Value.getZExtValue();
+ uint64_t OriginalC = ValAndVReg->Value.getZExtValue();
+ uint64_t C = OriginalC;
if (isLegalArithImmed(C))
return std::nullopt;
@@ -633,9 +635,20 @@ tryAdjustICmpImmAndPred(Register RHS, CmpInst::Predicate P,
// predicate if it is.
if (Size == 32)
C = static_cast<uint32_t>(C);
- if (!isLegalArithImmed(C))
- return std::nullopt;
- return {{C, P}};
+ if (isLegalArithImmed(C))
+ return {{C, P}};
+
+ auto IsMaterializableInSingleInstruction = [=](uint64_t Imm) {
+ SmallVector<AArch64_IMM::ImmInsnModel> Insn;
+ AArch64_IMM::expandMOVImm(Imm, 32, Insn);
+ return Insn.size() == 1;
+ };
+
+ if (!IsMaterializableInSingleInstruction(OriginalC) &&
+ IsMaterializableInSingleInstruction(C))
+ return {{C, P}};
+
+ return std::nullopt;
}
/// Determine whether or not it is possible to update the RHS and predicate of
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index 31f77be20f34..e9b25924b35f 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -823,8 +823,8 @@ void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
getSelectionDAGFallbackAnalysisUsage(AU);
AU.addRequired<GISelKnownBitsAnalysis>();
AU.addPreserved<GISelKnownBitsAnalysis>();
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.addRequired<GISelCSEAnalysisWrapperPass>();
AU.addPreserved<GISelCSEAnalysisWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
@@ -856,7 +856,8 @@ bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
bool EnableOpt =
MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
- MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
+ MachineDominatorTree *MDT =
+ &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
/*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(),
F.hasMinSize());
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 4aa6999d1d3c..5616d063f70b 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -163,17 +163,18 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(
unsigned PartialMapSrcIdx = PMI_##RBNameSrc##Size - PMI_Min; \
(void)PartialMapDstIdx; \
(void)PartialMapSrcIdx; \
- const ValueMapping *Map = getCopyMapping( \
- AArch64::RBNameDst##RegBankID, AArch64::RBNameSrc##RegBankID, Size); \
+ const ValueMapping *Map = getCopyMapping(AArch64::RBNameDst##RegBankID, \
+ AArch64::RBNameSrc##RegBankID, \
+ TypeSize::getFixed(Size)); \
(void)Map; \
assert(Map[0].BreakDown == \
&AArch64GenRegisterBankInfo::PartMappings[PartialMapDstIdx] && \
- Map[0].NumBreakDowns == 1 && #RBNameDst #Size \
- " Dst is incorrectly initialized"); \
+ Map[0].NumBreakDowns == 1 && \
+ #RBNameDst #Size " Dst is incorrectly initialized"); \
assert(Map[1].BreakDown == \
&AArch64GenRegisterBankInfo::PartMappings[PartialMapSrcIdx] && \
- Map[1].NumBreakDowns == 1 && #RBNameSrc #Size \
- " Src is incorrectly initialized"); \
+ Map[1].NumBreakDowns == 1 && \
+ #RBNameSrc #Size " Src is incorrectly initialized"); \
\
} while (false)
@@ -218,7 +219,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(
unsigned AArch64RegisterBankInfo::copyCost(const RegisterBank &A,
const RegisterBank &B,
- TypeSize Size) const {
+ const TypeSize Size) const {
// What do we do with different size?
// copy are same size.
// Will introduce other hooks for different size:
@@ -258,6 +259,7 @@ AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
case AArch64::QQQRegClassID:
case AArch64::QQQQRegClassID:
case AArch64::ZPRRegClassID:
+ case AArch64::ZPR_3bRegClassID:
return getRegBank(AArch64::FPRRegBankID);
case AArch64::GPR32commonRegClassID:
case AArch64::GPR32RegClassID:
@@ -304,7 +306,7 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings(
case TargetOpcode::G_OR: {
// 32 and 64-bit or can be mapped on either FPR or
// GPR for the same cost.
- unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+ TypeSize Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
if (Size != 32 && Size != 64)
break;
@@ -325,7 +327,7 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings(
return AltMappings;
}
case TargetOpcode::G_BITCAST: {
- unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+ TypeSize Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
if (Size != 32 && Size != 64)
break;
@@ -365,7 +367,7 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings(
return AltMappings;
}
case TargetOpcode::G_LOAD: {
- unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+ TypeSize Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
if (Size != 64)
break;
@@ -377,15 +379,17 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings(
InstructionMappings AltMappings;
const InstructionMapping &GPRMapping = getInstructionMapping(
/*ID*/ 1, /*Cost*/ 1,
- getOperandsMapping({getValueMapping(PMI_FirstGPR, Size),
- // Addresses are GPR 64-bit.
- getValueMapping(PMI_FirstGPR, 64)}),
+ getOperandsMapping(
+ {getValueMapping(PMI_FirstGPR, Size),
+ // Addresses are GPR 64-bit.
+ getValueMapping(PMI_FirstGPR, TypeSize::getFixed(64))}),
/*NumOperands*/ 2);
const InstructionMapping &FPRMapping = getInstructionMapping(
/*ID*/ 2, /*Cost*/ 1,
- getOperandsMapping({getValueMapping(PMI_FirstFPR, Size),
- // Addresses are GPR 64-bit.
- getValueMapping(PMI_FirstGPR, 64)}),
+ getOperandsMapping(
+ {getValueMapping(PMI_FirstFPR, Size),
+ // Addresses are GPR 64-bit.
+ getValueMapping(PMI_FirstGPR, TypeSize::getFixed(64))}),
/*NumOperands*/ 2);
AltMappings.push_back(&GPRMapping);
@@ -437,7 +441,7 @@ AArch64RegisterBankInfo::getSameKindOfOperandsMapping(
"This code is for instructions with 3 or less operands");
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- unsigned Size = Ty.getSizeInBits();
+ TypeSize Size = Ty.getSizeInBits();
bool IsFPR = Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc);
PartialMappingIdx RBIdx = IsFPR ? PMI_FirstFPR : PMI_FirstGPR;
@@ -496,6 +500,20 @@ static bool isFPIntrinsic(const MachineRegisterInfo &MRI,
}
}
+bool AArch64RegisterBankInfo::isPHIWithFPContraints(
+ const MachineInstr &MI, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI, const unsigned Depth) const {
+ if (!MI.isPHI() || Depth > MaxFPRSearchDepth)
+ return false;
+
+ return any_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()),
+ [&](const MachineInstr &UseMI) {
+ if (onlyUsesFP(UseMI, MRI, TRI, Depth + 1))
+ return true;
+ return isPHIWithFPContraints(UseMI, MRI, TRI, Depth + 1);
+ });
+}
+
bool AArch64RegisterBankInfo::hasFPConstraints(const MachineInstr &MI,
const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI,
@@ -700,9 +718,9 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// If both RB are null that means both registers are generic.
// We shouldn't be here.
assert(DstRB && SrcRB && "Both RegBank were nullptr");
- unsigned Size = getSizeInBits(DstReg, MRI, TRI);
+ TypeSize Size = getSizeInBits(DstReg, MRI, TRI);
return getInstructionMapping(
- DefaultMappingID, copyCost(*DstRB, *SrcRB, TypeSize::getFixed(Size)),
+ DefaultMappingID, copyCost(*DstRB, *SrcRB, Size),
getCopyMapping(DstRB->getID(), SrcRB->getID(), Size),
// We only care about the mapping of the destination.
/*NumOperands*/ 1);
@@ -713,7 +731,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case TargetOpcode::G_BITCAST: {
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
- unsigned Size = DstTy.getSizeInBits();
+ TypeSize Size = DstTy.getSizeInBits();
bool DstIsGPR = !DstTy.isVector() && DstTy.getSizeInBits() <= 64;
bool SrcIsGPR = !SrcTy.isVector() && SrcTy.getSizeInBits() <= 64;
const RegisterBank &DstRB =
@@ -721,7 +739,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const RegisterBank &SrcRB =
SrcIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank;
return getInstructionMapping(
- DefaultMappingID, copyCost(DstRB, SrcRB, TypeSize::getFixed(Size)),
+ DefaultMappingID, copyCost(DstRB, SrcRB, Size),
getCopyMapping(DstRB.getID(), SrcRB.getID(), Size),
// We only care about the mapping of the destination for COPY.
/*NumOperands*/ Opc == TargetOpcode::G_BITCAST ? 2 : 1);
@@ -851,13 +869,18 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// instead of blind map every scalar to GPR.
if (any_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()),
[&](const MachineInstr &UseMI) {
- // If we have at least one direct use in a FP instruction,
+ // If we have at least one direct or indirect use
+ // in a FP instruction,
// assume this was a floating point load in the IR. If it was
// not, we would have had a bitcast before reaching that
// instruction.
//
// Int->FP conversion operations are also captured in
// onlyDefinesFP().
+
+ if (isPHIWithFPContraints(UseMI, MRI, TRI))
+ return true;
+
return onlyUsesFP(UseMI, MRI, TRI) ||
onlyDefinesFP(UseMI, MRI, TRI);
}))
@@ -1107,7 +1130,8 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
LLT Ty = MRI.getType(MI.getOperand(Idx).getReg());
if (!Ty.isValid())
continue;
- auto Mapping = getValueMapping(OpRegBankIdx[Idx], OpSize[Idx]);
+ auto Mapping =
+ getValueMapping(OpRegBankIdx[Idx], TypeSize::getFixed(OpSize[Idx]));
if (!Mapping->isValid())
return getInvalidInstructionMapping();
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
index b6364c6a6409..0d89f540650a 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
@@ -70,7 +70,7 @@ protected:
PartialMappingIdx LastAlias,
ArrayRef<PartialMappingIdx> Order);
- static unsigned getRegBankBaseIdxOffset(unsigned RBIdx, unsigned Size);
+ static unsigned getRegBankBaseIdxOffset(unsigned RBIdx, TypeSize Size);
/// Get the pointer to the ValueMapping representing the RegisterBank
/// at \p RBIdx with a size of \p Size.
@@ -80,13 +80,13 @@ protected:
///
/// \pre \p RBIdx != PartialMappingIdx::None
static const RegisterBankInfo::ValueMapping *
- getValueMapping(PartialMappingIdx RBIdx, unsigned Size);
+ getValueMapping(PartialMappingIdx RBIdx, TypeSize Size);
/// Get the pointer to the ValueMapping of the operands of a copy
/// instruction from the \p SrcBankID register bank to the \p DstBankID
/// register bank with a size of \p Size.
static const RegisterBankInfo::ValueMapping *
- getCopyMapping(unsigned DstBankID, unsigned SrcBankID, unsigned Size);
+ getCopyMapping(unsigned DstBankID, unsigned SrcBankID, TypeSize Size);
/// Get the instruction mapping for G_FPEXT.
///
@@ -120,6 +120,13 @@ class AArch64RegisterBankInfo final : public AArch64GenRegisterBankInfo {
/// Maximum recursion depth for hasFPConstraints.
const unsigned MaxFPRSearchDepth = 2;
+ /// \returns true if \p MI is a PHI that its def is used by
+ /// any instruction that onlyUsesFP.
+ bool isPHIWithFPContraints(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ unsigned Depth = 0) const;
+
/// \returns true if \p MI only uses and defines FPRs.
bool hasFPConstraints(const MachineInstr &MI, const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI, unsigned Depth = 0) const;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index f5bea3336cbf..7dba22c066dc 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -28,7 +28,7 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbolELF.h"
@@ -183,7 +183,7 @@ public:
std::move(Emitter)),
MappingSymbolCounter(0), LastEMS(EMS_None) {}
- void changeSection(MCSection *Section, const MCExpr *Subsection) override {
+ void changeSection(MCSection *Section, uint32_t Subsection = 0) override {
// We have to keep track of the mapping symbol state of any sections we
// use. Each one should start off as EMS_None, which is provided as the
// default constructor by DenseMap::lookup.
@@ -248,6 +248,7 @@ public:
emitDataMappingSymbol();
MCObjectStreamer::emitFill(NumBytes, FillValue, Loc);
}
+
private:
enum ElfMappingSymbol {
EMS_None,
@@ -283,7 +284,6 @@ private:
DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
ElfMappingSymbol LastEMS;
};
-
} // end anonymous namespace
AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() {
@@ -299,6 +299,37 @@ void AArch64TargetELFStreamer::emitDirectiveVariantPCS(MCSymbol *Symbol) {
cast<MCSymbolELF>(Symbol)->setOther(ELF::STO_AARCH64_VARIANT_PCS);
}
+void AArch64TargetELFStreamer::finish() {
+ AArch64TargetStreamer::finish();
+ AArch64ELFStreamer &S = getStreamer();
+ MCContext &Ctx = S.getContext();
+ auto &Asm = S.getAssembler();
+ MCSectionELF *MemtagSec = nullptr;
+ for (const MCSymbol &Symbol : Asm.symbols()) {
+ const auto &Sym = cast<MCSymbolELF>(Symbol);
+ if (Sym.isMemtag()) {
+ MemtagSec = Ctx.getELFSection(".memtag.globals.static",
+ ELF::SHT_AARCH64_MEMTAG_GLOBALS_STATIC, 0);
+ break;
+ }
+ }
+ if (!MemtagSec)
+ return;
+
+ // switchSection registers the section symbol and invalidates symbols(). We
+ // need a separate symbols() loop.
+ S.switchSection(MemtagSec);
+ const auto *Zero = MCConstantExpr::create(0, Ctx);
+ for (const MCSymbol &Symbol : Asm.symbols()) {
+ const auto &Sym = cast<MCSymbolELF>(Symbol);
+ if (!Sym.isMemtag())
+ continue;
+ auto *SRE = MCSymbolRefExpr::create(&Sym, MCSymbolRefExpr::VK_None, Ctx);
+ (void)S.emitRelocDirective(*Zero, "BFD_RELOC_NONE", SRE, SMLoc(),
+ *Ctx.getSubtargetInfo());
+ }
+}
+
MCTargetStreamer *
llvm::createAArch64AsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS,
MCInstPrinter *InstPrint,
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
index e8a9dc445b96..ac441ae3b603 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
@@ -93,6 +93,7 @@ private:
void emitInst(uint32_t Inst) override;
void emitDirectiveVariantPCS(MCSymbol *Symbol) override;
+ void finish() override;
public:
AArch64TargetELFStreamer(MCStreamer &S) : AArch64TargetStreamer(S) {}
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index b21b1faf5c96..3087f6090379 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -111,6 +111,12 @@ class sem2p1_zero_matrix_pseudo<string name, Operand index_ty, SMEMatrixTypeEnum
let usesCustomInserter = 1;
}
+class sme2_movez_to_tile_multi_pseudo<string name, Operand tile_imm, Operand imm_ty, RegisterOperand vector_ty, SMEMatrixTypeEnum za_flag>
+ : SMEPseudo2Instr<name, 0>,
+ Pseudo<(outs vector_ty:$Zn), (ins tile_imm:$tile, MatrixIndexGPR32Op12_15:$Rs, imm_ty:$imm), []> {
+ let SMEMatrixType = za_flag;
+ let usesCustomInserter = 1;
+}
//===----------------------------------------------------------------------===//
// SME pattern match helpers.
//===----------------------------------------------------------------------===//
@@ -4000,7 +4006,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo
def _B : sme2_mova_tile_to_vec_vg2_multi_base<0b00, v, opc, ZZ_b_mul_r,
!if(v, TileVectorOpV8,
TileVectorOpH8),
- uimm3s2range, mnemonic> {
+ uimm3s2range, mnemonic>, SMEPseudo2Instr<NAME # _B, 1> {
bits<3> imm;
let Inst{7-5} = imm;
}
@@ -4008,7 +4014,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo
def _H : sme2_mova_tile_to_vec_vg2_multi_base<0b01, v, opc, ZZ_h_mul_r,
!if(v, TileVectorOpV16,
TileVectorOpH16),
- uimm2s2range, mnemonic> {
+ uimm2s2range, mnemonic>, SMEPseudo2Instr<NAME # _H, 1> {
bits<1> ZAn;
bits<2> imm;
let Inst{7} = ZAn;
@@ -4018,7 +4024,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo
def _S : sme2_mova_tile_to_vec_vg2_multi_base<0b10, v, opc, ZZ_s_mul_r,
!if(v, TileVectorOpV32,
TileVectorOpH32),
- uimm1s2range, mnemonic> {
+ uimm1s2range, mnemonic>, SMEPseudo2Instr<NAME # _S, 1> {
bits<2> ZAn;
bits<1> imm;
let Inst{7-6} = ZAn;
@@ -4028,7 +4034,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo
def _D : sme2_mova_tile_to_vec_vg2_multi_base<0b11, v, opc, ZZ_d_mul_r,
!if(v, TileVectorOpV64,
TileVectorOpH64),
- uimm0s2range, mnemonic> {
+ uimm0s2range, mnemonic>, SMEPseudo2Instr<NAME # _D, 1> {
bits<3> ZAn;
let Inst{7-5} = ZAn;
}
@@ -4097,6 +4103,17 @@ multiclass sme2_mova_tile_to_vec_vg2_multi<string mnemonic>{
multiclass sme2p1_movaz_tile_to_vec_vg2<string mnemonic>{
defm _H : sme2_mova_tile_to_vec_vg2_multi_inst<0b0, 0b010, mnemonic>;
defm _V : sme2_mova_tile_to_vec_vg2_multi_inst<0b1, 0b010, mnemonic>;
+
+
+ def NAME # _H_B_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _H_B, sme_elm_idx0_0, uimm3s2range, ZZ_b_mul_r, SMEMatrixTileB>;
+ def NAME # _H_H_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _H_H, sme_elm_idx0_1, uimm2s2range, ZZ_h_mul_r, SMEMatrixTileH>;
+ def NAME # _H_S_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _H_S, sme_elm_idx0_3, uimm1s2range, ZZ_s_mul_r, SMEMatrixTileS>;
+ def NAME # _H_D_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _H_D, sme_elm_idx0_7, uimm0s2range, ZZ_d_mul_r, SMEMatrixTileD>;
+
+ def NAME # _V_B_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _V_B, sme_elm_idx0_0, uimm3s2range, ZZ_b_mul_r, SMEMatrixTileB>;
+ def NAME # _V_H_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _V_H, sme_elm_idx0_1, uimm2s2range, ZZ_h_mul_r, SMEMatrixTileH>;
+ def NAME # _V_S_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _V_S, sme_elm_idx0_3, uimm1s2range, ZZ_s_mul_r, SMEMatrixTileS>;
+ def NAME # _V_D_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _V_D, sme_elm_idx0_7, uimm0s2range, ZZ_d_mul_r, SMEMatrixTileD>;
}
class sme2_mova_tile_to_vec_vg4_multi_base<bits<2> sz, bit v, bits<6> op,
@@ -4130,7 +4147,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo
ZZZZ_b_mul_r,
!if(v, TileVectorOpV8,
TileVectorOpH8),
- uimm2s4range, mnemonic> {
+ uimm2s4range, mnemonic>, SMEPseudo2Instr<NAME # _B, 1> {
bits<2> imm;
let Inst{6-5} = imm;
}
@@ -4139,7 +4156,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo
ZZZZ_h_mul_r,
!if(v, TileVectorOpV16,
TileVectorOpH16),
- uimm1s4range, mnemonic> {
+ uimm1s4range, mnemonic>, SMEPseudo2Instr<NAME # _H, 1> {
bits<1> ZAn;
bits<1> imm;
let Inst{6} = ZAn;
@@ -4150,7 +4167,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo
ZZZZ_s_mul_r,
!if(v, TileVectorOpV32,
TileVectorOpH32),
- uimm0s4range, mnemonic> {
+ uimm0s4range, mnemonic>, SMEPseudo2Instr<NAME # _S, 1> {
bits<2> ZAn;
let Inst{6-5} = ZAn;
}
@@ -4159,7 +4176,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo
ZZZZ_d_mul_r,
!if(v, TileVectorOpV64,
TileVectorOpH64),
- uimm0s4range, mnemonic> {
+ uimm0s4range, mnemonic>, SMEPseudo2Instr<NAME # _D, 1> {
bits<3> ZAn;
let Inst{7-5} = ZAn;
}
@@ -4228,6 +4245,16 @@ multiclass sme2_mova_tile_to_vec_vg4_multi<string mnemonic>{
multiclass sme2p1_movaz_tile_to_vec_vg4<string mnemonic>{
defm _H : sme2_mova_tile_to_vec_vg4_multi_base<0b0, 0b110, mnemonic>;
defm _V : sme2_mova_tile_to_vec_vg4_multi_base<0b1, 0b110, mnemonic>;
+
+ def NAME # _H_B_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _H_B, sme_elm_idx0_0, uimm2s4range, ZZZZ_b_mul_r, SMEMatrixTileB>;
+ def NAME # _H_H_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _H_H, sme_elm_idx0_1, uimm1s4range, ZZZZ_h_mul_r, SMEMatrixTileH>;
+ def NAME # _H_S_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _H_S, sme_elm_idx0_3, uimm0s4range, ZZZZ_s_mul_r, SMEMatrixTileS>;
+ def NAME # _H_D_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _H_D, sme_elm_idx0_7, uimm0s4range, ZZZZ_d_mul_r, SMEMatrixTileD>;
+
+ def NAME # _V_B_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _V_B, sme_elm_idx0_0, uimm2s4range, ZZZZ_b_mul_r, SMEMatrixTileB>;
+ def NAME # _V_H_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _V_H, sme_elm_idx0_1, uimm1s4range, ZZZZ_h_mul_r, SMEMatrixTileH>;
+ def NAME # _V_S_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _V_S, sme_elm_idx0_3, uimm0s4range, ZZZZ_s_mul_r, SMEMatrixTileS>;
+ def NAME # _V_D_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _V_D, sme_elm_idx0_7, uimm0s4range, ZZZZ_d_mul_r, SMEMatrixTileD>;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index d0d7a9dc1724..63d83346528a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -351,6 +351,7 @@ def FeatureGFX90AInsts : SubtargetFeature<"gfx90a-insts",
"GFX90AInsts",
"true",
"Additional instructions for GFX90A+"
+ // [HasAtomicFMinFMaxF64GlobalInsts, HasAtomicFMinFMaxF64FlatInsts] // TODO
>;
def FeatureGFX940Insts : SubtargetFeature<"gfx940-insts",
@@ -711,6 +712,30 @@ def FeatureAtomicFaddRtnInsts : SubtargetFeature<"atomic-fadd-rtn-insts",
[FeatureFlatGlobalInsts]
>;
+def FeatureAtomicFMinFMaxF32GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-global-f32",
+ "HasAtomicFMinFMaxF32GlobalInsts",
+ "true",
+ "Has global/buffer instructions for atomicrmw fmin/fmax for float"
+>;
+
+def FeatureAtomicFMinFMaxF64GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-global-f64",
+ "HasAtomicFMinFMaxF64GlobalInsts",
+ "true",
+ "Has global/buffer instructions for atomicrmw fmin/fmax for float"
+>;
+
+def FeatureAtomicFMinFMaxF32FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f32",
+ "HasAtomicFMinFMaxF32FlatInsts",
+ "true",
+ "Has flat memory instructions for atomicrmw fmin/fmax for float"
+>;
+
+def FeatureAtomicFMinFMaxF64FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f64",
+ "HasAtomicFMinFMaxF64FlatInsts",
+ "true",
+ "Has flat memory instructions for atomicrmw fmin/fmax for double"
+>;
+
def FeatureAtomicFaddNoRtnInsts : SubtargetFeature<"atomic-fadd-no-rtn-insts",
"HasAtomicFaddNoRtnInsts",
"true",
@@ -743,6 +768,12 @@ def FeatureAtomicGlobalPkAddBF16Inst : SubtargetFeature<"atomic-global-pk-add-bf
[FeatureFlatGlobalInsts]
>;
+def FeatureAtomicBufferPkAddBF16Inst : SubtargetFeature<"atomic-buffer-pk-add-bf16-inst",
+ "HasAtomicBufferPkAddBF16Inst",
+ "true",
+ "Has buffer_atomic_pk_add_bf16 instruction"
+>;
+
def FeatureAtomicCSubNoRtnInsts : SubtargetFeature<"atomic-csub-no-rtn-insts",
"HasAtomicCSubNoRtnInsts",
"true",
@@ -1061,7 +1092,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts,
- FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
+ FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
+ FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts
]
>;
@@ -1072,7 +1104,9 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange,
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess,
- FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
+ FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
+ FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
+ FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts
]
>;
@@ -1127,7 +1161,9 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts,
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
- FeatureMaxHardClauseLength63
+ FeatureMaxHardClauseLength63,
+ FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
+ FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts
]
>;
@@ -1148,7 +1184,8 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
FeatureA16, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
FeatureGWS, FeatureDefaultComponentZero,
- FeatureMaxHardClauseLength32
+ FeatureMaxHardClauseLength32,
+ FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts
]
>;
@@ -1169,7 +1206,8 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
FeatureA16, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast,
- FeatureMaxHardClauseLength32
+ FeatureMaxHardClauseLength32,
+ FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts
]
>;
@@ -1332,7 +1370,10 @@ def FeatureISAVersion9_0_A : FeatureSet<
FeaturePackedTID,
FullRate64Ops,
FeatureBackOffBarrier,
- FeatureKernargPreload])>;
+ FeatureKernargPreload,
+ FeatureAtomicFMinFMaxF64GlobalInsts,
+ FeatureAtomicFMinFMaxF64FlatInsts
+ ])>;
def FeatureISAVersion9_0_C : FeatureSet<
!listconcat(FeatureISAVersion9_0_Consumer_Common.Features,
@@ -1372,7 +1413,10 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureArchitectedFlatScratch,
FullRate64Ops,
FeatureBackOffBarrier,
- FeatureKernargPreload]>;
+ FeatureKernargPreload,
+ FeatureAtomicFMinFMaxF64GlobalInsts,
+ FeatureAtomicFMinFMaxF64FlatInsts
+ ]>;
def FeatureISAVersion9_4_0 : FeatureSet<
!listconcat(FeatureISAVersion9_4_Common.Features,
@@ -1561,6 +1605,7 @@ def FeatureISAVersion12 : FeatureSet<
FeatureAtomicFlatPkAdd16Insts,
FeatureAtomicBufferGlobalPkAddF16Insts,
FeatureAtomicGlobalPkAddBF16Inst,
+ FeatureAtomicBufferPkAddBF16Inst,
FeatureFlatAtomicFaddF32Inst,
FeatureImageInsts,
FeatureExtendedImageInsts,
@@ -1572,7 +1617,9 @@ def FeatureISAVersion12 : FeatureSet<
FeatureHasRestrictedSOffset,
FeatureVGPRSingleUseHintInsts,
FeatureScalarDwordx3Loads,
- FeatureDPPSrc1SGPR]>;
+ FeatureDPPSrc1SGPR,
+ FeatureMaxHardClauseLength32,
+ Feature1_5xVGPRs]>;
def FeatureISAVersion12_Generic: FeatureSet<
!listconcat(FeatureISAVersion12.Features,
@@ -1862,9 +1909,28 @@ def isGFX12Plus :
def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
AssemblerPredicate<(all_of FeatureFlatAddressSpace)>;
-def HasBufferFlatGlobalAtomicsF64 :
+
+def HasBufferFlatGlobalAtomicsF64 : // FIXME: Rename to show it's only for fadd
Predicate<"Subtarget->hasBufferFlatGlobalAtomicsF64()">,
- AssemblerPredicate<(any_of FeatureGFX90AInsts)>;
+ // FIXME: This is too coarse, and working around using pseudo's predicates on real instruction.
+ AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX10Insts, FeatureSouthernIslands, FeatureSeaIslands)>;
+
+def HasAtomicFMinFMaxF32GlobalInsts :
+ Predicate<"Subtarget->hasAtomicFMinFMaxF32GlobalInsts()">,
+ AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF32GlobalInsts)>;
+
+def HasAtomicFMinFMaxF64GlobalInsts :
+ Predicate<"Subtarget->hasAtomicFMinFMaxF64GlobalInsts()">,
+ AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF64GlobalInsts)>;
+
+def HasAtomicFMinFMaxF32FlatInsts :
+ Predicate<"Subtarget->hasAtomicFMinFMaxF32FlatInsts()">,
+ AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF32FlatInsts)>;
+
+def HasAtomicFMinFMaxF64FlatInsts :
+ Predicate<"Subtarget->hasAtomicFMinFMaxF64FlatInsts()">,
+ AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF64FlatInsts)>;
+
def HasLdsAtomicAddF64 :
Predicate<"Subtarget->hasLdsAtomicAddF64()">,
AssemblerPredicate<(any_of FeatureGFX90AInsts)>;
@@ -2118,7 +2184,10 @@ def HasAtomicBufferGlobalPkAddF16Insts
AssemblerPredicate<(all_of FeatureAtomicBufferGlobalPkAddF16Insts)>;
def HasAtomicGlobalPkAddBF16Inst
: Predicate<"Subtarget->hasAtomicGlobalPkAddBF16Inst()">,
- AssemblerPredicate<(all_of FeatureAtomicGlobalPkAddBF16Inst)>;
+ AssemblerPredicate<(all_of FeatureAtomicGlobalPkAddBF16Inst)>;
+def HasAtomicBufferPkAddBF16Inst
+ : Predicate<"Subtarget->hasAtomicBufferPkAddBF16Inst()">,
+ AssemblerPredicate<(all_of FeatureAtomicBufferPkAddBF16Inst)>;
def HasFlatAtomicFaddF32Inst
: Predicate<"Subtarget->hasFlatAtomicFaddF32Inst()">,
AssemblerPredicate<(all_of FeatureFlatAtomicFaddF32Inst)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index de25f9241a50..f57fc168c1df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -115,6 +115,9 @@ AMDGPUFunctionArgInfo::getPreloadedValue(
return std::tuple(
PrivateSegmentWaveByteOffset ? &PrivateSegmentWaveByteOffset : nullptr,
&AMDGPU::SGPR_32RegClass, LLT::scalar(32));
+ case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_SIZE:
+ return {PrivateSegmentSize ? &PrivateSegmentSize : nullptr,
+ &AMDGPU::SGPR_32RegClass, LLT::scalar(32)};
case AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR:
return std::tuple(KernargSegmentPtr ? &KernargSegmentPtr : nullptr,
&AMDGPU::SGPR_64RegClass,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index 42b33c50d9f8..2e02bb4271ad 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -114,11 +114,12 @@ struct AMDGPUFunctionArgInfo {
PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
IMPLICIT_BUFFER_PTR = 15,
IMPLICIT_ARG_PTR = 16,
+ PRIVATE_SEGMENT_SIZE = 17,
// VGPRS:
- WORKITEM_ID_X = 17,
- WORKITEM_ID_Y = 18,
- WORKITEM_ID_Z = 19,
+ WORKITEM_ID_X = 18,
+ WORKITEM_ID_Y = 19,
+ WORKITEM_ID_Z = 20,
FIRST_VGPR_VALUE = WORKITEM_ID_X
};
// clang-format on
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index cad4a3430327..e49925f86bd9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -29,6 +29,7 @@
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDKernelCodeTUtils.h"
+#include "Utils/SIDefinesUtils.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -135,15 +136,6 @@ void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
getTargetStreamer()->getPALMetadata()->readFromIR(M);
}
-uint64_t AMDGPUAsmPrinter::getMCExprValue(const MCExpr *Value, MCContext &Ctx) {
- int64_t Val;
- if (!Value->evaluateAsAbsolute(Val)) {
- Ctx.reportError(SMLoc(), "could not resolve expression when required.");
- return 0;
- }
- return static_cast<uint64_t>(Val);
-}
-
void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
// Init target streamer if it has not yet happened
if (!IsTargetStreamerInitialized)
@@ -248,14 +240,14 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
getNameWithPrefix(KernelName, &MF->getFunction());
getTargetStreamer()->EmitAmdhsaKernelDescriptor(
STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
- getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Context),
- getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Context) -
- IsaInfo::getNumExtraSGPRs(
- &STM, getMCExprValue(CurrentProgramInfo.VCCUsed, Context),
- getMCExprValue(CurrentProgramInfo.FlatUsed, Context),
- getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
- getMCExprValue(CurrentProgramInfo.VCCUsed, Context),
- getMCExprValue(CurrentProgramInfo.FlatUsed, Context));
+ CurrentProgramInfo.NumVGPRsForWavesPerEU,
+ MCBinaryExpr::createSub(
+ CurrentProgramInfo.NumSGPRsForWavesPerEU,
+ AMDGPUMCExpr::createExtraSGPRs(
+ CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
+ getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
+ Context),
+ CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
Streamer.popSection();
}
@@ -400,9 +392,40 @@ void AMDGPUAsmPrinter::emitCommonFunctionComments(
false);
}
-uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
+SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
+ SmallString<128> Str;
+ raw_svector_ostream OSS(Str);
+ int64_t IVal;
+ if (Value->evaluateAsAbsolute(IVal)) {
+ OSS << static_cast<uint64_t>(IVal);
+ } else {
+ Value->print(OSS, MAI);
+ }
+ return Str;
+}
+
+void AMDGPUAsmPrinter::emitCommonFunctionComments(
+ const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
+ const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
+ const AMDGPUMachineFunction *MFI) {
+ OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
+ OutStreamer->emitRawComment(" NumSgprs: " + getMCExprStr(NumSGPR), false);
+ OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
+ if (NumAGPR && TotalNumVGPR) {
+ OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
+ OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
+ false);
+ }
+ OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
+ false);
+ OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
+ false);
+}
+
+const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
const MachineFunction &MF) const {
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ MCContext &Ctx = MF.getContext();
uint16_t KernelCodeProperties = 0;
const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
@@ -430,16 +453,28 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
}
+ if (UserSGPRInfo.hasPrivateSegmentSize()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
+ }
if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
}
- if (getMCExprValue(CurrentProgramInfo.DynamicCallStack, MF.getContext()) &&
- CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
- KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;
-
- return KernelCodeProperties;
+ // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
+ // un-evaluatable at this point so it cannot be conditionally checked here.
+ // Instead, we'll directly shift the possibly unknown MCExpr into its place
+ // and bitwise-or it into KernelCodeProperties.
+ const MCExpr *KernelCodePropExpr =
+ MCConstantExpr::create(KernelCodeProperties, Ctx);
+ const MCExpr *OrValue = MCConstantExpr::create(
+ amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
+ OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
+ OrValue, Ctx);
+ KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);
+
+ return KernelCodePropExpr;
}
MCKernelDescriptor
@@ -462,11 +497,15 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
- KernelDescriptor.kernel_code_properties =
- MCConstantExpr::create(getAmdhsaKernelCodeProperties(MF), Ctx);
-
- assert(STM.hasGFX90AInsts() ||
- getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx) == 0);
+ KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
+
+ int64_t PGRM_Rsrc3 = 1;
+ bool EvaluatableRsrc3 =
+ CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(PGRM_Rsrc3);
+ (void)PGRM_Rsrc3;
+ (void)EvaluatableRsrc3;
+ assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
+ static_cast<uint64_t>(PGRM_Rsrc3) == 0);
KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
KernelDescriptor.kernarg_preload = MCConstantExpr::create(
@@ -554,13 +593,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer->emitRawComment(" Kernel info:", false);
emitCommonFunctionComments(
- getMCExprValue(CurrentProgramInfo.NumArchVGPR, Ctx),
- STM.hasMAIInsts() ? getMCExprValue(CurrentProgramInfo.NumAccVGPR, Ctx)
- : std::optional<uint32_t>(),
- getMCExprValue(CurrentProgramInfo.NumVGPR, Ctx),
- getMCExprValue(CurrentProgramInfo.NumSGPR, Ctx),
- getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx),
- getFunctionCodeSize(MF), MFI);
+ CurrentProgramInfo.NumArchVGPR,
+ STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
+ CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
+ CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
OutStreamer->emitRawComment(
" FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
@@ -571,43 +607,38 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
" bytes/workgroup (compile time only)", false);
OutStreamer->emitRawComment(
- " SGPRBlocks: " +
- Twine(getMCExprValue(CurrentProgramInfo.SGPRBlocks, Ctx)),
- false);
+ " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);
+
OutStreamer->emitRawComment(
- " VGPRBlocks: " +
- Twine(getMCExprValue(CurrentProgramInfo.VGPRBlocks, Ctx)),
- false);
+ " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);
OutStreamer->emitRawComment(
" NumSGPRsForWavesPerEU: " +
- Twine(
- getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx)),
+ getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
false);
OutStreamer->emitRawComment(
" NumVGPRsForWavesPerEU: " +
- Twine(
- getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx)),
+ getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
false);
- if (STM.hasGFX90AInsts())
+ if (STM.hasGFX90AInsts()) {
+ const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
+ CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
+ AdjustedAccum = MCBinaryExpr::createMul(
+ AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
OutStreamer->emitRawComment(
- " AccumOffset: " +
- Twine((getMCExprValue(CurrentProgramInfo.AccumOffset, Ctx) + 1) *
- 4),
- false);
+ " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
+ }
OutStreamer->emitRawComment(
- " Occupancy: " +
- Twine(getMCExprValue(CurrentProgramInfo.Occupancy, Ctx)),
- false);
+ " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
OutStreamer->emitRawComment(
" WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
- Twine(getMCExprValue(CurrentProgramInfo.ScratchEnable, Ctx)),
+ getMCExprStr(CurrentProgramInfo.ScratchEnable),
false);
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
Twine(CurrentProgramInfo.UserSGPR),
@@ -628,20 +659,25 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
Twine(CurrentProgramInfo.TIdIGCompCount),
false);
+ [[maybe_unused]] int64_t PGMRSrc3;
assert(STM.hasGFX90AInsts() ||
- getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx) == 0);
+ (CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(
+ PGMRSrc3) &&
+ static_cast<uint64_t>(PGMRSrc3) == 0));
if (STM.hasGFX90AInsts()) {
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
- Twine((AMDHSA_BITS_GET(
- getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx),
- amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
+ getMCExprStr(MCKernelDescriptor::bits_get(
+ CurrentProgramInfo.ComputePGMRSrc3GFX90A,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
false);
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
- Twine((AMDHSA_BITS_GET(
- getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx),
- amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
+ getMCExprStr(MCKernelDescriptor::bits_get(
+ CurrentProgramInfo.ComputePGMRSrc3GFX90A,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
false);
}
}
@@ -765,7 +801,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// The calculations related to SGPR/VGPR blocks are
// duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
// unified.
- const MCExpr *ExtraSGPRs = AMDGPUVariadicMCExpr::createExtraSGPRs(
+ const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
ProgInfo.VCCUsed, ProgInfo.FlatUsed,
getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
@@ -858,27 +894,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
}
}
}
- ProgInfo.NumSGPR = AMDGPUVariadicMCExpr::createMax(
+ ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(
{ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
- ProgInfo.NumArchVGPR = AMDGPUVariadicMCExpr::createMax(
+ ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(
{ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
- ProgInfo.NumVGPR = AMDGPUVariadicMCExpr::createTotalNumVGPR(
+ ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
}
// Adjust number of registers used to meet default/requested minimum/maximum
// number of waves per execution unit request.
unsigned MaxWaves = MFI->getMaxWavesPerEU();
- ProgInfo.NumSGPRsForWavesPerEU = AMDGPUVariadicMCExpr::createMax(
- {ProgInfo.NumSGPR, CreateExpr(1ul),
- CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
- Ctx);
- ProgInfo.NumVGPRsForWavesPerEU = AMDGPUVariadicMCExpr::createMax(
- {ProgInfo.NumVGPR, CreateExpr(1ul),
- CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
- Ctx);
+ ProgInfo.NumSGPRsForWavesPerEU =
+ AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
+ CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
+ Ctx);
+ ProgInfo.NumVGPRsForWavesPerEU =
+ AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
+ CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
+ Ctx);
if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
STM.hasSGPRInitBug()) {
@@ -927,10 +963,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
unsigned Granule) {
const MCExpr *OneConst = CreateExpr(1ul);
const MCExpr *GranuleConst = CreateExpr(Granule);
- const MCExpr *MaxNumGPR =
- AMDGPUVariadicMCExpr::createMax({NumGPR, OneConst}, Ctx);
+ const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
const MCExpr *AlignToGPR =
- AMDGPUVariadicMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
+ AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
const MCExpr *DivGPR =
MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
@@ -972,7 +1007,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// The MCExpr equivalent of divideCeil.
auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
const MCExpr *Ceil =
- AMDGPUVariadicMCExpr::createAlignTo(Numerator, Denominator, Ctx);
+ AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
};
@@ -1045,7 +1080,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
}
- ProgInfo.Occupancy = AMDGPUVariadicMCExpr::createOccupancy(
+ ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
@@ -1207,41 +1242,49 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
auto &Ctx = MF.getContext();
MD->setEntryPoint(CC, MF.getFunction().getName());
- MD->setNumUsedVgprs(
- CC, getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx));
+ MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
// Only set AGPRs for supported devices
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
if (STM.hasMAIInsts()) {
- MD->setNumUsedAgprs(CC, getMCExprValue(CurrentProgramInfo.NumAccVGPR, Ctx));
+ MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
}
- MD->setNumUsedSgprs(
- CC, getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx));
+ MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
if (MD->getPALMajorVersion() < 3) {
- MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM));
+ MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
if (AMDGPU::isCompute(CC)) {
- MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
+ MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
} else {
- if (getMCExprValue(CurrentProgramInfo.ScratchBlocks, Ctx) > 0)
- MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
+ const MCExpr *HasScratchBlocks =
+ MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
+ MCConstantExpr::create(0, Ctx), Ctx);
+ auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
+ MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
}
} else {
MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
- MD->setHwStage(CC, ".scratch_en",
- (bool)getMCExprValue(CurrentProgramInfo.ScratchEnable, Ctx));
+ MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
+ CurrentProgramInfo.ScratchEnable);
EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
}
// ScratchSize is in bytes, 16 aligned.
MD->setScratchSize(
- CC, alignTo(getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx), 16));
+ CC,
+ AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
+ MCConstantExpr::create(16, Ctx), Ctx),
+ Ctx);
+
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
: CurrentProgramInfo.LDSBlocks;
if (MD->getPALMajorVersion() < 3) {
- MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
+ MD->setRsrc2(
+ CC,
+ MCConstantExpr::create(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize), Ctx),
+ Ctx);
MD->setSpiPsInputEna(MFI->getPSInputEnable());
MD->setSpiPsInputAddr(MFI->getPSInputAddr());
} else {
@@ -1288,20 +1331,19 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
if (MD->getPALMajorVersion() < 3) {
// Set compute registers
- MD->setRsrc1(CallingConv::AMDGPU_CS,
- CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST));
+ MD->setRsrc1(
+ CallingConv::AMDGPU_CS,
+ CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
MD->setRsrc2(CallingConv::AMDGPU_CS,
- CurrentProgramInfo.getComputePGMRSrc2());
+ CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
} else {
EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
}
// Set optional info
MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
- MD->setFunctionNumUsedVgprs(
- FnName, getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx));
- MD->setFunctionNumUsedSgprs(
- FnName, getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx));
+ MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
+ MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
}
// This is supposed to be log2(Size)
@@ -1362,6 +1404,9 @@ void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
if (UserSGPRInfo.hasFlatScratchInit())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
+ if (UserSGPRInfo.hasPrivateSegmentSize())
+ Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
+
if (UserSGPRInfo.hasDispatchPtr())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
@@ -1463,28 +1508,26 @@ void AMDGPUAsmPrinter::emitResourceUsageRemarks(
// remarks to simulate newlines. If and when clang does accept newlines, this
// formatting should be aggregated into one remark with newlines to avoid
// printing multiple diagnostic location and diag opts.
- MCContext &MCCtx = MF.getContext();
EmitResourceUsageRemark("FunctionName", "Function Name",
MF.getFunction().getName());
EmitResourceUsageRemark("NumSGPR", "SGPRs",
- getMCExprValue(CurrentProgramInfo.NumSGPR, MCCtx));
- EmitResourceUsageRemark(
- "NumVGPR", "VGPRs",
- getMCExprValue(CurrentProgramInfo.NumArchVGPR, MCCtx));
+ getMCExprStr(CurrentProgramInfo.NumSGPR));
+ EmitResourceUsageRemark("NumVGPR", "VGPRs",
+ getMCExprStr(CurrentProgramInfo.NumArchVGPR));
if (hasMAIInsts) {
- EmitResourceUsageRemark(
- "NumAGPR", "AGPRs",
- getMCExprValue(CurrentProgramInfo.NumAccVGPR, MCCtx));
+ EmitResourceUsageRemark("NumAGPR", "AGPRs",
+ getMCExprStr(CurrentProgramInfo.NumAccVGPR));
}
- EmitResourceUsageRemark(
- "ScratchSize", "ScratchSize [bytes/lane]",
- getMCExprValue(CurrentProgramInfo.ScratchSize, MCCtx));
+ EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
+ getMCExprStr(CurrentProgramInfo.ScratchSize));
+ int64_t DynStack;
+ bool DynStackEvaluatable =
+ CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
StringRef DynamicStackStr =
- getMCExprValue(CurrentProgramInfo.DynamicCallStack, MCCtx) ? "True"
- : "False";
+ DynStackEvaluatable && DynStack ? "True" : "False";
EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
- getMCExprValue(CurrentProgramInfo.Occupancy, MCCtx));
+ getMCExprStr(CurrentProgramInfo.Occupancy));
EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
CurrentProgramInfo.SGPRSpill);
EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 87156f27fc6c..f70a60aef007 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -65,12 +65,16 @@ private:
uint32_t TotalNumVGPR, uint32_t NumSGPR,
uint64_t ScratchSize, uint64_t CodeSize,
const AMDGPUMachineFunction *MFI);
+ void emitCommonFunctionComments(const MCExpr *NumVGPR, const MCExpr *NumAGPR,
+ const MCExpr *TotalNumVGPR,
+ const MCExpr *NumSGPR,
+ const MCExpr *ScratchSize, uint64_t CodeSize,
+ const AMDGPUMachineFunction *MFI);
void emitResourceUsageRemarks(const MachineFunction &MF,
const SIProgramInfo &CurrentProgramInfo,
bool isModuleEntryFunction, bool hasMAIInsts);
- uint16_t getAmdhsaKernelCodeProperties(
- const MachineFunction &MF) const;
+ const MCExpr *getAmdhsaKernelCodeProperties(const MachineFunction &MF) const;
AMDGPU::MCKernelDescriptor
getAmdhsaKernelDescriptor(const MachineFunction &MF,
@@ -78,7 +82,7 @@ private:
void initTargetStreamer(Module &M);
- static uint64_t getMCExprValue(const MCExpr *Value, MCContext &Ctx);
+ SmallString<128> getMCExprStr(const MCExpr *Value);
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 1d645002b1fe..d7ef6f3c5dc4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -249,63 +249,54 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
switch (I.getIntrinsicID()) {
default:
return;
- case Intrinsic::amdgcn_buffer_atomic_add:
case Intrinsic::amdgcn_struct_buffer_atomic_add:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
case Intrinsic::amdgcn_raw_buffer_atomic_add:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
Op = AtomicRMWInst::Add;
break;
- case Intrinsic::amdgcn_buffer_atomic_sub:
case Intrinsic::amdgcn_struct_buffer_atomic_sub:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
case Intrinsic::amdgcn_raw_buffer_atomic_sub:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
Op = AtomicRMWInst::Sub;
break;
- case Intrinsic::amdgcn_buffer_atomic_and:
case Intrinsic::amdgcn_struct_buffer_atomic_and:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
case Intrinsic::amdgcn_raw_buffer_atomic_and:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
Op = AtomicRMWInst::And;
break;
- case Intrinsic::amdgcn_buffer_atomic_or:
case Intrinsic::amdgcn_struct_buffer_atomic_or:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
case Intrinsic::amdgcn_raw_buffer_atomic_or:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
Op = AtomicRMWInst::Or;
break;
- case Intrinsic::amdgcn_buffer_atomic_xor:
case Intrinsic::amdgcn_struct_buffer_atomic_xor:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
case Intrinsic::amdgcn_raw_buffer_atomic_xor:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
Op = AtomicRMWInst::Xor;
break;
- case Intrinsic::amdgcn_buffer_atomic_smin:
case Intrinsic::amdgcn_struct_buffer_atomic_smin:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
case Intrinsic::amdgcn_raw_buffer_atomic_smin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
Op = AtomicRMWInst::Min;
break;
- case Intrinsic::amdgcn_buffer_atomic_umin:
case Intrinsic::amdgcn_struct_buffer_atomic_umin:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
case Intrinsic::amdgcn_raw_buffer_atomic_umin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
Op = AtomicRMWInst::UMin;
break;
- case Intrinsic::amdgcn_buffer_atomic_smax:
case Intrinsic::amdgcn_struct_buffer_atomic_smax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
case Intrinsic::amdgcn_raw_buffer_atomic_smax:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
Op = AtomicRMWInst::Max;
break;
- case Intrinsic::amdgcn_buffer_atomic_umax:
case Intrinsic::amdgcn_struct_buffer_atomic_umax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
case Intrinsic::amdgcn_raw_buffer_atomic_umax:
@@ -413,7 +404,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
assert(ST->hasPermLaneX16());
V = B.CreateBitCast(V, IntNTy);
Value *Permlanex16Call = B.CreateIntrinsic(
- Intrinsic::amdgcn_permlanex16, {},
+ V->getType(), Intrinsic::amdgcn_permlanex16,
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
B.CreateBitCast(Permlanex16Call, AtomicTy));
@@ -425,7 +416,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
// Reduce across the upper and lower 32 lanes.
V = B.CreateBitCast(V, IntNTy);
Value *Permlane64Call =
- B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V);
+ B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
B.CreateBitCast(Permlane64Call, AtomicTy));
}
@@ -433,7 +424,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
// Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
// combine them with a scalar operation.
Function *ReadLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
V = B.CreateBitCast(V, IntNTy);
Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
@@ -481,7 +472,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
assert(ST->hasPermLaneX16());
V = B.CreateBitCast(V, IntNTy);
Value *PermX = B.CreateIntrinsic(
- Intrinsic::amdgcn_permlanex16, {},
+ V->getType(), Intrinsic::amdgcn_permlanex16,
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
Value *UpdateDPPCall =
@@ -523,10 +514,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
{Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
B.getInt32(0xf), B.getFalse()});
} else {
- Function *ReadLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
- Function *WriteLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+ Function *ReadLane = Intrinsic::getDeclaration(
+ M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
+ Function *WriteLane = Intrinsic::getDeclaration(
+ M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
// On GFX10 all DPP operations are confined to a single row. To get cross-
// row operations we have to use permlane or readlane.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 231db188e65d..537d3a43aa9f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -271,11 +271,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>;
// FIXME: Check MMO is atomic
def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>;
def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, SIatomic_fmin>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_FMAX, SIatomic_fmax>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, atomic_load_fmin_glue>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_FMAX, atomic_load_fmax_glue>;
-
+def : GINodeEquiv<G_ATOMICRMW_FMIN, atomic_load_fmin_glue>;
+def : GINodeEquiv<G_ATOMICRMW_FMAX, atomic_load_fmax_glue>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SWAP, SIbuffer_atomic_swap>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_ADD, SIbuffer_atomic_add>;
@@ -290,7 +287,6 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD, SIbuffer_atomic_fadd>;
-def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD_BF16, SIbuffer_atomic_fadd_bf16>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
index a0c6bf7cc31c..fb258547e8fb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
@@ -46,8 +46,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<MachineDominatorTree>();
- AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addRequired<MachinePostDominatorTreeWrapperPass>();
AU.addRequired<MachineUniformityAnalysisPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -192,8 +192,8 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
"AMDGPU GlobalISel divergence lowering", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
"AMDGPU GlobalISel divergence lowering", false, false)
@@ -209,8 +209,10 @@ FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
MachineFunction &MF) {
- MachineDominatorTree &DT = getAnalysis<MachineDominatorTree>();
- MachinePostDominatorTree &PDT = getAnalysis<MachinePostDominatorTree>();
+ MachineDominatorTree &DT =
+ getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+ MachinePostDominatorTree &PDT =
+ getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
MachineUniformityInfo &MUI =
getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 7ab9ba285133..efe47b2c3eed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -464,16 +464,6 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
const Function &F = MF.getFunction();
- auto GetMCExprValue = [&MF](const MCExpr *Value) {
- int64_t Val;
- if (!Value->evaluateAsAbsolute(Val)) {
- MCContext &Ctx = MF.getContext();
- Ctx.reportError(SMLoc(), "could not resolve expression when required.");
- Val = 0;
- }
- return static_cast<uint64_t>(Val);
- };
-
auto Kern = HSAMetadataDoc->getMapNode();
Align MaxKernArgAlign;
@@ -481,11 +471,12 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
STM.getKernArgSegmentSize(F, MaxKernArgAlign));
Kern[".group_segment_fixed_size"] =
Kern.getDocument()->getNode(ProgramInfo.LDSSize);
- Kern[".private_segment_fixed_size"] =
- Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.ScratchSize));
+ DelayedExprs->assignDocNode(Kern[".private_segment_fixed_size"],
+ msgpack::Type::UInt, ProgramInfo.ScratchSize);
if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5) {
- Kern[".uses_dynamic_stack"] = Kern.getDocument()->getNode(
- static_cast<bool>(GetMCExprValue(ProgramInfo.DynamicCallStack)));
+ DelayedExprs->assignDocNode(Kern[".uses_dynamic_stack"],
+ msgpack::Type::Boolean,
+ ProgramInfo.DynamicCallStack);
}
if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5 && STM.supportsWGP())
@@ -497,15 +488,15 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
Kern.getDocument()->getNode(std::max(Align(4), MaxKernArgAlign).value());
Kern[".wavefront_size"] =
Kern.getDocument()->getNode(STM.getWavefrontSize());
- Kern[".sgpr_count"] =
- Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumSGPR));
- Kern[".vgpr_count"] =
- Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumVGPR));
+ DelayedExprs->assignDocNode(Kern[".sgpr_count"], msgpack::Type::UInt,
+ ProgramInfo.NumSGPR);
+ DelayedExprs->assignDocNode(Kern[".vgpr_count"], msgpack::Type::UInt,
+ ProgramInfo.NumVGPR);
// Only add AGPR count to metadata for supported devices
if (STM.hasMAIInsts()) {
- Kern[".agpr_count"] =
- Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumAccVGPR));
+ DelayedExprs->assignDocNode(Kern[".agpr_count"], msgpack::Type::UInt,
+ ProgramInfo.NumAccVGPR);
}
Kern[".max_flat_workgroup_size"] =
@@ -527,6 +518,7 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
}
bool MetadataStreamerMsgPackV4::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
+ DelayedExprs->resolveDelayedExpressions();
return TargetStreamer.EmitHSAMetadata(*HSAMetadataDoc, true);
}
@@ -536,9 +528,11 @@ void MetadataStreamerMsgPackV4::begin(const Module &Mod,
emitTargetID(TargetID);
emitPrintf(Mod);
getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode();
+ DelayedExprs->clear();
}
void MetadataStreamerMsgPackV4::end() {
+ DelayedExprs->resolveDelayedExpressions();
std::string HSAMetadataString;
raw_string_ostream StrOS(HSAMetadataString);
HSAMetadataDoc->toYAML(StrOS);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 0e3bc63919f0..fd76666dc360 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -15,6 +15,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H
+#include "Utils/AMDGPUDelayedMCExpr.h"
#include "llvm/BinaryFormat/MsgPackDocument.h"
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/Alignment.h"
@@ -65,6 +66,9 @@ protected:
class LLVM_EXTERNAL_VISIBILITY MetadataStreamerMsgPackV4
: public MetadataStreamer {
protected:
+ std::unique_ptr<DelayedMCExprs> DelayedExprs =
+ std::make_unique<DelayedMCExprs>();
+
std::unique_ptr<msgpack::Document> HSAMetadataDoc =
std::make_unique<msgpack::Document>();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 57769fe998d1..86f28a505769 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -1482,9 +1482,7 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
MFMAChains = 0;
for (auto &MFMAPipeSU : MFMAPipeSUs) {
- if (MFMAChainSeeds.size() &&
- std::find(MFMAChainSeeds.begin(), MFMAChainSeeds.end(), MFMAPipeSU) !=
- MFMAChainSeeds.end())
+ if (is_contained(MFMAChainSeeds, MFMAPipeSU))
continue;
if (!std::any_of(MFMAPipeSU->Preds.begin(), MFMAPipeSU->Preds.end(),
[&TII](SDep &Succ) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b50c0cc12626..6d5ffc66d98b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -502,9 +502,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
// isa<MemSDNode> almost works but is slightly too permissive for some DS
// intrinsics.
- if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
- Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
- Opc == AMDGPUISD::ATOMIC_LOAD_FMAX) {
+ if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
N = glueCopyToM0LDSInit(N);
SelectCode(N);
return;
@@ -2006,12 +2004,31 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
return true;
}
+// For unbuffered smem loads, it is illegal for the Immediate Offset to be
+// negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
+// Handle the case where the Immediate Offset + SOffset is negative.
+bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
+ bool Imm32Only,
+ bool IsBuffer,
+ int64_t ImmOffset) const {
+ if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
+ AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
+ KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
+ if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
+ return false;
+ }
+
+ return true;
+}
+
// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
// not null) offset. If Imm32Only is true, match only 32-bit immediate
// offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
SDValue *SOffset, SDValue *Offset,
- bool Imm32Only, bool IsBuffer) const {
+ bool Imm32Only, bool IsBuffer,
+ bool HasSOffset,
+ int64_t ImmOffset) const {
assert((!SOffset || !Offset) &&
"Cannot match both soffset and offset at the same time!");
@@ -2019,15 +2036,18 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
if (!C) {
if (!SOffset)
return false;
+
if (ByteOffsetNode.getValueType().isScalarInteger() &&
ByteOffsetNode.getValueType().getSizeInBits() == 32) {
*SOffset = ByteOffsetNode;
- return true;
+ return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
+ ImmOffset);
}
if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
*SOffset = ByteOffsetNode.getOperand(0);
- return true;
+ return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
+ ImmOffset);
}
}
return false;
@@ -2038,8 +2058,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
// GFX9 and GFX10 have signed byte immediate offsets. The immediate
// offset for S_BUFFER instructions is unsigned.
int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
- std::optional<int64_t> EncodedOffset =
- AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, IsBuffer);
+ std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
+ *Subtarget, ByteOffset, IsBuffer, HasSOffset);
if (EncodedOffset && Offset && !Imm32Only) {
*Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
return true;
@@ -2098,13 +2118,22 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
// true, match only 32-bit immediate offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
SDValue *SOffset, SDValue *Offset,
- bool Imm32Only,
- bool IsBuffer) const {
+ bool Imm32Only, bool IsBuffer,
+ bool HasSOffset,
+ int64_t ImmOffset) const {
if (SOffset && Offset) {
assert(!Imm32Only && !IsBuffer);
SDValue B;
- return SelectSMRDBaseOffset(Addr, B, nullptr, Offset) &&
- SelectSMRDBaseOffset(B, SBase, SOffset, nullptr);
+
+ if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
+ return false;
+
+ int64_t ImmOff = 0;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
+ ImmOff = C->getSExtValue();
+
+ return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
+ ImmOff);
}
// A 32-bit (address + offset) should not cause unsigned 32-bit integer
@@ -2123,11 +2152,14 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
}
if (!N0 || !N1)
return false;
- if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer)) {
+
+ if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
+ ImmOffset)) {
SBase = N0;
return true;
}
- if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer)) {
+ if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
+ ImmOffset)) {
SBase = N1;
return true;
}
@@ -2551,14 +2583,6 @@ void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
-void AMDGPUDAGToDAGISel::SelectPOPSExitingWaveID(SDNode *N) {
- // TODO: Select this with a tablegen pattern. This is tricky because the
- // intrinsic is IntrReadMem/IntrWriteMem but the instruction is not marked
- // mayLoad/mayStore and tablegen complains about the mismatch.
- SDValue Reg = CurDAG->getRegister(AMDGPU::SRC_POPS_EXITING_WAVE_ID, MVT::i32);
- CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, N->getVTList(), Reg);
-}
-
static unsigned gwsIntrinToOpcode(unsigned IntrID) {
switch (IntrID) {
case Intrinsic::amdgcn_ds_gws_init:
@@ -2715,9 +2739,6 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
SelectDSBvhStackIntrinsic(N);
return;
- case Intrinsic::amdgcn_pops_exiting_wave_id:
- SelectPOPSExitingWaveID(N);
- return;
}
SelectCode(N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 8e5662a3cd81..e7911bc1793d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -24,10 +24,6 @@ using namespace llvm;
namespace {
-static inline bool isNullConstantOrUndef(SDValue V) {
- return V.isUndef() || isNullConstant(V);
-}
-
static inline bool getConstantValue(SDValue N, uint32_t &Out) {
// This is only used for packed vectors, where using 0 for undef should
// always be good.
@@ -136,6 +132,8 @@ private:
bool isFlatScratchBaseLegal(SDValue Addr) const;
bool isFlatScratchBaseLegalSV(SDValue Addr) const;
bool isFlatScratchBaseLegalSVImm(SDValue Addr) const;
+ bool isSOffsetLegalWithImmOffset(SDValue *SOffset, bool Imm32Only,
+ bool IsBuffer, int64_t ImmOffset = 0) const;
bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
@@ -178,11 +176,13 @@ private:
bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset,
SDValue *Offset, bool Imm32Only = false,
- bool IsBuffer = false) const;
+ bool IsBuffer = false, bool HasSOffset = false,
+ int64_t ImmOffset = 0) const;
SDValue Expand32BitAddress(SDValue Addr) const;
bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset,
SDValue *Offset, bool Imm32Only = false,
- bool IsBuffer = false) const;
+ bool IsBuffer = false, bool HasSOffset = false,
+ int64_t ImmOffset = 0) const;
bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset,
SDValue *Offset, bool Imm32Only = false) const;
bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
@@ -194,6 +194,8 @@ private:
bool SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const;
bool SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
SDValue &Offset) const;
+ bool SelectSMRDPrefetchImm(SDValue Addr, SDValue &SBase,
+ SDValue &Offset) const;
bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods,
@@ -267,7 +269,6 @@ private:
void SelectFP_EXTEND(SDNode *N);
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
void SelectDSBvhStackIntrinsic(SDNode *N);
- void SelectPOPSExitingWaveID(SDNode *N);
void SelectDS_GWS(SDNode *N, unsigned IntrID);
void SelectInterpP1F16(SDNode *N);
void SelectINTRINSIC_W_CHAIN(SDNode *N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 375643b7f519..522b3a34161c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -42,8 +42,10 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
if (StoreSize <= 32)
return EVT::getIntegerVT(Ctx, StoreSize);
- assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
- return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
+ if (StoreSize % 32 == 0)
+ return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
+
+ return VT;
}
unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
@@ -5522,8 +5524,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
NODE_NAME_CASE(DS_ORDERED_COUNT)
NODE_NAME_CASE(ATOMIC_CMP_SWAP)
- NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
- NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
NODE_NAME_CASE(BUFFER_LOAD)
NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
NODE_NAME_CASE(BUFFER_LOAD_USHORT)
@@ -5562,7 +5562,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
- NODE_NAME_CASE(BUFFER_ATOMIC_FADD_BF16)
NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 71c4334029b4..37572af3897f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -575,8 +575,6 @@ enum NodeType : unsigned {
TBUFFER_LOAD_FORMAT_D16,
DS_ORDERED_COUNT,
ATOMIC_CMP_SWAP,
- ATOMIC_LOAD_FMIN,
- ATOMIC_LOAD_FMAX,
BUFFER_LOAD,
BUFFER_LOAD_UBYTE,
BUFFER_LOAD_USHORT,
@@ -615,7 +613,6 @@ enum NodeType : unsigned {
BUFFER_ATOMIC_CMPSWAP,
BUFFER_ATOMIC_CSUB,
BUFFER_ATOMIC_FADD,
- BUFFER_ATOMIC_FADD_BF16,
BUFFER_ATOMIC_FMIN,
BUFFER_ATOMIC_FMAX,
BUFFER_ATOMIC_COND_SUB_U32,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
index b78952ca3a62..43b3bf43fe56 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
@@ -15,6 +15,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUGenSearchableTables.inc"
#include "GCNSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
@@ -214,12 +215,14 @@ public:
RegisterUseCount[Unit]++;
// Do not attempt to optimise across exec mask changes.
- if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
+ if (MI.modifiesRegister(AMDGPU::EXEC, TRI) ||
+ AMDGPU::isInvalidSingleUseConsumerInst(MI.getOpcode())) {
for (auto &UsedReg : RegisterUseCount)
UsedReg.second = 2;
}
- if (!SIInstrInfo::isVALU(MI))
+ if (!SIInstrInfo::isVALU(MI) ||
+ AMDGPU::isInvalidSingleUseProducerInst(MI.getOpcode()))
continue;
if (AllProducerOperandsAreSingleUse) {
SingleUseProducerPositions.push_back({VALUInstrCount, &MI});
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 160a17584ca3..93bca4402ed2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1158,12 +1158,10 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
break;
}
- case Intrinsic::amdgcn_buffer_store_format:
case Intrinsic::amdgcn_raw_buffer_store_format:
case Intrinsic::amdgcn_struct_buffer_store_format:
case Intrinsic::amdgcn_raw_tbuffer_store:
case Intrinsic::amdgcn_struct_tbuffer_store:
- case Intrinsic::amdgcn_tbuffer_store:
case Intrinsic::amdgcn_image_store_1d:
case Intrinsic::amdgcn_image_store_1darray:
case Intrinsic::amdgcn_image_store_2d:
@@ -1376,8 +1374,6 @@ std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
std::function<void(Instruction *, unsigned, APInt, APInt &)>
SimplifyAndSetOp) const {
switch (II.getIntrinsicID()) {
- case Intrinsic::amdgcn_buffer_load:
- case Intrinsic::amdgcn_buffer_load_format:
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format:
@@ -1391,7 +1387,6 @@ std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
case Intrinsic::amdgcn_struct_tbuffer_load:
case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
- case Intrinsic::amdgcn_tbuffer_load:
return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
default: {
if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index ae3f2b87f353..a3cb3b3f47e0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2079,21 +2079,6 @@ bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
-bool AMDGPUInstructionSelector::selectPOPSExitingWaveID(
- MachineInstr &MI) const {
- Register Dst = MI.getOperand(0).getReg();
- const DebugLoc &DL = MI.getDebugLoc();
- MachineBasicBlock *MBB = MI.getParent();
-
- // TODO: Select this with a tablegen pattern. This is tricky because the
- // intrinsic is IntrReadMem/IntrWriteMem but the instruction is not marked
- // mayLoad/mayStore and tablegen complains about the mismatch.
- auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
- .addReg(AMDGPU::SRC_POPS_EXITING_WAVE_ID);
- MI.eraseFromParent();
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
-}
-
bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
MachineInstr &I) const {
Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
@@ -2144,8 +2129,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
return selectSBarrierSignalIsfirst(I, IntrinsicID);
case Intrinsic::amdgcn_s_barrier_leave:
return selectSBarrierLeave(I);
- case Intrinsic::amdgcn_pops_exiting_wave_id:
- return selectPOPSExitingWaveID(I);
}
return selectImpl(I, *CoverageInfo);
}
@@ -3620,8 +3603,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
case TargetOpcode::G_ATOMICRMW_FADD:
- case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
- case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
+ case TargetOpcode::G_ATOMICRMW_FMIN:
+ case TargetOpcode::G_ATOMICRMW_FMAX:
return selectG_LOAD_STORE_ATOMICRMW(I);
case TargetOpcode::G_SELECT:
return selectG_SELECT(I);
@@ -4216,10 +4199,11 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
return false;
const GEPInfo &GEPI = AddrInfo[0];
- std::optional<int64_t> EncodedImm =
- AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, false);
+ std::optional<int64_t> EncodedImm;
if (SOffset && Offset) {
+ EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
+ /*HasSOffset=*/true);
if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
AddrInfo.size() > 1) {
const GEPInfo &GEPI2 = AddrInfo[1];
@@ -4229,6 +4213,17 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
Base = GEPI2.SgprParts[0];
*SOffset = OffsetReg;
*Offset = *EncodedImm;
+ if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
+ return true;
+
+ // For unbuffered smem loads, it is illegal for the Immediate Offset
+ // to be negative if the resulting (Offset + (M0 or SOffset or zero)
+ // is negative. Handle the case where the Immediate Offset + SOffset
+ // is negative.
+ auto SKnown = KB->getKnownBits(*SOffset);
+ if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
+ return false;
+
return true;
}
}
@@ -4236,6 +4231,8 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
return false;
}
+ EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
+ /*HasSOffset=*/false);
if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
Base = GEPI.SgprParts[0];
*Offset = *EncodedImm;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 48f3b1811801..f561d5d29efc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -125,7 +125,6 @@ private:
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
bool selectSBarrier(MachineInstr &MI) const;
bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;
- bool selectPOPSExitingWaveID(MachineInstr &MI) const;
bool selectImageIntrinsic(MachineInstr &MI,
const AMDGPU::ImageDimIntrinsicInfo *Intr) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index fa7492ac6cbe..c6dbc58395e4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -140,7 +140,9 @@ class ImmOperand<ValueType type, string name = NAME, bit optional = 0,
let PrintMethod = printer;
}
-def s16imm : ImmOperand<i16, "S16Imm", 0, "printU16ImmOperand">;
+class S16ImmOperand : ImmOperand<i16, "S16Imm", 0, "printU16ImmOperand">;
+
+def s16imm : S16ImmOperand;
def u16imm : ImmOperand<i16, "U16Imm", 0, "printU16ImmOperand">;
class ValuePredicatedOperand<CustomOperand op, string valuePredicate,
@@ -616,6 +618,7 @@ multiclass local_addr_space_atomic_op {
}
}
+defm int_amdgcn_flat_atomic_fadd : noret_op;
defm int_amdgcn_flat_atomic_fadd : flat_addr_space_atomic_op;
defm int_amdgcn_flat_atomic_fadd_v2bf16 : noret_op;
defm int_amdgcn_flat_atomic_fmin : noret_op;
@@ -627,7 +630,6 @@ defm int_amdgcn_global_atomic_fmin : noret_op;
defm int_amdgcn_global_atomic_fmax : noret_op;
defm int_amdgcn_global_atomic_csub : noret_op;
defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op;
-defm int_amdgcn_ds_fadd_v2bf16 : noret_op;
defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op;
defm int_amdgcn_flat_atomic_fmin_num : noret_op;
defm int_amdgcn_flat_atomic_fmax_num : noret_op;
@@ -637,9 +639,14 @@ defm int_amdgcn_atomic_cond_sub_u32 : local_addr_space_atomic_op;
defm int_amdgcn_atomic_cond_sub_u32 : flat_addr_space_atomic_op;
defm int_amdgcn_atomic_cond_sub_u32 : global_addr_space_atomic_op;
-multiclass noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
+multiclass noret_binary_atomic_op<SDNode atomic_op> {
let HasNoUse = true in
- defm "_noret" : binary_atomic_op<atomic_op, IsInt>;
+ defm "_noret" : binary_atomic_op<atomic_op>;
+}
+
+multiclass noret_binary_atomic_op_fp<SDNode atomic_op> {
+ let HasNoUse = true in
+ defm "_noret" : binary_atomic_op_fp<atomic_op>;
}
multiclass noret_ternary_atomic_op<SDNode atomic_op> {
@@ -647,11 +654,21 @@ multiclass noret_ternary_atomic_op<SDNode atomic_op> {
defm "_noret" : ternary_atomic_op<atomic_op>;
}
-multiclass binary_atomic_op_all_as<SDNode atomic_op, bit IsInt = 1> {
- foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
+defvar atomic_addrspace_names = [ "global", "flat", "constant", "local", "private", "region" ];
+
+multiclass binary_atomic_op_all_as<SDNode atomic_op> {
+ foreach as = atomic_addrspace_names in {
+ let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
+ defm "_"#as : binary_atomic_op<atomic_op>;
+ defm "_"#as : noret_binary_atomic_op<atomic_op>;
+ }
+ }
+}
+multiclass binary_atomic_op_fp_all_as<SDNode atomic_op> {
+ foreach as = atomic_addrspace_names in {
let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
- defm "_"#as : binary_atomic_op<atomic_op, IsInt>;
- defm "_"#as : noret_binary_atomic_op<atomic_op, IsInt>;
+ defm "_"#as : binary_atomic_op_fp<atomic_op>;
+ defm "_"#as : noret_binary_atomic_op_fp<atomic_op>;
}
}
}
@@ -666,11 +683,11 @@ defm atomic_load_sub : binary_atomic_op_all_as<atomic_load_sub>;
defm atomic_load_umax : binary_atomic_op_all_as<atomic_load_umax>;
defm atomic_load_umin : binary_atomic_op_all_as<atomic_load_umin>;
defm atomic_load_xor : binary_atomic_op_all_as<atomic_load_xor>;
-defm atomic_load_fadd : binary_atomic_op_all_as<atomic_load_fadd, 0>;
+defm atomic_load_fadd : binary_atomic_op_fp_all_as<atomic_load_fadd>;
+defm atomic_load_fmin : binary_atomic_op_fp_all_as<atomic_load_fmin>;
+defm atomic_load_fmax : binary_atomic_op_fp_all_as<atomic_load_fmax>;
defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>;
defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>;
-let MemoryVT = v2f16 in
-defm atomic_load_fadd_v2f16 : binary_atomic_op_all_as<atomic_load_fadd, 0>;
defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>;
def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index ee7fb20c23aa..f1254b2e9e1d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -283,7 +283,9 @@ static const LLT S1 = LLT::scalar(1);
static const LLT S8 = LLT::scalar(8);
static const LLT S16 = LLT::scalar(16);
static const LLT S32 = LLT::scalar(32);
+static const LLT F32 = LLT::float32();
static const LLT S64 = LLT::scalar(64);
+static const LLT F64 = LLT::float64();
static const LLT S96 = LLT::scalar(96);
static const LLT S128 = LLT::scalar(128);
static const LLT S160 = LLT::scalar(160);
@@ -301,6 +303,9 @@ static const LLT V10S16 = LLT::fixed_vector(10, 16);
static const LLT V12S16 = LLT::fixed_vector(12, 16);
static const LLT V16S16 = LLT::fixed_vector(16, 16);
+static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
+static const LLT V2BF16 = V2F16; // FIXME
+
static const LLT V2S32 = LLT::fixed_vector(2, 32);
static const LLT V3S32 = LLT::fixed_vector(3, 32);
static const LLT V4S32 = LLT::fixed_vector(4, 32);
@@ -1638,13 +1643,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasLdsAtomicAddF64())
Atomic.legalFor({{S64, LocalPtr}});
if (ST.hasAtomicDsPkAdd16Insts())
- Atomic.legalFor({{V2S16, LocalPtr}});
+ Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
}
if (ST.hasAtomicFaddInsts())
Atomic.legalFor({{S32, GlobalPtr}});
if (ST.hasFlatAtomicFaddF32Inst())
Atomic.legalFor({{S32, FlatPtr}});
+ getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
+ .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
+
if (ST.hasGFX90AInsts()) {
// These are legal with some caveats, and should have undergone expansion in
// the IR in most situations
@@ -1656,6 +1664,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
});
}
+ if (ST.hasAtomicBufferGlobalPkAddF16Insts())
+ Atomic.legalFor({{V2F16, GlobalPtr}});
+ if (ST.hasAtomicGlobalPkAddBF16Inst())
+ Atomic.legalFor({{V2BF16, GlobalPtr}});
+ if (ST.hasAtomicFlatPkAdd16Insts())
+ Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
+
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
// demarshalling
getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
@@ -5388,12 +5403,10 @@ bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
switch (IID) {
- case Intrinsic::amdgcn_ds_fadd:
- return AMDGPU::G_ATOMICRMW_FADD;
case Intrinsic::amdgcn_ds_fmin:
- return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
+ return AMDGPU::G_ATOMICRMW_FMIN;
case Intrinsic::amdgcn_ds_fmax:
- return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
+ return AMDGPU::G_ATOMICRMW_FMAX;
default:
llvm_unreachable("not a DS FP intrinsic");
}
@@ -5417,6 +5430,126 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
return true;
}
+// TODO: Fix pointer type handling
+bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
+ MachineInstr &MI,
+ Intrinsic::ID IID) const {
+
+ MachineIRBuilder &B = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *B.getMRI();
+
+ bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
+ IID == Intrinsic::amdgcn_permlanex16;
+
+ auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
+ Register Src2, LLT VT) -> Register {
+ auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
+ switch (IID) {
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_permlane64:
+ return LaneOp.getReg(0);
+ case Intrinsic::amdgcn_readlane:
+ return LaneOp.addUse(Src1).getReg(0);
+ case Intrinsic::amdgcn_writelane:
+ return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16: {
+ Register Src3 = MI.getOperand(5).getReg();
+ Register Src4 = MI.getOperand(6).getImm();
+ Register Src5 = MI.getOperand(7).getImm();
+ return LaneOp.addUse(Src1)
+ .addUse(Src2)
+ .addUse(Src3)
+ .addImm(Src4)
+ .addImm(Src5)
+ .getReg(0);
+ }
+ default:
+ llvm_unreachable("unhandled lane op");
+ }
+ };
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register Src0 = MI.getOperand(2).getReg();
+ Register Src1, Src2;
+ if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
+ IsPermLane16) {
+ Src1 = MI.getOperand(3).getReg();
+ if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
+ Src2 = MI.getOperand(4).getReg();
+ }
+ }
+
+ LLT Ty = MRI.getType(DstReg);
+ unsigned Size = Ty.getSizeInBits();
+
+ if (Size == 32) {
+ // Already legal
+ return true;
+ }
+
+ if (Size < 32) {
+ Src0 = B.buildAnyExt(S32, Src0).getReg(0);
+
+ if (IsPermLane16)
+ Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
+
+ if (IID == Intrinsic::amdgcn_writelane)
+ Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
+
+ Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
+ B.buildTrunc(DstReg, LaneOpDst);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ if (Size % 32 != 0)
+ return false;
+
+ LLT PartialResTy = S32;
+ if (Ty.isVector()) {
+ LLT EltTy = Ty.getElementType();
+ switch (EltTy.getSizeInBits()) {
+ case 16:
+ PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
+ break;
+ case 32:
+ PartialResTy = EltTy;
+ break;
+ default:
+ // Handle all other cases via S32 pieces;
+ break;
+ }
+ }
+
+ SmallVector<Register, 2> PartialRes;
+ unsigned NumParts = Size / 32;
+ MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
+ MachineInstrBuilder Src1Parts, Src2Parts;
+
+ if (IsPermLane16)
+ Src1Parts = B.buildUnmerge(PartialResTy, Src1);
+
+ if (IID == Intrinsic::amdgcn_writelane)
+ Src2Parts = B.buildUnmerge(PartialResTy, Src2);
+
+ for (unsigned i = 0; i < NumParts; ++i) {
+ Src0 = Src0Parts.getReg(i);
+
+ if (IsPermLane16)
+ Src1 = Src1Parts.getReg(i);
+
+ if (IID == Intrinsic::amdgcn_writelane)
+ Src2 = Src2Parts.getReg(i);
+
+ PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
+ }
+
+ B.buildMergeLikeInstr(DstReg, PartialRes);
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -6008,9 +6141,6 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
- case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
- case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
- return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16;
case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
@@ -6630,9 +6760,9 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
MI.removeOperand(1); // Remove intrinsic ID
// FIXME: When intrinsic definition is fixed, this should have an MMO already.
- // TODO: Should this use datalayout alignment?
const unsigned MemSize = (Size + 7) / 8;
- const Align MemAlign(std::min(MemSize, 4u));
+ const Align MemAlign = B.getDataLayout().getABITypeAlign(
+ getTypeForLLT(Ty, MF.getFunction().getContext()));
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo(),
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
@@ -7318,14 +7448,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
- case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
- case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16:
- case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
- case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16:
return legalizeBufferAtomic(MI, B, IntrID);
case Intrinsic::amdgcn_rsq_clamp:
return legalizeRsqClampIntrinsic(MI, MRI, B);
- case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax:
return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
@@ -7365,6 +7490,13 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
Observer.changedInstr(MI);
return true;
}
+ case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_writelane:
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16:
+ case Intrinsic::amdgcn_permlane64:
+ return legalizeLaneOp(Helper, MI, IntrID);
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 4b1d821dadc2..ae01bb29c110 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -210,6 +210,9 @@ public:
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
Intrinsic::ID IID) const;
+ bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI,
+ Intrinsic::ID IID) const;
+
bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index c515138d95a2..456f3cb332cf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1129,15 +1129,11 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
nval = CreateCallEx(B,ExpExpr, nval, "__exp2");
if (needcopysign) {
- Value *opr_n;
- Type* rTy = opr0->getType();
Type* nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits());
- Type *nTy = nTyS;
- if (const auto *vTy = dyn_cast<FixedVectorType>(rTy))
- nTy = FixedVectorType::get(nTyS, vTy);
+ Type *nTy = FPOp->getType()->getWithNewType(nTyS);
unsigned size = nTy->getScalarSizeInBits();
- opr_n = FPOp->getOperand(1);
- if (opr_n->getType()->isIntegerTy())
+ Value *opr_n = FPOp->getOperand(1);
+ if (opr_n->getType()->getScalarType()->isIntegerTy())
opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou");
else
opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index f878bd9465d3..a8f6ad09fe28 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -200,6 +200,7 @@
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/Utils/Local.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/AttributeMask.h"
#include "llvm/IR/Constants.h"
@@ -214,6 +215,7 @@
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ReplaceConstant.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
@@ -578,18 +580,14 @@ bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) {
/// buffer fat pointer constant.
static std::pair<Constant *, Constant *>
splitLoweredFatBufferConst(Constant *C) {
- if (auto *AZ = dyn_cast<ConstantAggregateZero>(C))
- return std::make_pair(AZ->getStructElement(0), AZ->getStructElement(1));
- if (auto *SC = dyn_cast<ConstantStruct>(C))
- return std::make_pair(SC->getOperand(0), SC->getOperand(1));
- llvm_unreachable("Conversion should've created a {p8, i32} struct");
+ assert(isSplitFatPtr(C->getType()) && "Not a split fat buffer pointer");
+ return std::make_pair(C->getAggregateElement(0u), C->getAggregateElement(1u));
}
namespace {
/// Handle the remapping of ptr addrspace(7) constants.
class FatPtrConstMaterializer final : public ValueMaterializer {
BufferFatPtrToStructTypeMap *TypeMap;
- BufferFatPtrToIntTypeMap *IntTypeMap;
// An internal mapper that is used to recurse into the arguments of constants.
// While the documentation for `ValueMapper` specifies not to use it
// recursively, examination of the logic in mapValue() shows that it can
@@ -599,16 +597,12 @@ class FatPtrConstMaterializer final : public ValueMaterializer {
Constant *materializeBufferFatPtrConst(Constant *C);
- const DataLayout &DL;
-
public:
// UnderlyingMap is the value map this materializer will be filling.
FatPtrConstMaterializer(BufferFatPtrToStructTypeMap *TypeMap,
- ValueToValueMapTy &UnderlyingMap,
- BufferFatPtrToIntTypeMap *IntTypeMap,
- const DataLayout &DL)
- : TypeMap(TypeMap), IntTypeMap(IntTypeMap),
- InternalMapper(UnderlyingMap, RF_None, TypeMap, this), DL(DL) {}
+ ValueToValueMapTy &UnderlyingMap)
+ : TypeMap(TypeMap),
+ InternalMapper(UnderlyingMap, RF_None, TypeMap, this) {}
virtual ~FatPtrConstMaterializer() = default;
Value *materialize(Value *V) override;
@@ -631,10 +625,6 @@ Constant *FatPtrConstMaterializer::materializeBufferFatPtrConst(Constant *C) {
UndefValue::get(NewTy->getElementType(1))});
}
- if (isa<GlobalValue>(C))
- report_fatal_error("Global values containing ptr addrspace(7) (buffer "
- "fat pointer) values are not supported");
-
if (auto *VC = dyn_cast<ConstantVector>(C)) {
if (Constant *S = VC->getSplatValue()) {
Constant *NewS = InternalMapper.mapConstant(*S);
@@ -660,127 +650,14 @@ Constant *FatPtrConstMaterializer::materializeBufferFatPtrConst(Constant *C) {
return ConstantStruct::get(NewTy, {RsrcVec, OffVec});
}
- // Constant expressions. This code mirrors how we fix up the equivalent
- // instructions later.
- auto *CE = dyn_cast<ConstantExpr>(C);
- if (!CE)
- return nullptr;
- if (auto *GEPO = dyn_cast<GEPOperator>(C)) {
- Constant *RemappedPtr =
- InternalMapper.mapConstant(*cast<Constant>(GEPO->getPointerOperand()));
- auto [Rsrc, Off] = splitLoweredFatBufferConst(RemappedPtr);
- Type *OffTy = Off->getType();
- bool InBounds = GEPO->isInBounds();
-
- MapVector<Value *, APInt> VariableOffs;
- APInt NewConstOffVal = APInt::getZero(BufferOffsetWidth);
- if (!GEPO->collectOffset(DL, BufferOffsetWidth, VariableOffs,
- NewConstOffVal))
- report_fatal_error(
- "Scalable vector or unsized struct in fat pointer GEP");
- Constant *OffAccum = nullptr;
- // Accumulate offsets together before adding to the base in order to
- // preserve as many of the inbounds properties as possible.
- for (auto [Arg, Multiple] : VariableOffs) {
- Constant *NewArg = InternalMapper.mapConstant(*cast<Constant>(Arg));
- NewArg = ConstantFoldIntegerCast(NewArg, OffTy, /*IsSigned=*/true, DL);
- if (!Multiple.isOne()) {
- if (Multiple.isPowerOf2()) {
- NewArg = ConstantExpr::getShl(
- NewArg,
- CE->getIntegerValue(
- OffTy, APInt(BufferOffsetWidth, Multiple.logBase2())),
- /*hasNUW=*/InBounds, /*HasNSW=*/InBounds);
- } else {
- NewArg =
- ConstantExpr::getMul(NewArg, CE->getIntegerValue(OffTy, Multiple),
- /*hasNUW=*/InBounds, /*hasNSW=*/InBounds);
- }
- }
- if (OffAccum) {
- OffAccum = ConstantExpr::getAdd(OffAccum, NewArg, /*hasNUW=*/InBounds,
- /*hasNSW=*/InBounds);
- } else {
- OffAccum = NewArg;
- }
- }
- Constant *NewConstOff = CE->getIntegerValue(OffTy, NewConstOffVal);
- if (OffAccum)
- OffAccum = ConstantExpr::getAdd(OffAccum, NewConstOff,
- /*hasNUW=*/InBounds, /*hasNSW=*/InBounds);
- else
- OffAccum = NewConstOff;
- bool HasNonNegativeOff = false;
- if (auto *CI = dyn_cast<ConstantInt>(OffAccum)) {
- HasNonNegativeOff = !CI->isNegative();
- }
- Constant *NewOff = ConstantExpr::getAdd(
- Off, OffAccum, /*hasNUW=*/InBounds && HasNonNegativeOff,
- /*hasNSW=*/false);
- return ConstantStruct::get(NewTy, {Rsrc, NewOff});
- }
-
- if (auto *PI = dyn_cast<PtrToIntOperator>(CE)) {
- Constant *Parts =
- InternalMapper.mapConstant(*cast<Constant>(PI->getPointerOperand()));
- auto [Rsrc, Off] = splitLoweredFatBufferConst(Parts);
- // Here, we take advantage of the fact that ptrtoint has a built-in
- // zero-extension behavior.
- unsigned FatPtrWidth =
- DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER);
- Constant *RsrcInt = CE->getPtrToInt(Rsrc, SrcTy);
- unsigned Width = SrcTy->getScalarSizeInBits();
- Constant *Shift =
- CE->getIntegerValue(SrcTy, APInt(Width, BufferOffsetWidth));
- Constant *OffCast =
- ConstantFoldIntegerCast(Off, SrcTy, /*IsSigned=*/false, DL);
- Constant *RsrcHi = ConstantExpr::getShl(
- RsrcInt, Shift, Width >= FatPtrWidth, Width > FatPtrWidth);
- // This should be an or, but those got recently removed.
- Constant *Result = ConstantExpr::getAdd(RsrcHi, OffCast, true, true);
- return Result;
- }
+ if (isa<GlobalValue>(C))
+ report_fatal_error("Global values containing ptr addrspace(7) (buffer "
+ "fat pointer) values are not supported");
- if (CE->getOpcode() == Instruction::IntToPtr) {
- auto *Arg = cast<Constant>(CE->getOperand(0));
- unsigned FatPtrWidth =
- DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER);
- unsigned RsrcPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_RESOURCE);
- auto *WantedTy = Arg->getType()->getWithNewBitWidth(FatPtrWidth);
- Arg = ConstantFoldIntegerCast(Arg, WantedTy, /*IsSigned=*/false, DL);
-
- Constant *Shift =
- CE->getIntegerValue(WantedTy, APInt(FatPtrWidth, BufferOffsetWidth));
- Type *RsrcIntType = WantedTy->getWithNewBitWidth(RsrcPtrWidth);
- Type *RsrcTy = NewTy->getElementType(0);
- Type *OffTy = WantedTy->getWithNewBitWidth(BufferOffsetWidth);
- Constant *RsrcInt = CE->getTrunc(
- ConstantFoldBinaryOpOperands(Instruction::LShr, Arg, Shift, DL),
- RsrcIntType);
- Constant *Rsrc = CE->getIntToPtr(RsrcInt, RsrcTy);
- Constant *Off = ConstantFoldIntegerCast(Arg, OffTy, /*isSigned=*/false, DL);
-
- return ConstantStruct::get(NewTy, {Rsrc, Off});
- }
+ if (isa<ConstantExpr>(C))
+ report_fatal_error("Constant exprs containing ptr addrspace(7) (buffer "
+ "fat pointer) values should have been expanded earlier");
- if (auto *AC = dyn_cast<AddrSpaceCastOperator>(CE)) {
- unsigned SrcAS = AC->getSrcAddressSpace();
- unsigned DstAS = AC->getDestAddressSpace();
- auto *Arg = cast<Constant>(AC->getPointerOperand());
- auto *NewArg = InternalMapper.mapConstant(*Arg);
- if (!NewArg)
- return nullptr;
- if (SrcAS == AMDGPUAS::BUFFER_FAT_POINTER &&
- DstAS == AMDGPUAS::BUFFER_FAT_POINTER)
- return NewArg;
- if (SrcAS == AMDGPUAS::BUFFER_RESOURCE &&
- DstAS == AMDGPUAS::BUFFER_FAT_POINTER) {
- auto *NullOff = CE->getNullValue(NewTy->getElementType(1));
- return ConstantStruct::get(NewTy, {NewArg, NullOff});
- }
- report_fatal_error(
- "Unsupported address space cast for a buffer fat pointer");
- }
return nullptr;
}
@@ -788,26 +665,6 @@ Value *FatPtrConstMaterializer::materialize(Value *V) {
Constant *C = dyn_cast<Constant>(V);
if (!C)
return nullptr;
- if (auto *GEPO = dyn_cast<GEPOperator>(C)) {
- // As a special case, adjust GEP constants that have a ptr addrspace(7) in
- // their source types here, since the earlier local changes didn't handle
- // htis.
- Type *SrcTy = GEPO->getSourceElementType();
- Type *NewSrcTy = IntTypeMap->remapType(SrcTy);
- if (SrcTy != NewSrcTy) {
- SmallVector<Constant *> Ops;
- Ops.reserve(GEPO->getNumOperands());
- for (const Use &U : GEPO->operands())
- Ops.push_back(cast<Constant>(U.get()));
- auto *NewGEP = ConstantExpr::getGetElementPtr(
- NewSrcTy, Ops[0], ArrayRef<Constant *>(Ops).slice(1),
- GEPO->getNoWrapFlags(), GEPO->getInRange());
- LLVM_DEBUG(dbgs() << "p7-getting GEP: " << *GEPO << " becomes " << *NewGEP
- << "\n");
- Value *FurtherMap = materialize(NewGEP);
- return FurtherMap ? FurtherMap : NewGEP;
- }
- }
// Structs and other types that happen to contain fat pointers get remapped
// by the mapValue() logic.
if (!isBufferFatPtrConst(C))
@@ -1387,57 +1244,25 @@ PtrParts SplitPtrStructs::visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI) {
}
PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) {
+ using namespace llvm::PatternMatch;
Value *Ptr = GEP.getPointerOperand();
if (!isSplitFatPtr(Ptr->getType()))
return {nullptr, nullptr};
IRB.SetInsertPoint(&GEP);
auto [Rsrc, Off] = getPtrParts(Ptr);
- Type *OffTy = Off->getType();
const DataLayout &DL = GEP.getModule()->getDataLayout();
bool InBounds = GEP.isInBounds();
- // In order to call collectOffset() and thus not have to reimplement it,
- // we need the GEP's pointer operand to have ptr addrspace(7) type
- GEP.setOperand(GEP.getPointerOperandIndex(),
- PoisonValue::get(IRB.getPtrTy(AMDGPUAS::BUFFER_FAT_POINTER)));
- MapVector<Value *, APInt> VariableOffs;
- APInt ConstOffVal = APInt::getZero(BufferOffsetWidth);
- if (!GEP.collectOffset(DL, BufferOffsetWidth, VariableOffs, ConstOffVal))
- report_fatal_error("Scalable vector or unsized struct in fat pointer GEP");
- GEP.setOperand(GEP.getPointerOperandIndex(), Ptr);
- Value *OffAccum = nullptr;
- // Accumulate offsets together before adding to the base in order to preserve
- // as many of the inbounds properties as possible.
- for (auto [Arg, Multiple] : VariableOffs) {
- if (auto *OffVecTy = dyn_cast<VectorType>(OffTy))
- if (!Arg->getType()->isVectorTy())
- Arg = IRB.CreateVectorSplat(OffVecTy->getElementCount(), Arg);
- Arg = IRB.CreateIntCast(Arg, OffTy, /*isSigned=*/true);
- if (!Multiple.isOne()) {
- if (Multiple.isPowerOf2())
- Arg = IRB.CreateShl(Arg, Multiple.logBase2(), "", /*hasNUW=*/InBounds,
- /*HasNSW=*/InBounds);
- else
- Arg = IRB.CreateMul(Arg, ConstantExpr::getIntegerValue(OffTy, Multiple),
- "", /*hasNUW=*/InBounds, /*hasNSW=*/InBounds);
- }
- if (OffAccum)
- OffAccum = IRB.CreateAdd(OffAccum, Arg, "", /*hasNUW=*/InBounds,
- /*hasNSW=*/InBounds);
- else
- OffAccum = Arg;
- }
- if (!ConstOffVal.isZero()) {
- Constant *ConstOff = ConstantExpr::getIntegerValue(OffTy, ConstOffVal);
- if (OffAccum)
- OffAccum = IRB.CreateAdd(OffAccum, ConstOff, "", /*hasNUW=*/InBounds,
- /*hasNSW=*/InBounds);
- else
- OffAccum = ConstOff;
- }
-
- if (!OffAccum) { // Constant-zero offset
+ // In order to call emitGEPOffset() and thus not have to reimplement it,
+ // we need the GEP result to have ptr addrspace(7) type.
+ Type *FatPtrTy = IRB.getPtrTy(AMDGPUAS::BUFFER_FAT_POINTER);
+ if (auto *VT = dyn_cast<VectorType>(Off->getType()))
+ FatPtrTy = VectorType::get(FatPtrTy, VT->getElementCount());
+ GEP.mutateType(FatPtrTy);
+ Value *OffAccum = emitGEPOffset(&IRB, DL, &GEP);
+ GEP.mutateType(Ptr->getType());
+ if (match(OffAccum, m_Zero())) { // Constant-zero offset
SplitUsers.insert(&GEP);
return {Rsrc, Off};
}
@@ -1447,7 +1272,7 @@ PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) {
HasNonNegativeOff = !CI->isNegative();
}
Value *NewOff;
- if (PatternMatch::match(Off, PatternMatch::is_zero())) {
+ if (match(Off, m_Zero())) {
NewOff = OffAccum;
} else {
NewOff = IRB.CreateAdd(Off, OffAccum, "",
@@ -1473,20 +1298,22 @@ PtrParts SplitPtrStructs::visitPtrToIntInst(PtrToIntInst &PI) {
const DataLayout &DL = PI.getModule()->getDataLayout();
unsigned FatPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER);
- Value *RsrcInt;
- if (Width <= BufferOffsetWidth)
- RsrcInt = ConstantExpr::getIntegerValue(ResTy, APInt::getZero(Width));
- else
- RsrcInt = IRB.CreatePtrToInt(Rsrc, ResTy, PI.getName() + ".rsrc");
- copyMetadata(RsrcInt, &PI);
-
- Value *Shl = IRB.CreateShl(
- RsrcInt,
- ConstantExpr::getIntegerValue(ResTy, APInt(Width, BufferOffsetWidth)), "",
- Width >= FatPtrWidth, Width > FatPtrWidth);
- Value *OffCast =
- IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false, PI.getName() + ".off");
- Value *Res = IRB.CreateOr(Shl, OffCast);
+ Value *Res;
+ if (Width <= BufferOffsetWidth) {
+ Res = IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false,
+ PI.getName() + ".off");
+ } else {
+ Value *RsrcInt = IRB.CreatePtrToInt(Rsrc, ResTy, PI.getName() + ".rsrc");
+ Value *Shl = IRB.CreateShl(
+ RsrcInt,
+ ConstantExpr::getIntegerValue(ResTy, APInt(Width, BufferOffsetWidth)),
+ "", Width >= FatPtrWidth, Width > FatPtrWidth);
+ Value *OffCast = IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false,
+ PI.getName() + ".off");
+ Res = IRB.CreateOr(Shl, OffCast);
+ }
+
+ copyMetadata(Res, &PI);
Res->takeName(&PI);
SplitUsers.insert(&PI);
PI.replaceAllUsesWith(Res);
@@ -1818,14 +1645,9 @@ public:
static bool containsBufferFatPointers(const Function &F,
BufferFatPtrToStructTypeMap *TypeMap) {
bool HasFatPointers = false;
- for (const BasicBlock &BB : F) {
- for (const Instruction &I : BB) {
+ for (const BasicBlock &BB : F)
+ for (const Instruction &I : BB)
HasFatPointers |= (I.getType() != TypeMap->remapType(I.getType()));
- for (const Use &U : I.operands())
- if (auto *C = dyn_cast<Constant>(U.get()))
- HasFatPointers |= isBufferFatPtrConst(C);
- }
- }
return HasFatPointers;
}
@@ -1924,6 +1746,36 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
"buffer resource pointers (address space 8) instead.");
}
+ {
+ // Collect all constant exprs and aggregates referenced by any function.
+ SmallVector<Constant *, 8> Worklist;
+ for (Function &F : M.functions())
+ for (Instruction &I : instructions(F))
+ for (Value *Op : I.operands())
+ if (isa<ConstantExpr>(Op) || isa<ConstantAggregate>(Op))
+ Worklist.push_back(cast<Constant>(Op));
+
+ // Recursively look for any referenced buffer pointer constants.
+ SmallPtrSet<Constant *, 8> Visited;
+ SetVector<Constant *> BufferFatPtrConsts;
+ while (!Worklist.empty()) {
+ Constant *C = Worklist.pop_back_val();
+ if (!Visited.insert(C).second)
+ continue;
+ if (isBufferFatPtrOrVector(C->getType()))
+ BufferFatPtrConsts.insert(C);
+ for (Value *Op : C->operands())
+ if (isa<ConstantExpr>(Op) || isa<ConstantAggregate>(Op))
+ Worklist.push_back(cast<Constant>(Op));
+ }
+
+ // Expand all constant expressions using fat buffer pointers to
+ // instructions.
+ Changed |= convertUsersOfConstantsToInstructions(
+ BufferFatPtrConsts.getArrayRef(), /*RestrictToFunc=*/nullptr,
+ /*RemoveDeadConstants=*/false, /*IncludeSelf=*/true);
+ }
+
StoreFatPtrsAsIntsVisitor MemOpsRewrite(&IntTM, M.getContext());
for (Function &F : M.functions()) {
bool InterfaceChange = hasFatPointerInterface(F, &StructTM);
@@ -1939,7 +1791,7 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
SmallVector<Function *> Intrinsics;
// Keep one big map so as to memoize constants across functions.
ValueToValueMapTy CloneMap;
- FatPtrConstMaterializer Materializer(&StructTM, CloneMap, &IntTM, DL);
+ FatPtrConstMaterializer Materializer(&StructTM, CloneMap);
ValueMapper LowerInFuncs(CloneMap, RF_None, &StructTM, &Materializer);
for (auto [F, InterfaceChange] : NeedsRemap) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
index 6ec4178053b2..11f0cba47afd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
@@ -17,6 +17,157 @@
using namespace llvm;
+void AMDGPUMIRFormatter::printImm(raw_ostream &OS, const MachineInstr &MI,
+ std::optional<unsigned int> OpIdx, int64_t Imm) const {
+
+ switch (MI.getOpcode()) {
+ case AMDGPU::S_DELAY_ALU:
+ assert(OpIdx == 0);
+ printSDelayAluImm(Imm, OS);
+ break;
+ default:
+ MIRFormatter::printImm(OS, MI, OpIdx, Imm);
+ break;
+ }
+}
+
+/// Implement target specific parsing of immediate mnemonics. The mnemonic is
+/// a string with a leading dot.
+bool AMDGPUMIRFormatter::parseImmMnemonic(const unsigned OpCode,
+ const unsigned OpIdx,
+ StringRef Src, int64_t &Imm,
+ ErrorCallbackType ErrorCallback) const
+{
+
+ switch (OpCode) {
+ case AMDGPU::S_DELAY_ALU:
+ return parseSDelayAluImmMnemonic(OpIdx, Imm, Src, ErrorCallback);
+ default:
+ break;
+ }
+ return true; // Don't know what this is
+}
+
+void AMDGPUMIRFormatter::printSDelayAluImm(int64_t Imm,
+ llvm::raw_ostream &OS) const {
+ // Construct an immediate string to represent the information encoded in the
+ // s_delay_alu immediate.
+ // .id0_<dep>[_skip_<count>_id1<dep>]
+ constexpr int64_t None = 0;
+ constexpr int64_t Same = 0;
+
+ uint64_t Id0 = (Imm & 0xF);
+ uint64_t Skip = ((Imm >> 4) & 0x7);
+ uint64_t Id1 = ((Imm >> 7) & 0xF);
+ auto Outdep = [&](uint64_t Id) {
+ if (Id == None)
+ OS << "NONE";
+ else if (Id < 5)
+ OS << "VALU_DEP_" << Id;
+ else if (Id < 8)
+ OS << "TRANS32_DEP_" << Id - 4;
+ else
+ OS << "SALU_CYCLE_" << Id - 8;
+ };
+
+ OS << ".id0_";
+ Outdep(Id0);
+
+ // If the second inst is "same" and "none", no need to print the rest of the
+ // string.
+ if (Skip == Same && Id1 == None)
+ return;
+
+ // Encode the second delay specification.
+ OS << "_skip_";
+ if (Skip == 0)
+ OS << "SAME";
+ else if (Skip == 1)
+ OS << "NEXT";
+ else
+ OS << "SKIP_" << Skip - 1;
+
+ OS << "_id1_";
+ Outdep(Id1);
+}
+
+bool AMDGPUMIRFormatter::parseSDelayAluImmMnemonic(
+ const unsigned int OpIdx, int64_t &Imm, llvm::StringRef &Src,
+ llvm::MIRFormatter::ErrorCallbackType &ErrorCallback) const
+{
+ assert(OpIdx == 0);
+
+ Imm = 0;
+ bool Expected = Src.consume_front(".id0_");
+ if (!Expected)
+ return ErrorCallback(Src.begin(), "Expected .id0_");
+
+ auto ExpectInt = [&](StringRef &Src, int64_t Offset) -> int64_t {
+ int64_t Dep;
+ if (!Src.consumeInteger(10, Dep))
+ return Dep + Offset;
+
+ return -1;
+ };
+
+ auto DecodeDelay = [&](StringRef &Src) -> int64_t {
+ if (Src.consume_front("NONE"))
+ return 0;
+ if (Src.consume_front("VALU_DEP_"))
+ return ExpectInt(Src, 0);
+ if (Src.consume_front("TRANS32_DEP_"))
+ return ExpectInt(Src, 4);
+ if (Src.consume_front("SALU_CYCLE_"))
+ return ExpectInt(Src, 8);
+
+ return -1;
+ };
+
+ int64_t Delay0 = DecodeDelay(Src);
+ int64_t Skip = 0;
+ int64_t Delay1 = 0;
+ if (Delay0 == -1)
+ return ErrorCallback(Src.begin(), "Could not decode delay0");
+
+
+ // Set the Imm so far, to that early return has the correct value.
+ Imm = Delay0;
+
+ // If that was the end of the string, the second instruction is "same" and
+ // "none"
+ if (Src.begin() == Src.end())
+ return false;
+
+ Expected = Src.consume_front("_skip_");
+ if (!Expected)
+ return ErrorCallback(Src.begin(), "Expected _skip_");
+
+
+ if (Src.consume_front("SAME")) {
+ Skip = 0;
+ } else if (Src.consume_front("NEXT")) {
+ Skip = 1;
+ } else if (Src.consume_front("SKIP_")) {
+ if (Src.consumeInteger(10, Skip)) {
+ return ErrorCallback(Src.begin(), "Expected integer Skip value");
+ }
+ Skip += 1;
+ } else {
+ ErrorCallback(Src.begin(), "Unexpected Skip Value");
+ }
+
+ Expected = Src.consume_front("_id1_");
+ if (!Expected)
+ return ErrorCallback(Src.begin(), "Expected _id1_");
+
+ Delay1 = DecodeDelay(Src);
+ if (Delay1 == -1)
+ return ErrorCallback(Src.begin(), "Could not decode delay1");
+
+ Imm = Imm | (Skip << 4) | (Delay1 << 7);
+ return false;
+}
+
bool AMDGPUMIRFormatter::parseCustomPseudoSourceValue(
StringRef Src, MachineFunction &MF, PerFunctionMIParsingState &PFS,
const PseudoSourceValue *&PSV, ErrorCallbackType ErrorCallback) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
index 98b5031071cf..c5c947375252 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
@@ -28,12 +28,35 @@ public:
AMDGPUMIRFormatter() = default;
virtual ~AMDGPUMIRFormatter() = default;
+ /// Implement target specific printing for machine operand immediate value, so
+ /// that we can have more meaningful mnemonic than a 64-bit integer. Passing
+ /// None to OpIdx means the index is unknown.
+ virtual void printImm(raw_ostream &OS, const MachineInstr &MI,
+ std::optional<unsigned> OpIdx,
+ int64_t Imm) const override;
+
+ /// Implement target specific parsing of immediate mnemonics. The mnemonic is
+ /// a string with a leading dot.
+ virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx,
+ StringRef Src, int64_t &Imm,
+ ErrorCallbackType ErrorCallback) const override;
+
/// Implement target specific parsing of target custom pseudo source value.
bool
parseCustomPseudoSourceValue(StringRef Src, MachineFunction &MF,
PerFunctionMIParsingState &PFS,
const PseudoSourceValue *&PSV,
ErrorCallbackType ErrorCallback) const override;
+
+private:
+ /// Print the string to represent s_delay_alu immediate value
+ void printSDelayAluImm(int64_t Imm, llvm::raw_ostream &OS) const;
+
+ /// Parse the immediate pseudo literal for s_delay_alu
+ bool parseSDelayAluImmMnemonic(
+ const unsigned int OpIdx, int64_t &Imm, llvm::StringRef &Src,
+ llvm::MIRFormatter::ErrorCallbackType &ErrorCallback) const;
+
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index f36374b08b34..cfe9f33efc91 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -100,7 +100,7 @@ public:
bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg) const;
// Combine unsigned buffer load and signed extension instructions to generate
- // signed buffer laod instructions.
+ // signed buffer load instructions.
bool matchCombineSignExtendInReg(
MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;
void applyCombineSignExtendInReg(
@@ -465,8 +465,8 @@ void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<GISelKnownBitsAnalysis>();
AU.addPreserved<GISelKnownBitsAnalysis>();
if (!IsOptNone) {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
}
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -494,7 +494,8 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
MachineDominatorTree *MDT =
- IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+ IsOptNone ? nullptr
+ : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
LI, EnableOpt, F.hasOptSize(), F.hasMinSize());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 3f01a328afaf..4d0cb467ba37 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -238,8 +238,8 @@ void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<GISelKnownBitsAnalysis>();
AU.addPreserved<GISelKnownBitsAnalysis>();
if (!IsOptNone) {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
}
AU.addRequired<GISelCSEAnalysisWrapperPass>();
@@ -272,7 +272,8 @@ bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &STI = MF.getSubtarget<GCNSubtarget>();
MachineDominatorTree *MDT =
- IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+ IsOptNone ? nullptr
+ : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
nullptr, EnableOpt, F.hasOptSize(), F.hasMinSize());
AMDGPUPreLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo, RuleConfig,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 35abd6eddde8..74f0540239c9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -421,8 +421,8 @@ void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<GISelKnownBitsAnalysis>();
AU.addPreserved<GISelKnownBitsAnalysis>();
if (!IsOptNone) {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
}
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -449,7 +449,8 @@ bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) {
const auto *LI = ST.getLegalizerInfo();
MachineDominatorTree *MDT =
- IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+ IsOptNone ? nullptr
+ : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
LI, EnableOpt, F.hasOptSize(), F.hasMinSize());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
index 2ea03ddb1fcc..d1985f46b1c4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
@@ -33,7 +33,7 @@ StringRef AMDGPURegBankSelect::getPassName() const {
void AMDGPURegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<MachineCycleInfoWrapperPass>();
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
// TODO: Preserve DomTree
RegBankSelect::getAnalysisUsage(AU);
}
@@ -41,7 +41,7 @@ void AMDGPURegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const {
INITIALIZE_PASS_BEGIN(AMDGPURegBankSelect, "amdgpu-" DEBUG_TYPE,
"AMDGPU Register Bank Select", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(AMDGPURegBankSelect, "amdgpu-" DEBUG_TYPE,
"AMDGPU Register Bank Select", false, false)
@@ -63,7 +63,8 @@ bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
MachineCycleInfo &CycleInfo =
getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo();
- MachineDominatorTree &DomTree = getAnalysis<MachineDominatorTree>();
+ MachineDominatorTree &DomTree =
+ getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
MachineUniformityInfo Uniformity =
computeMachineUniformityInfo(MF, CycleInfo, DomTree.getBase(),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 7ebd674757fb..9e7694f41d6b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3079,7 +3079,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
- case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
applyDefaultMapping(OpdMapper);
@@ -4376,7 +4375,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
- case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
// vdata_out
@@ -4907,8 +4905,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_global_load_tr_b128:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
- case Intrinsic::amdgcn_ds_ordered_swap:
- case Intrinsic::amdgcn_ds_fadd_v2bf16: {
+ case Intrinsic::amdgcn_ds_ordered_swap: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
@@ -5221,11 +5218,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_ATOMICRMW_UMAX:
case AMDGPU::G_ATOMICRMW_UMIN:
case AMDGPU::G_ATOMICRMW_FADD:
+ case AMDGPU::G_ATOMICRMW_FMIN:
+ case AMDGPU::G_ATOMICRMW_FMAX:
case AMDGPU::G_ATOMICRMW_UINC_WRAP:
case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
- case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
- case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
- case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
+ case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 410dc83d45c5..ed5bae3e4ff6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -252,21 +252,8 @@ def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax_num>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>;
-def : SourceOfDivergence<int_amdgcn_ds_fadd>;
def : SourceOfDivergence<int_amdgcn_ds_fmin>;
def : SourceOfDivergence<int_amdgcn_ds_fmax>;
-def : SourceOfDivergence<int_amdgcn_ds_fadd_v2bf16>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_swap>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_add>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_sub>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_smin>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_umin>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_smax>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_umax>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_and>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_or>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_xor>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_sub>;
@@ -280,7 +267,6 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>;
-def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
@@ -298,7 +284,6 @@ def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd>;
-def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cmpswap>;
@@ -316,7 +301,6 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>;
-def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
@@ -334,12 +318,10 @@ def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd>;
-def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_ps_live>;
def : SourceOfDivergence<int_amdgcn_live_mask>;
def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
index 2449fa581842..3e5d83b8e3fb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
@@ -15,10 +15,9 @@
/// SplitModule: load-balance the module's functions across a set of N
/// partitions to allow parallel codegen. However, it does it very
/// differently than the target-agnostic variant:
-/// - Kernels are used as the module's "roots".
-/// They're known entry points on AMDGPU, and everything else is often
-/// internal only.
-/// - Each kernel has a set of dependencies, and when a kernel and its
+/// - The module has "split roots", which are kernels in the vast
+// majority of cases.
+/// - Each root has a set of dependencies, and when a root and its
/// dependencies is considered "big", we try to put it in a partition where
/// most dependencies are already imported, to avoid duplicating large
/// amounts of code.
@@ -67,20 +66,22 @@ using namespace llvm;
namespace {
-static cl::opt<float> LargeKernelFactor(
- "amdgpu-module-splitting-large-kernel-threshold", cl::init(2.0f),
+static cl::opt<float> LargeFnFactor(
+ "amdgpu-module-splitting-large-function-threshold", cl::init(2.0f),
cl::Hidden,
cl::desc(
- "consider a kernel as large and needing special treatment when it "
+ "consider a function as large and needing special treatment when the "
+ "cost of importing it into a partition"
"exceeds the average cost of a partition by this factor; e;g. 2.0 "
- "means if the kernel and its dependencies is 2 times bigger than "
- "an average partition; 0 disables large kernels handling entirely"));
+ "means if the function and its dependencies is 2 times bigger than "
+ "an average partition; 0 disables large functions handling entirely"));
-static cl::opt<float> LargeKernelOverlapForMerge(
- "amdgpu-module-splitting-large-kernel-merge-overlap", cl::init(0.8f),
+static cl::opt<float> LargeFnOverlapForMerge(
+ "amdgpu-module-splitting-large-function-merge-overlap", cl::init(0.8f),
cl::Hidden,
- cl::desc("defines how much overlap between two large kernel's dependencies "
- "is needed to put them in the same partition"));
+ cl::desc(
+ "defines how much overlap between two large function's dependencies "
+ "is needed to put them in the same partition"));
static cl::opt<bool> NoExternalizeGlobals(
"amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
@@ -98,6 +99,7 @@ static cl::opt<bool>
using CostType = InstructionCost::CostType;
using PartitionID = unsigned;
+using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
static bool isEntryPoint(const Function *F) {
return AMDGPU::isEntryFunctionCC(F->getCallingConv());
@@ -214,13 +216,12 @@ static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) {
/// Calculate the cost of each function in \p M
/// \param SML Log Helper
-/// \param TM TargetMachine instance used to retrieve TargetTransformInfo.
+/// \param GetTTI Abstract getter for TargetTransformInfo.
/// \param M Module to analyze.
/// \param CostMap[out] Resulting Function -> Cost map.
/// \return The module's total cost.
static CostType
-calculateFunctionCosts(SplitModuleLogger &SML, const AMDGPUTargetMachine &TM,
- Module &M,
+calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI, Module &M,
DenseMap<const Function *, CostType> &CostMap) {
CostType ModuleCost = 0;
CostType KernelCost = 0;
@@ -230,8 +231,7 @@ calculateFunctionCosts(SplitModuleLogger &SML, const AMDGPUTargetMachine &TM,
continue;
CostType FnCost = 0;
- TargetTransformInfo TTI = TM.getTargetTransformInfo(Fn);
-
+ const auto &TTI = GetTTI(Fn);
for (const auto &BB : Fn) {
for (const auto &I : BB) {
auto Cost =
@@ -277,9 +277,9 @@ static bool canBeIndirectlyCalled(const Function &F) {
/*IgnoreCastedDirectCall=*/true);
}
-/// When a kernel or any of its callees performs an indirect call, this function
+/// When a function or any of its callees performs an indirect call, this
/// takes over \ref addAllDependencies and adds all potentially callable
-/// functions to \p Fns so they can be counted as dependencies of the kernel.
+/// functions to \p Fns so they can be counted as dependencies of the function.
///
/// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the
/// presence of an indirect call, the function's resource usage is the same as
@@ -301,13 +301,14 @@ static void addAllIndirectCallDependencies(const Module &M,
/// \param CG Call graph for \p Fn's module.
/// \param Fn Current function to look at.
/// \param Fns[out] Resulting list of functions.
+/// \param OnlyDirect Whether to only consider direct callees.
/// \param HadIndirectCall[out] Set to true if an indirect call was seen at some
/// point, either in \p Fn or in one of the function it calls. When that
/// happens, we fall back to adding all callable functions inside \p Fn's module
/// to \p Fns.
static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
const Function &Fn,
- DenseSet<const Function *> &Fns,
+ DenseSet<const Function *> &Fns, bool OnlyDirect,
bool &HadIndirectCall) {
assert(!Fn.isDeclaration());
@@ -325,6 +326,9 @@ static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
auto *CGNode = CGEntry.second;
auto *Callee = CGNode->getFunction();
if (!Callee) {
+ if (OnlyDirect)
+ continue;
+
// Functions have an edge towards CallsExternalNode if they're external
// declarations, or if they do an indirect call. As we only process
// definitions here, we know this means the function has an indirect
@@ -353,13 +357,19 @@ static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
}
}
-/// Contains information about a kernel and its dependencies.
-struct KernelWithDependencies {
- KernelWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
- const DenseMap<const Function *, CostType> &FnCosts,
- const Function *Fn)
+/// Contains information about a function and its dependencies.
+/// This is a splitting root. The splitting algorithm works by
+/// assigning these to partitions.
+struct FunctionWithDependencies {
+ FunctionWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
+ const DenseMap<const Function *, CostType> &FnCosts,
+ const Function *Fn)
: Fn(Fn) {
- addAllDependencies(SML, CG, *Fn, Dependencies, HasIndirectCall);
+ // When Fn is not a kernel, we don't need to collect indirect callees.
+ // Resource usage analysis is only performed on kernels, and we collect
+ // indirect callees for resource usage analysis.
+ addAllDependencies(SML, CG, *Fn, Dependencies,
+ /*OnlyDirect*/ !isEntryPoint(Fn), HasIndirectCall);
TotalCost = FnCosts.at(Fn);
for (const auto *Dep : Dependencies) {
TotalCost += FnCosts.at(Dep);
@@ -380,8 +390,8 @@ struct KernelWithDependencies {
CostType TotalCost = 0;
- /// \returns true if this kernel and its dependencies can be considered large
- /// according to \p Threshold.
+ /// \returns true if this function and its dependencies can be considered
+ /// large according to \p Threshold.
bool isLarge(CostType Threshold) const {
return TotalCost > Threshold && !Dependencies.empty();
}
@@ -420,39 +430,39 @@ static float calculateOverlap(const DenseSet<const Function *> &A,
/// \param NumParts Number of partitions to create.
/// \param ModuleCost Total cost of all functions in \p M.
/// \param FnCosts Map of Function -> Cost
-/// \param WorkList Kernels and their dependencies to process in order.
+/// \param WorkList Functions and their dependencies to process in order.
/// \returns The created partitions (a vector of size \p NumParts )
static std::vector<DenseSet<const Function *>>
doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
CostType ModuleCost,
const DenseMap<const Function *, CostType> &FnCosts,
- const SmallVector<KernelWithDependencies> &WorkList) {
+ const SmallVector<FunctionWithDependencies> &WorkList) {
SML << "\n--Partitioning Starts--\n";
- // Calculate a "large kernel threshold". When more than one kernel's total
- // import cost exceeds this value, we will try to merge it with other,
- // similarly large kernels.
+ // Calculate a "large function threshold". When more than one function's total
+ // import cost exceeds this value, we will try to assign it to an existing
+ // partition to reduce the amount of duplication needed.
//
- // e.g. let two kernels X and Y have a import cost of ~10% of the module, we
+ // e.g. let two functions X and Y have a import cost of ~10% of the module, we
// assign X to a partition as usual, but when we get to Y, we check if it's
// worth also putting it in Y's partition.
- const CostType LargeKernelThreshold =
- LargeKernelFactor ? CostType(((ModuleCost / NumParts) * LargeKernelFactor))
- : std::numeric_limits<CostType>::max();
+ const CostType LargeFnThreshold =
+ LargeFnFactor ? CostType(((ModuleCost / NumParts) * LargeFnFactor))
+ : std::numeric_limits<CostType>::max();
std::vector<DenseSet<const Function *>> Partitions;
Partitions.resize(NumParts);
- // Assign a partition to each kernel, and try to keep the partitions more or
+ // Assign functions to partitions, and try to keep the partitions more or
// less balanced. We do that through a priority queue sorted in reverse, so we
// can always look at the partition with the least content.
//
// There are some cases where we will be deliberately unbalanced though.
- // - Large kernels: we try to merge with existing partitions to reduce code
+ // - Large functions: we try to merge with existing partitions to reduce code
// duplication.
- // - Kernels with indirect or external calls always go in the first partition
- // (P0).
+ // - Functions with indirect or external calls always go in the first
+ // partition (P0).
auto ComparePartitions = [](const std::pair<PartitionID, CostType> &a,
const std::pair<PartitionID, CostType> &b) {
// When two partitions have the same cost, assign to the one with the
@@ -471,17 +481,17 @@ doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
for (unsigned I = 0; I < NumParts; ++I)
BalancingQueue.push_back(std::make_pair(I, 0));
- // Helper function to handle assigning a kernel to a partition. This takes
+ // Helper function to handle assigning a function to a partition. This takes
// care of updating the balancing queue.
const auto AssignToPartition = [&](PartitionID PID,
- const KernelWithDependencies &KWD) {
+ const FunctionWithDependencies &FWD) {
auto &FnsInPart = Partitions[PID];
- FnsInPart.insert(KWD.Fn);
- FnsInPart.insert(KWD.Dependencies.begin(), KWD.Dependencies.end());
+ FnsInPart.insert(FWD.Fn);
+ FnsInPart.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
- SML << "assign " << getName(*KWD.Fn) << " to P" << PID << "\n -> ";
- if (!KWD.Dependencies.empty()) {
- SML << KWD.Dependencies.size() << " dependencies added\n";
+ SML << "assign " << getName(*FWD.Fn) << " to P" << PID << "\n -> ";
+ if (!FWD.Dependencies.empty()) {
+ SML << FWD.Dependencies.size() << " dependencies added\n";
};
// Update the balancing queue. we scan backwards because in the common case
@@ -506,44 +516,43 @@ doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
sort(BalancingQueue, ComparePartitions);
};
- for (auto &CurKernel : WorkList) {
- // When a kernel has indirect calls, it must stay in the first partition
+ for (auto &CurFn : WorkList) {
+ // When a function has indirect calls, it must stay in the first partition
// alongside every reachable non-entry function. This is a nightmare case
// for splitting as it severely limits what we can do.
- if (CurKernel.HasIndirectCall) {
- SML << "Kernel with indirect call(s): " << getName(*CurKernel.Fn)
+ if (CurFn.HasIndirectCall) {
+ SML << "Function with indirect call(s): " << getName(*CurFn.Fn)
<< " defaulting to P0\n";
- AssignToPartition(0, CurKernel);
+ AssignToPartition(0, CurFn);
continue;
}
- // When a kernel has non duplicatable dependencies, we have to keep it in
+ // When a function has non duplicatable dependencies, we have to keep it in
// the first partition as well. This is a conservative approach, a
// finer-grained approach could keep track of which dependencies are
// non-duplicatable exactly and just make sure they're grouped together.
- if (CurKernel.HasNonDuplicatableDependecy) {
- SML << "Kernel with externally visible dependency "
- << getName(*CurKernel.Fn) << " defaulting to P0\n";
- AssignToPartition(0, CurKernel);
+ if (CurFn.HasNonDuplicatableDependecy) {
+ SML << "Function with externally visible dependency "
+ << getName(*CurFn.Fn) << " defaulting to P0\n";
+ AssignToPartition(0, CurFn);
continue;
}
- // Be smart with large kernels to avoid duplicating their dependencies.
- if (CurKernel.isLarge(LargeKernelThreshold)) {
- assert(LargeKernelOverlapForMerge >= 0.0f &&
- LargeKernelOverlapForMerge <= 1.0f);
- SML << "Large Kernel: " << getName(*CurKernel.Fn)
+ // Be smart with large functions to avoid duplicating their dependencies.
+ if (CurFn.isLarge(LargeFnThreshold)) {
+ assert(LargeFnOverlapForMerge >= 0.0f && LargeFnOverlapForMerge <= 1.0f);
+ SML << "Large Function: " << getName(*CurFn.Fn)
<< " - looking for partition with at least "
- << format("%0.2f", LargeKernelOverlapForMerge * 100) << "% overlap\n";
+ << format("%0.2f", LargeFnOverlapForMerge * 100) << "% overlap\n";
bool Assigned = false;
for (const auto &[PID, Fns] : enumerate(Partitions)) {
- float Overlap = calculateOverlap(CurKernel.Dependencies, Fns);
+ float Overlap = calculateOverlap(CurFn.Dependencies, Fns);
SML << " => " << format("%0.2f", Overlap * 100) << "% overlap with P"
<< PID << '\n';
- if (Overlap > LargeKernelOverlapForMerge) {
+ if (Overlap > LargeFnOverlapForMerge) {
SML << " selecting P" << PID << '\n';
- AssignToPartition(PID, CurKernel);
+ AssignToPartition(PID, CurFn);
Assigned = true;
}
}
@@ -554,41 +563,34 @@ doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
// Normal "load-balancing", assign to partition with least pressure.
auto [PID, CurCost] = BalancingQueue.back();
- AssignToPartition(PID, CurKernel);
+ AssignToPartition(PID, CurFn);
}
- // Work is mostly done now, verify the partioning and add all functions we may
- // have missed (= unreachable, or we don't understand how they're reached) to
- // P0.
- DenseSet<const Function *> AllFunctions;
- for (const auto &[Idx, Part] : enumerate(Partitions)) {
- CostType Cost = 0;
- for (auto *Fn : Part) {
- // external linkage functions should exclusively be in the first partition
- // at this stage. In theory, we should only ever see external linkage
- // functions here if they're kernels, or if they've been added due to a
- // kernel using indirect calls somewhere in its CallGraph.
- assert(Idx == 0 || (!Fn->hasExternalLinkage() || isEntryPoint(Fn)));
- Cost += FnCosts.at(Fn);
+ if (SML) {
+ for (const auto &[Idx, Part] : enumerate(Partitions)) {
+ CostType Cost = 0;
+ for (auto *Fn : Part)
+ Cost += FnCosts.at(Fn);
+ SML << "P" << Idx << " has a total cost of " << Cost << " ("
+ << format("%0.2f", (float(Cost) / ModuleCost) * 100)
+ << "% of source module)\n";
}
- SML << "P" << Idx << " has a total cost of " << Cost << " ("
- << format("%0.2f", (float(Cost) / ModuleCost) * 100)
- << "% of source module)\n";
- AllFunctions.insert(Part.begin(), Part.end());
+
+ SML << "--Partitioning Done--\n\n";
}
- // Add missed functions to P0. This will take care of adding things like
- // external functions with no callers in the module to P0. This should be
- // fairly rare as AMDGPU internalizes everything in most cases, so unused
- // internal functions would get removed.
+ // Check no functions were missed.
+#ifndef NDEBUG
+ DenseSet<const Function *> AllFunctions;
+ for (const auto &Part : Partitions)
+ AllFunctions.insert(Part.begin(), Part.end());
+
for (auto &Fn : M) {
if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) {
- SML << getName(Fn) << " has no partition assigned, defaulting to P0\n";
- Partitions[0].insert(&Fn);
+ assert(AllFunctions.contains(&Fn) && "Missed a function?!");
}
}
-
- SML << "--Partitioning Done--\n\n";
+#endif
return Partitions;
}
@@ -604,10 +606,17 @@ static void externalize(GlobalValue &GV) {
if (!GV.hasName())
GV.setName("__llvmsplit_unnamed");
}
-} // end anonymous namespace
-void llvm::splitAMDGPUModule(
- const AMDGPUTargetMachine &TM, Module &M, unsigned N,
+static bool hasDirectCaller(const Function &Fn) {
+ for (auto &U : Fn.uses()) {
+ if (auto *CB = dyn_cast<CallBase>(U.getUser()); CB && CB->isCallee(&U))
+ return true;
+ }
+ return false;
+}
+
+static void splitAMDGPUModule(
+ GetTTIFn GetTTI, Module &M, unsigned N,
function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
SplitModuleLogger SML(M);
@@ -648,15 +657,36 @@ void llvm::splitAMDGPUModule(
// Start by calculating the cost of every function in the module, as well as
// the module's overall cost.
DenseMap<const Function *, CostType> FnCosts;
- const CostType ModuleCost = calculateFunctionCosts(SML, TM, M, FnCosts);
+ const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, FnCosts);
- // Gather every kernel into a WorkList, then sort it by descending total cost
- // of the kernel so the biggest kernels are seen first.
- SmallVector<KernelWithDependencies> WorkList;
+ // First, gather ever kernel into the worklist.
+ SmallVector<FunctionWithDependencies> WorkList;
for (auto &Fn : M) {
if (isEntryPoint(&Fn) && !Fn.isDeclaration())
WorkList.emplace_back(SML, CG, FnCosts, &Fn);
}
+
+ // Then, find missing functions that need to be considered as additional
+ // roots. These can't be called in theory, but in practice we still have to
+ // handle them to avoid linker errors.
+ {
+ DenseSet<const Function *> SeenFunctions;
+ for (const auto &FWD : WorkList) {
+ SeenFunctions.insert(FWD.Fn);
+ SeenFunctions.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
+ }
+
+ for (auto &Fn : M) {
+ // If this function is not part of any kernel's dependencies and isn't
+ // directly called, consider it as a root.
+ if (!Fn.isDeclaration() && !isEntryPoint(&Fn) &&
+ !SeenFunctions.count(&Fn) && !hasDirectCaller(Fn)) {
+ WorkList.emplace_back(SML, CG, FnCosts, &Fn);
+ }
+ }
+ }
+
+ // Sort the worklist so the most expensive roots are seen first.
sort(WorkList, [&](auto &A, auto &B) {
// Sort by total cost, and if the total cost is identical, sort
// alphabetically.
@@ -667,13 +697,20 @@ void llvm::splitAMDGPUModule(
if (SML) {
SML << "Worklist\n";
- for (const auto &KWD : WorkList) {
- SML << "[Kernel] " << getName(*KWD.Fn) << " (totalCost:" << KWD.TotalCost
- << " indirect:" << KWD.HasIndirectCall
- << " hasNonDuplicatableDep:" << KWD.HasNonDuplicatableDependecy
+ for (const auto &FWD : WorkList) {
+ SML << "[root] " << getName(*FWD.Fn) << " (totalCost:" << FWD.TotalCost
+ << " indirect:" << FWD.HasIndirectCall
+ << " hasNonDuplicatableDep:" << FWD.HasNonDuplicatableDependecy
<< ")\n";
- for (const auto *Dep : KWD.Dependencies)
- SML << " [Dep] " << getName(*Dep) << '\n';
+ // Sort function names before printing to ensure determinism.
+ SmallVector<std::string> SortedDepNames;
+ SortedDepNames.reserve(FWD.Dependencies.size());
+ for (const auto *Dep : FWD.Dependencies)
+ SortedDepNames.push_back(getName(*Dep));
+ sort(SortedDepNames);
+
+ for (const auto &Name : SortedDepNames)
+ SML << " [dependency] " << Name << '\n';
}
}
@@ -700,16 +737,8 @@ void llvm::splitAMDGPUModule(
std::unique_ptr<Module> MPart(
CloneModule(M, VMap, [&](const GlobalValue *GV) {
// Functions go in their assigned partition.
- if (const auto *Fn = dyn_cast<Function>(GV)) {
-// Check we don't import an external linkage function in any
-// partition other than P0.
-#ifndef NDEBUG
- if (Fn->hasExternalLinkage() && !isEntryPoint(Fn)) {
- assert((I == 0) == FnsInPart.contains(Fn));
- }
-#endif
+ if (const auto *Fn = dyn_cast<Function>(GV))
return FnsInPart.contains(Fn);
- }
if (NeedsConservativeImport(GV))
return true;
@@ -742,3 +771,16 @@ void llvm::splitAMDGPUModule(
<< format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100)
<< "% of original module)\n";
}
+} // namespace
+
+PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ FunctionAnalysisManager &FAM =
+ MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
+ return FAM.getResult<TargetIRAnalysis>(F);
+ };
+ splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
+ // We don't change the original module.
+ return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h
index 6171643bd4ad..d814dedd6f0c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h
@@ -12,18 +12,27 @@
#define LLVM_TARGET_AMDGPUSPLITMODULE_H
#include "llvm/ADT/STLFunctionalExtras.h"
+#include "llvm/IR/PassManager.h"
#include <memory>
namespace llvm {
-class Module;
-class AMDGPUTargetMachine;
-
/// Splits the module M into N linkable partitions. The function ModuleCallback
/// is called N times passing each individual partition as the MPart argument.
-void splitAMDGPUModule(
- const AMDGPUTargetMachine &TM, Module &M, unsigned N,
- function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback);
+class AMDGPUSplitModulePass : public PassInfoMixin<AMDGPUSplitModulePass> {
+public:
+ using ModuleCreationCallback =
+ function_ref<void(std::unique_ptr<Module> MPart)>;
+
+ AMDGPUSplitModulePass(unsigned N, ModuleCreationCallback ModuleCallback)
+ : N(N), ModuleCallback(ModuleCallback) {}
+
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
+
+private:
+ unsigned N;
+ ModuleCreationCallback ModuleCallback;
+};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 0751c8dc8b8b..a8e26f104f58 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -1104,6 +1104,9 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
if (hasFlatScratchInit())
NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
+
+ if (hasPrivateSegmentSize())
+ NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
}
void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ce997c659094..9162e110aa10 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -658,8 +658,7 @@ Error AMDGPUTargetMachine::buildCodeGenPipeline(
return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
}
-void AMDGPUTargetMachine::registerPassBuilderCallbacks(
- PassBuilder &PB, bool PopulateClassToPassNames) {
+void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"
@@ -829,8 +828,24 @@ AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
bool AMDGPUTargetMachine::splitModule(
Module &M, unsigned NumParts,
- function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) const {
- splitAMDGPUModule(*this, M, NumParts, ModuleCallback);
+ function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
+ // FIXME(?): Would be better to use an already existing Analysis/PassManager,
+ // but all current users of this API don't have one ready and would need to
+ // create one anyway. Let's hide the boilerplate for now to keep it simple.
+
+ LoopAnalysisManager LAM;
+ FunctionAnalysisManager FAM;
+ CGSCCAnalysisManager CGAM;
+ ModuleAnalysisManager MAM;
+
+ PassBuilder PB(this);
+ PB.registerModuleAnalyses(MAM);
+ PB.registerFunctionAnalyses(FAM);
+ PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+ ModulePassManager MPM;
+ MPM.addPass(AMDGPUSplitModulePass(NumParts, ModuleCallback));
+ MPM.run(M, MAM);
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 2cfd232483a8..0f74fbc22fa8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -58,8 +58,7 @@ public:
const CGPassBuilderOption &Opts,
PassInstrumentationCallbacks *PIC) override;
- void registerPassBuilderCallbacks(PassBuilder &PB,
- bool PopulateClassToPassNames) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB) override;
void registerDefaultAliasAnalyses(AAManager &) override;
/// Get the integer value of a null pointer in the given address space.
@@ -76,7 +75,7 @@ public:
bool splitModule(Module &M, unsigned NumParts,
function_ref<void(std::unique_ptr<Module> MPart)>
- ModuleCallback) const override;
+ ModuleCallback) override;
};
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 437e01c37c6b..1192b49fd1f0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -502,7 +502,6 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
switch (Inst->getIntrinsicID()) {
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
- case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
@@ -1019,7 +1018,6 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
Intrinsic::ID IID) const {
switch (IID) {
- case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax:
case Intrinsic::amdgcn_is_shared:
@@ -1041,7 +1039,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
Value *NewV) const {
auto IntrID = II->getIntrinsicID();
switch (IntrID) {
- case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index bdb5a8d9a0a0..b08957d22ee7 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1314,6 +1314,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
/// }
private:
+ void createConstantSymbol(StringRef Id, int64_t Val);
+
bool ParseAsAbsoluteExpression(uint32_t &Ret);
bool OutOfRangeError(SMRange Range);
/// Calculate VGPR/SGPR blocks required for given target, reserved
@@ -1331,12 +1333,12 @@ private:
/// \param SGPRRange [in] Token range, used for SGPR diagnostics.
/// \param VGPRBlocks [out] Result VGPR block count.
/// \param SGPRBlocks [out] Result SGPR block count.
- bool calculateGPRBlocks(const FeatureBitset &Features, bool VCCUsed,
- bool FlatScrUsed, bool XNACKUsed,
+ bool calculateGPRBlocks(const FeatureBitset &Features, const MCExpr *VCCUsed,
+ const MCExpr *FlatScrUsed, bool XNACKUsed,
std::optional<bool> EnableWavefrontSize32,
- unsigned NextFreeVGPR, SMRange VGPRRange,
- unsigned NextFreeSGPR, SMRange SGPRRange,
- unsigned &VGPRBlocks, unsigned &SGPRBlocks);
+ const MCExpr *NextFreeVGPR, SMRange VGPRRange,
+ const MCExpr *NextFreeSGPR, SMRange SGPRRange,
+ const MCExpr *&VGPRBlocks, const MCExpr *&SGPRBlocks);
bool ParseDirectiveAMDGCNTarget();
bool ParseDirectiveAMDHSACodeObjectVersion();
bool ParseDirectiveAMDHSAKernel();
@@ -1408,36 +1410,28 @@ public:
setAvailableFeatures(ComputeAvailableFeatures(getFeatureBits()));
- {
- // TODO: make those pre-defined variables read-only.
- // Currently there is none suitable machinery in the core llvm-mc for this.
- // MCSymbol::isRedefinable is intended for another purpose, and
- // AsmParser::parseDirectiveSet() cannot be specialized for specific target.
- AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
- MCContext &Ctx = getContext();
- if (ISA.Major >= 6 && isHsaAbi(getSTI())) {
- MCSymbol *Sym =
- Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number"));
- Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
- Sym = Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_minor"));
- Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx));
- Sym = Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_stepping"));
- Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
- } else {
- MCSymbol *Sym =
- Ctx.getOrCreateSymbol(Twine(".option.machine_version_major"));
- Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
- Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor"));
- Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx));
- Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
- Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
- }
- if (ISA.Major >= 6 && isHsaAbi(getSTI())) {
- initializeGprCountSymbol(IS_VGPR);
- initializeGprCountSymbol(IS_SGPR);
- } else
- KernelScope.initialize(getContext());
+ AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
+ if (ISA.Major >= 6 && isHsaAbi(getSTI())) {
+ createConstantSymbol(".amdgcn.gfx_generation_number", ISA.Major);
+ createConstantSymbol(".amdgcn.gfx_generation_minor", ISA.Minor);
+ createConstantSymbol(".amdgcn.gfx_generation_stepping", ISA.Stepping);
+ } else {
+ createConstantSymbol(".option.machine_version_major", ISA.Major);
+ createConstantSymbol(".option.machine_version_minor", ISA.Minor);
+ createConstantSymbol(".option.machine_version_stepping", ISA.Stepping);
}
+ if (ISA.Major >= 6 && isHsaAbi(getSTI())) {
+ initializeGprCountSymbol(IS_VGPR);
+ initializeGprCountSymbol(IS_SGPR);
+ } else
+ KernelScope.initialize(getContext());
+
+ for (auto [Symbol, Code] : AMDGPU::UCVersion::getGFXVersions())
+ createConstantSymbol(Symbol, Code);
+
+ createConstantSymbol("UC_VERSION_W64_BIT", 0x2000);
+ createConstantSymbol("UC_VERSION_W32_BIT", 0x4000);
+ createConstantSymbol("UC_VERSION_MDP_BIT", 0x8000);
}
bool hasMIMG_R128() const {
@@ -2486,6 +2480,16 @@ bool AMDGPUOperand::isInlineValue() const {
// AsmParser
//===----------------------------------------------------------------------===//
+void AMDGPUAsmParser::createConstantSymbol(StringRef Id, int64_t Val) {
+ // TODO: make those pre-defined variables read-only.
+ // Currently there is none suitable machinery in the core llvm-mc for this.
+ // MCSymbol::isRedefinable is intended for another purpose, and
+ // AsmParser::parseDirectiveSet() cannot be specialized for specific target.
+ MCContext &Ctx = getContext();
+ MCSymbol *Sym = Ctx.getOrCreateSymbol(Id);
+ Sym->setVariableValue(MCConstantExpr::create(Val, Ctx));
+}
+
static int getRegClass(RegisterKind Is, unsigned RegWidth) {
if (Is == IS_VGPR) {
switch (RegWidth) {
@@ -5352,41 +5356,64 @@ bool AMDGPUAsmParser::OutOfRangeError(SMRange Range) {
}
bool AMDGPUAsmParser::calculateGPRBlocks(
- const FeatureBitset &Features, bool VCCUsed, bool FlatScrUsed,
- bool XNACKUsed, std::optional<bool> EnableWavefrontSize32,
- unsigned NextFreeVGPR, SMRange VGPRRange, unsigned NextFreeSGPR,
- SMRange SGPRRange, unsigned &VGPRBlocks, unsigned &SGPRBlocks) {
+ const FeatureBitset &Features, const MCExpr *VCCUsed,
+ const MCExpr *FlatScrUsed, bool XNACKUsed,
+ std::optional<bool> EnableWavefrontSize32, const MCExpr *NextFreeVGPR,
+ SMRange VGPRRange, const MCExpr *NextFreeSGPR, SMRange SGPRRange,
+ const MCExpr *&VGPRBlocks, const MCExpr *&SGPRBlocks) {
// TODO(scott.linder): These calculations are duplicated from
// AMDGPUAsmPrinter::getSIProgramInfo and could be unified.
IsaVersion Version = getIsaVersion(getSTI().getCPU());
+ MCContext &Ctx = getContext();
- unsigned NumVGPRs = NextFreeVGPR;
- unsigned NumSGPRs = NextFreeSGPR;
+ const MCExpr *NumSGPRs = NextFreeSGPR;
+ int64_t EvaluatedSGPRs;
if (Version.Major >= 10)
- NumSGPRs = 0;
+ NumSGPRs = MCConstantExpr::create(0, Ctx);
else {
unsigned MaxAddressableNumSGPRs =
IsaInfo::getAddressableNumSGPRs(&getSTI());
- if (Version.Major >= 8 && !Features.test(FeatureSGPRInitBug) &&
- NumSGPRs > MaxAddressableNumSGPRs)
+ if (NumSGPRs->evaluateAsAbsolute(EvaluatedSGPRs) && Version.Major >= 8 &&
+ !Features.test(FeatureSGPRInitBug) &&
+ static_cast<uint64_t>(EvaluatedSGPRs) > MaxAddressableNumSGPRs)
return OutOfRangeError(SGPRRange);
- NumSGPRs +=
- IsaInfo::getNumExtraSGPRs(&getSTI(), VCCUsed, FlatScrUsed, XNACKUsed);
+ const MCExpr *ExtraSGPRs =
+ AMDGPUMCExpr::createExtraSGPRs(VCCUsed, FlatScrUsed, XNACKUsed, Ctx);
+ NumSGPRs = MCBinaryExpr::createAdd(NumSGPRs, ExtraSGPRs, Ctx);
- if ((Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) &&
- NumSGPRs > MaxAddressableNumSGPRs)
+ if (NumSGPRs->evaluateAsAbsolute(EvaluatedSGPRs) &&
+ (Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) &&
+ static_cast<uint64_t>(EvaluatedSGPRs) > MaxAddressableNumSGPRs)
return OutOfRangeError(SGPRRange);
if (Features.test(FeatureSGPRInitBug))
- NumSGPRs = IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
- }
+ NumSGPRs =
+ MCConstantExpr::create(IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG, Ctx);
+ }
+
+ // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
+ // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
+ auto GetNumGPRBlocks = [&Ctx](const MCExpr *NumGPR,
+ unsigned Granule) -> const MCExpr * {
+ const MCExpr *OneConst = MCConstantExpr::create(1ul, Ctx);
+ const MCExpr *GranuleConst = MCConstantExpr::create(Granule, Ctx);
+ const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
+ const MCExpr *AlignToGPR =
+ AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
+ const MCExpr *DivGPR =
+ MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
+ const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
+ return SubGPR;
+ };
- VGPRBlocks = IsaInfo::getEncodedNumVGPRBlocks(&getSTI(), NumVGPRs,
- EnableWavefrontSize32);
- SGPRBlocks = IsaInfo::getNumSGPRBlocks(&getSTI(), NumSGPRs);
+ VGPRBlocks = GetNumGPRBlocks(
+ NextFreeVGPR,
+ IsaInfo::getVGPREncodingGranule(&getSTI(), EnableWavefrontSize32));
+ SGPRBlocks =
+ GetNumGPRBlocks(NumSGPRs, IsaInfo::getSGPREncodingGranule(&getSTI()));
return false;
}
@@ -5410,14 +5437,17 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
IsaVersion IVersion = getIsaVersion(getSTI().getCPU());
+ const MCExpr *ZeroExpr = MCConstantExpr::create(0, getContext());
+ const MCExpr *OneExpr = MCConstantExpr::create(1, getContext());
+
SMRange VGPRRange;
- uint64_t NextFreeVGPR = 0;
- uint64_t AccumOffset = 0;
+ const MCExpr *NextFreeVGPR = ZeroExpr;
+ const MCExpr *AccumOffset = MCConstantExpr::create(0, getContext());
uint64_t SharedVGPRCount = 0;
uint64_t PreloadLength = 0;
uint64_t PreloadOffset = 0;
SMRange SGPRRange;
- uint64_t NextFreeSGPR = 0;
+ const MCExpr *NextFreeSGPR = ZeroExpr;
// Count the number of user SGPRs implied from the enabled feature bits.
unsigned ImpliedUserSGPRCount = 0;
@@ -5425,8 +5455,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
// Track if the asm explicitly contains the directive for the user SGPR
// count.
std::optional<unsigned> ExplicitUserSGPRCount;
- bool ReserveVCC = true;
- bool ReserveFlatScr = true;
+ const MCExpr *ReserveVCC = OneExpr;
+ const MCExpr *ReserveFlatScr = OneExpr;
std::optional<bool> EnableWavefrontSize32;
while (true) {
@@ -5620,34 +5650,29 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID, ExprVal,
ValRange);
} else if (ID == ".amdhsa_next_free_vgpr") {
- EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
VGPRRange = ValRange;
- NextFreeVGPR = Val;
+ NextFreeVGPR = ExprVal;
} else if (ID == ".amdhsa_next_free_sgpr") {
- EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
SGPRRange = ValRange;
- NextFreeSGPR = Val;
+ NextFreeSGPR = ExprVal;
} else if (ID == ".amdhsa_accum_offset") {
if (!isGFX90A())
return Error(IDRange.Start, "directive requires gfx90a+", IDRange);
- EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
- AccumOffset = Val;
+ AccumOffset = ExprVal;
} else if (ID == ".amdhsa_reserve_vcc") {
- EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
- if (!isUInt<1>(Val))
+ if (EvaluatableExpr && !isUInt<1>(Val))
return OutOfRangeError(ValRange);
- ReserveVCC = Val;
+ ReserveVCC = ExprVal;
} else if (ID == ".amdhsa_reserve_flat_scratch") {
- EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
if (IVersion.Major < 7)
return Error(IDRange.Start, "directive requires gfx7+", IDRange);
if (hasArchitectedFlatScratch())
return Error(IDRange.Start,
"directive is not supported with architected flat scratch",
IDRange);
- if (!isUInt<1>(Val))
+ if (EvaluatableExpr && !isUInt<1>(Val))
return OutOfRangeError(ValRange);
- ReserveFlatScr = Val;
+ ReserveFlatScr = ExprVal;
} else if (ID == ".amdhsa_reserve_xnack_mask") {
if (IVersion.Major < 8)
return Error(IDRange.Start, "directive requires gfx8+", IDRange);
@@ -5771,8 +5796,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
if (!Seen.contains(".amdhsa_next_free_sgpr"))
return TokError(".amdhsa_next_free_sgpr directive is required");
- unsigned VGPRBlocks;
- unsigned SGPRBlocks;
+ const MCExpr *VGPRBlocks;
+ const MCExpr *SGPRBlocks;
if (calculateGPRBlocks(getFeatureBits(), ReserveVCC, ReserveFlatScr,
getTargetStreamer().getTargetID()->isXnackOnOrAny(),
EnableWavefrontSize32, NextFreeVGPR,
@@ -5780,19 +5805,26 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
SGPRBlocks))
return true;
- if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH>(
- VGPRBlocks))
+ int64_t EvaluatedVGPRBlocks;
+ bool VGPRBlocksEvaluatable =
+ VGPRBlocks->evaluateAsAbsolute(EvaluatedVGPRBlocks);
+ if (VGPRBlocksEvaluatable &&
+ !isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH>(
+ static_cast<uint64_t>(EvaluatedVGPRBlocks))) {
return OutOfRangeError(VGPRRange);
+ }
AMDGPU::MCKernelDescriptor::bits_set(
- KD.compute_pgm_rsrc1, MCConstantExpr::create(VGPRBlocks, getContext()),
+ KD.compute_pgm_rsrc1, VGPRBlocks,
COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT,
COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT, getContext());
- if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH>(
- SGPRBlocks))
+ int64_t EvaluatedSGPRBlocks;
+ if (SGPRBlocks->evaluateAsAbsolute(EvaluatedSGPRBlocks) &&
+ !isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH>(
+ static_cast<uint64_t>(EvaluatedSGPRBlocks)))
return OutOfRangeError(SGPRRange);
AMDGPU::MCKernelDescriptor::bits_set(
- KD.compute_pgm_rsrc1, MCConstantExpr::create(SGPRBlocks, getContext()),
+ KD.compute_pgm_rsrc1, SGPRBlocks,
COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT,
COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT, getContext());
@@ -5822,16 +5854,28 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
if (isGFX90A()) {
if (!Seen.contains(".amdhsa_accum_offset"))
return TokError(".amdhsa_accum_offset directive is required");
- if (AccumOffset < 4 || AccumOffset > 256 || (AccumOffset & 3))
+ int64_t EvaluatedAccum;
+ bool AccumEvaluatable = AccumOffset->evaluateAsAbsolute(EvaluatedAccum);
+ uint64_t UEvaluatedAccum = EvaluatedAccum;
+ if (AccumEvaluatable &&
+ (UEvaluatedAccum < 4 || UEvaluatedAccum > 256 || (UEvaluatedAccum & 3)))
return TokError("accum_offset should be in range [4..256] in "
"increments of 4");
- if (AccumOffset > alignTo(std::max((uint64_t)1, NextFreeVGPR), 4))
+
+ int64_t EvaluatedNumVGPR;
+ if (NextFreeVGPR->evaluateAsAbsolute(EvaluatedNumVGPR) &&
+ AccumEvaluatable &&
+ UEvaluatedAccum >
+ alignTo(std::max((uint64_t)1, (uint64_t)EvaluatedNumVGPR), 4))
return TokError("accum_offset exceeds total VGPR allocation");
- MCKernelDescriptor::bits_set(
- KD.compute_pgm_rsrc3,
- MCConstantExpr::create(AccumOffset / 4 - 1, getContext()),
- COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
- COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, getContext());
+ const MCExpr *AdjustedAccum = MCBinaryExpr::createSub(
+ MCBinaryExpr::createDiv(
+ AccumOffset, MCConstantExpr::create(4, getContext()), getContext()),
+ MCConstantExpr::create(1, getContext()), getContext());
+ MCKernelDescriptor::bits_set(KD.compute_pgm_rsrc3, AdjustedAccum,
+ COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
+ COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
+ getContext());
}
if (IVersion.Major >= 10 && IVersion.Major < 12) {
@@ -5840,7 +5884,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
return TokError("shared_vgpr_count directive not valid on "
"wavefront size 32");
}
- if (SharedVGPRCount * 2 + VGPRBlocks > 63) {
+
+ if (VGPRBlocksEvaluatable &&
+ (SharedVGPRCount * 2 + static_cast<uint64_t>(EvaluatedVGPRBlocks) >
+ 63)) {
return TokError("shared_vgpr_count*2 + "
"compute_pgm_rsrc1.GRANULATED_WORKITEM_VGPR_COUNT cannot "
"exceed 63\n");
@@ -8353,7 +8400,7 @@ void AMDGPUAsmParser::onBeginOfFile() {
/// max(expr, ...)
///
bool AMDGPUAsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
- using AGVK = AMDGPUVariadicMCExpr::VariadicKind;
+ using AGVK = AMDGPUMCExpr::VariantKind;
if (isToken(AsmToken::Identifier)) {
StringRef TokenId = getTokenStr();
@@ -8383,7 +8430,7 @@ bool AMDGPUAsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
"mismatch of commas in " + Twine(TokenId) + " expression");
return true;
}
- Res = AMDGPUVariadicMCExpr::create(VK, Exprs, getContext());
+ Res = AMDGPUMCExpr::create(VK, Exprs, getContext());
return false;
}
const MCExpr *Expr;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index b05834e5803a..3b8d94b74400 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -399,12 +399,10 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> :
class getLdStVDataRegisterOperand<RegisterClass RC, bit isTFE> {
RegisterOperand tfeVDataOp =
- !if(!eq(RC.Size, 32), AVLdSt_64,
- !if(!eq(RC.Size, 64), AVLdSt_96,
- !if(!eq(RC.Size, 96), AVLdSt_128,
- !if(!eq(RC.Size, 128), AVLdSt_160,
- RegisterOperand<VReg_1> // Invalid register.
- ))));
+ !cond(!eq(RC.Size, 32) : AVLdSt_64,
+ !eq(RC.Size, 64) : AVLdSt_96,
+ !eq(RC.Size, 96) : AVLdSt_128,
+ !eq(RC.Size, 128) : AVLdSt_160);
RegisterOperand ret = !if(isTFE, tfeVDataOp, getLdStRegisterOperand<RC>.ret);
}
@@ -534,7 +532,7 @@ multiclass MUBUF_Pseudo_Load_Pats_Common<string BaseInst, ValueType load_vt = i3
}
multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag>{
- let SubtargetPredicate = HasUnrestrictedSOffset in {
+ let OtherPredicates = [HasUnrestrictedSOffset] in {
defm : MUBUF_Pseudo_Load_Pats_Common<BaseInst, load_vt, ld>;
}
defm : MUBUF_Pseudo_Load_Pats_Common<BaseInst # "_VBUFFER", load_vt, ld>;
@@ -631,7 +629,7 @@ multiclass MUBUF_Pseudo_Store_Pats_Common<string BaseInst, ValueType store_vt =
}
multiclass MUBUF_Pseudo_Store_Pats<string BaseInst, ValueType store_vt = i32, SDPatternOperator st = null_frag> {
- let SubtargetPredicate = HasUnrestrictedSOffset in {
+ let OtherPredicates = [HasUnrestrictedSOffset] in {
defm : MUBUF_Pseudo_Store_Pats_Common<BaseInst, store_vt, st>;
}
defm : MUBUF_Pseudo_Store_Pats_Common<BaseInst # "_VBUFFER", store_vt, st>;
@@ -1151,27 +1149,21 @@ let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <
"buffer_atomic_fcmpswap", VReg_64, v2f32, null_frag
>;
+}
+
+let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in {
defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics <
"buffer_atomic_fmin", VGPR_32, f32, null_frag
>;
defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics <
"buffer_atomic_fmax", VGPR_32, f32, null_frag
>;
-
}
let SubtargetPredicate = isGFX6GFX7GFX10 in {
-
defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <
"buffer_atomic_fcmpswap_x2", VReg_128, v2f64, null_frag
>;
-defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_fmin_x2", VReg_64, f64, null_frag
->;
-defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_fmax_x2", VReg_64, f64, null_frag
->;
-
}
let SubtargetPredicate = HasD16LoadStore in {
@@ -1235,12 +1227,12 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
"buffer_atomic_pk_add_f16", VGPR_32, v2f16
>;
-let OtherPredicates = [HasAtomicFaddRtnInsts] in
+let SubtargetPredicate = HasAtomicFaddRtnInsts in
defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN<
"buffer_atomic_add_f32", VGPR_32, f32, null_frag
>;
-let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in
+let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN <
"buffer_atomic_pk_add_f16", VGPR_32, v2f16, null_frag
>;
@@ -1249,7 +1241,9 @@ let SubtargetPredicate = isGFX12Plus in {
defm BUFFER_ATOMIC_COND_SUB_U32 : MUBUF_Pseudo_Atomics <
"buffer_atomic_cond_sub_u32", VGPR_32, i32
>;
+}
+let SubtargetPredicate = HasAtomicBufferPkAddBF16Inst in {
let FPAtomic = 1 in
defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Pseudo_Atomics <
"buffer_atomic_pk_add_bf16", VGPR_32, v2bf16
@@ -1320,6 +1314,9 @@ let SubtargetPredicate = isGFX90APlus in {
let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64>;
+
+ // Note the names can be buffer_atomic_fmin_x2/buffer_atomic_fmax_x2
+ // depending on some subtargets.
defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64>;
defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64>;
} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
@@ -1421,18 +1418,22 @@ let OtherPredicates = [HasPackedD16VMem] in {
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i16, "BUFFER_LOAD_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i32, "BUFFER_LOAD_DWORD">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i16, "BUFFER_LOAD_DWORD">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f16, "BUFFER_LOAD_DWORD">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i16, "BUFFER_LOAD_DWORDX2">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f16, "BUFFER_LOAD_DWORDX2">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3f32, "BUFFER_LOAD_DWORDX3">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3i32, "BUFFER_LOAD_DWORDX3">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i32, "BUFFER_LOAD_DWORDX4">;
+foreach vt = Reg32Types.types in {
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, vt, "BUFFER_LOAD_DWORD">;
+}
+
+foreach vt = Reg64Types.types in {
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, vt, "BUFFER_LOAD_DWORDX2">;
+}
+
+foreach vt = Reg96Types.types in {
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, vt, "BUFFER_LOAD_DWORDX3">;
+}
+
+foreach vt = Reg128Types.types in {
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, vt, "BUFFER_LOAD_DWORDX4">;
+}
+
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_byte, i32, "BUFFER_LOAD_SBYTE">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short, i32, "BUFFER_LOAD_SSHORT">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ubyte, i32, "BUFFER_LOAD_UBYTE">;
@@ -1495,6 +1496,7 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, i32, "BUFFER_STORE_FORMAT_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3f32, "BUFFER_STORE_FORMAT_XYZ">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3i32, "BUFFER_STORE_FORMAT_XYZ">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
@@ -1521,18 +1523,22 @@ let OtherPredicates = [HasPackedD16VMem] in {
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i16, "BUFFER_STORE_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i32, "BUFFER_STORE_DWORD">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i16, "BUFFER_STORE_DWORD">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f16, "BUFFER_STORE_DWORD">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i16, "BUFFER_STORE_DWORDX2">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f16, "BUFFER_STORE_DWORDX2">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3f32, "BUFFER_STORE_DWORDX3">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3i32, "BUFFER_STORE_DWORDX3">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i32, "BUFFER_STORE_DWORDX4">;
+foreach vt = Reg32Types.types in {
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, vt, "BUFFER_STORE_DWORD">;
+}
+
+foreach vt = Reg64Types.types in {
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, vt, "BUFFER_STORE_DWORDX2">;
+}
+
+foreach vt = Reg96Types.types in {
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, vt, "BUFFER_STORE_DWORDX3">;
+}
+
+foreach vt = Reg128Types.types in {
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, vt, "BUFFER_STORE_DWORDX4">;
+}
+
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_byte, i32, "BUFFER_STORE_BYTE">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
@@ -1545,7 +1551,7 @@ multiclass BufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst, bi
defvar Op = !cast<SDPatternOperator>(OpPrefix
# !if(!eq(RtnMode, "ret"), "", "_noret")
- # !if(isIntr, "", "_" # vt.Size));
+ # !if(isIntr, "", "_" # vt));
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in {
@@ -1582,7 +1588,7 @@ multiclass BufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string
defvar Op = !cast<SDPatternOperator>("AMDGPUatomic_cmp_swap_global"
# !if(!eq(RtnMode, "ret"), "", "_noret")
- # "_" # vt.Size);
+ # "_" # vt);
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
defvar data_vt_RC = getVregSrcForVT<data_vt>.ret.RegClass;
@@ -1641,6 +1647,16 @@ defm : BufferAtomicPat<"atomic_load_udec_wrap_global", Ty, "BUFFER_ATOMIC_DEC" #
} // end foreach Ty
+let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in {
+defm : BufferAtomicPat<"atomic_load_fmin_global", f32, "BUFFER_ATOMIC_FMIN">;
+defm : BufferAtomicPat<"atomic_load_fmax_global", f32, "BUFFER_ATOMIC_FMAX">;
+}
+
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
+defm : BufferAtomicPat<"atomic_load_fmin_global", f64, "BUFFER_ATOMIC_MIN_F64">;
+defm : BufferAtomicPat<"atomic_load_fmax_global", f64, "BUFFER_ATOMIC_MAX_F64">;
+}
+
defm : BufferAtomicCmpSwapPat<i32, v2i32, "BUFFER_ATOMIC_CMPSWAP">;
defm : BufferAtomicCmpSwapPat<i64, v2i64, "BUFFER_ATOMIC_CMPSWAP_X2">;
@@ -1695,9 +1711,11 @@ multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst,
multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst,
list<string> RtnModes = ["ret", "noret"]> {
- let SubtargetPredicate = HasUnrestrictedSOffset in {
+ let OtherPredicates = [HasUnrestrictedSOffset] in {
defm : SIBufferAtomicPat_Common<OpPrefix, vt, Inst, RtnModes>;
}
+
+ // FIXME: This needs a !HasUnrestrictedSOffset predicate
defm : SIBufferAtomicPat_Common<OpPrefix, vt, Inst # "_VBUFFER", RtnModes>;
}
@@ -1728,24 +1746,29 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_xor", i64, "BUFFER_ATOMIC_XOR_X2">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_inc", i64, "BUFFER_ATOMIC_INC_X2">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, "BUFFER_ATOMIC_DEC_X2">;
-let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+let SubtargetPredicate = HasAtomicCSubNoRtnInsts in
defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["noret"]>;
+let SubtargetPredicate = HasAtomicBufferPkAddBF16Inst in {
+ defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2bf16, "BUFFER_ATOMIC_PK_ADD_BF16">;
+}
+
let SubtargetPredicate = isGFX12Plus in {
- defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd_bf16", v2bf16, "BUFFER_ATOMIC_PK_ADD_BF16_VBUFFER">;
defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["ret"]>;
+}
- let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
- defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["noret"]>;
+let SubtargetPredicate = HasAtomicCSubNoRtnInsts in {
+defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["noret"]>;
}
-let OtherPredicates = [isGFX6GFX7GFX10Plus] in {
+let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in {
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f32, "BUFFER_ATOMIC_FMIN">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">;
}
-let SubtargetPredicate = isGFX6GFX7GFX10 in {
- defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_FMIN_X2">;
- defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_FMAX_X2">;
+
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
+ defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">;
+ defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">;
}
class NoUseBufferAtomic<SDPatternOperator Op, ValueType vt> : PatFrag <
@@ -1799,33 +1822,28 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
defm : BufferAtomicPatterns_NO_RTN_Common<name, vt, opcode # "_VBUFFER">;
}
-let OtherPredicates = [HasAtomicFaddNoRtnInsts] in
+let SubtargetPredicate = HasAtomicFaddNoRtnInsts in
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["noret"]>;
-let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in {
- let SubtargetPredicate = isGFX9Only in
- defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>;
-
- let SubtargetPredicate = isGFX12Plus in
- defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16_VBUFFER", ["noret"]>;
-} // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts]
+let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in {
+ defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>;
+} // End SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts
-let OtherPredicates = [HasAtomicFaddRtnInsts] in
+let SubtargetPredicate = HasAtomicFaddRtnInsts in
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["ret"]>;
-let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
- let SubtargetPredicate = isGFX9Only in
- defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>;
-
- let SubtargetPredicate = isGFX12Plus in
- defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16_VBUFFER", ["ret"]>;
-} // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts]
+let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in {
+ defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>;
+} // End SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts
-let OtherPredicates = [HasBufferFlatGlobalAtomicsF64] in {
+let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">;
+} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">;
-} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+} //End let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts
multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string Inst> {
foreach RtnMode = ["ret", "noret"] in {
@@ -1897,7 +1915,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
}
multiclass SIBufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> {
- let SubtargetPredicate = HasUnrestrictedSOffset in {
+ let OtherPredicates = [HasUnrestrictedSOffset] in {
defm : SIBufferAtomicCmpSwapPat_Common<vt, data_vt, Inst>;
}
defm : SIBufferAtomicCmpSwapPat_Common<vt, data_vt, Inst # "_VBUFFER">;
@@ -1948,7 +1966,7 @@ multiclass MUBUFLoad_PatternOffset_Common <string Instr, ValueType vt,
multiclass MUBUFLoad_PatternOffset <string Instr, ValueType vt,
PatFrag ld> {
- let SubtargetPredicate = HasUnrestrictedSOffset in {
+ let OtherPredicates = [HasUnrestrictedSOffset] in {
defm : MUBUFLoad_PatternOffset_Common<Instr, vt, ld>;
}
defm : MUBUFLoad_PatternOffset_Common<Instr # "_VBUFFER", vt, ld>;
@@ -2189,7 +2207,7 @@ multiclass MTBUF_LoadIntrinsicPat_Common<SDPatternOperator name, ValueType vt,
multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode, ValueType memoryVt = vt> {
- let SubtargetPredicate = HasUnrestrictedSOffset in {
+ let OtherPredicates = [HasUnrestrictedSOffset] in {
defm : MTBUF_LoadIntrinsicPat_Common<name, vt, opcode, memoryVt>;
}
defm : MTBUF_LoadIntrinsicPat_Common<name, vt, opcode # "_VBUFFER", memoryVt>;
@@ -2204,7 +2222,7 @@ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v3f32, "TBUFFER_LOAD_FORMAT_XYZ">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">;
-let OtherPredicates = [HasUnpackedD16VMem] in {
+let SubtargetPredicate = HasUnpackedD16VMem in {
defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XY_gfx80">;
@@ -2212,7 +2230,7 @@ let OtherPredicates = [HasUnpackedD16VMem] in {
defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, v4i32, "TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
-let OtherPredicates = [HasPackedD16VMem] in {
+let SubtargetPredicate = HasPackedD16VMem in {
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">;
@@ -2261,7 +2279,7 @@ multiclass MTBUF_StoreIntrinsicPat_Common<SDPatternOperator name, ValueType vt,
multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode, ValueType memoryVt = vt> {
- let SubtargetPredicate = HasUnrestrictedSOffset in {
+ let OtherPredicates = [HasUnrestrictedSOffset] in {
defm : MTBUF_StoreIntrinsicPat_Common<name, vt, opcode, memoryVt>;
}
defm : MTBUF_StoreIntrinsicPat_Common<name, vt, opcode # "_VBUFFER", memoryVt>;
@@ -2276,7 +2294,7 @@ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY"
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v3f32, "TBUFFER_STORE_FORMAT_XYZ">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">;
-let OtherPredicates = [HasUnpackedD16VMem] in {
+let SubtargetPredicate = HasUnpackedD16VMem in {
defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X_gfx80">;
defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X_gfx80">;
defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XY_gfx80">;
@@ -2284,7 +2302,7 @@ let OtherPredicates = [HasUnpackedD16VMem] in {
defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, v4i32, "TBUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
-let OtherPredicates = [HasPackedD16VMem] in {
+let SubtargetPredicate = HasPackedD16VMem in {
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2f16, "TBUFFER_STORE_FORMAT_D16_XY">;
@@ -2296,6 +2314,12 @@ let OtherPredicates = [HasPackedD16VMem] in {
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
+// Shortcut to default Mnemonic from BUF_Pseudo. Hides the cast to the
+// specific pseudo (bothen in this case) since any of them will work.
+class get_BUF_ps<string name> {
+ string Mnemonic = !cast<BUF_Pseudo>(name # "_OFFSET").Mnemonic;
+}
+
//===----------------------------------------------------------------------===//
// Base ENC_MUBUF for GFX6, GFX7, GFX10, GFX11.
//===----------------------------------------------------------------------===//
@@ -2327,8 +2351,8 @@ multiclass MUBUF_Real_gfx11<bits<8> op, string real_name = !cast<MUBUF_Pseudo>(N
}
}
-class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef> :
- Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11<ps, ef> {
+class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef, string asmName> :
+ Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11<ps, ef, asmName> {
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value);
@@ -2338,9 +2362,10 @@ class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef> :
let Inst{55} = ps.tfe;
}
-multiclass MUBUF_Real_gfx10<bits<8> op> {
- defvar ps = !cast<MUBUF_Pseudo>(NAME);
- def _gfx10 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.GFX10> {
+multiclass MUBUF_Real_gfx10<bits<8> op, string psName = NAME,
+ string asmName = !cast<MUBUF_Pseudo>(psName).Mnemonic> {
+ defvar ps = !cast<MUBUF_Pseudo>(psName);
+ def _gfx10 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.GFX10, asmName> {
let Inst{15} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlc_value);
let Inst{25} = op{7};
let AssemblerPredicate = isGFX10Only;
@@ -2348,9 +2373,10 @@ multiclass MUBUF_Real_gfx10<bits<8> op> {
}
}
-multiclass MUBUF_Real_gfx6_gfx7<bits<8> op> {
- defvar ps = !cast<MUBUF_Pseudo>(NAME);
- def _gfx6_gfx7 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI> {
+multiclass MUBUF_Real_gfx6_gfx7<bits<8> op, string psName = NAME,
+ string asmName = !cast<MUBUF_Pseudo>(psName).Mnemonic> {
+ defvar ps = !cast<MUBUF_Pseudo>(psName);
+ def _gfx6_gfx7 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI, asmName> {
let Inst{15} = ps.addr64;
let AssemblerPredicate = isGFX6GFX7;
let DecoderNamespace = "GFX6GFX7";
@@ -2359,7 +2385,7 @@ multiclass MUBUF_Real_gfx6_gfx7<bits<8> op> {
multiclass MUBUF_Real_gfx6<bits<8> op> {
defvar ps = !cast<MUBUF_Pseudo>(NAME);
- def _gfx6 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI> {
+ def _gfx6 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI, ps.Mnemonic> {
let Inst{15} = ps.addr64;
let AssemblerPredicate = isGFX6;
let DecoderNamespace = "GFX6";
@@ -2368,7 +2394,7 @@ multiclass MUBUF_Real_gfx6<bits<8> op> {
multiclass MUBUF_Real_gfx7<bits<8> op> {
defvar ps = !cast<MUBUF_Pseudo>(NAME);
- def _gfx7 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI> {
+ def _gfx7 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI, ps.Mnemonic> {
let Inst{15} = ps.addr64;
let AssemblerPredicate = isGFX7Only;
let DecoderNamespace = "GFX7";
@@ -2445,9 +2471,15 @@ class VBUFFER_Real_gfx12<bits<8> op, BUF_Pseudo ps, string real_name> :
multiclass VBUFFER_MUBUF_Real_gfx12<bits<8> op, string real_name> {
defvar ps = !cast<MUBUF_Pseudo>(NAME);
def _gfx12 : VBUFFER_Real_gfx12<op, ps, real_name> {
- // Set the last bit of format to 1 to avoid round-trip issues, as some tools
+ // Set the format field to be 1 to avoid round-trip issues, as some tools
// print BUF_FMT_INVALID for format 0.
- let Inst{55} = 0b1;
+ let Inst{61-55} = 0b0000001;
+ }
+ // Have a version of the instruction to disassemble to for any other
+ // format field values.
+ def _gfx12_format : VBUFFER_Real<op, ps, real_name> {
+ let AsmVariantName = "NonParsable";
+ let DecoderNamespace = "GFX12";
}
}
@@ -2463,12 +2495,6 @@ multiclass VBUFFER_MTBUF_Real_gfx12<bits<4> op, string real_name> {
// MUBUF - GFX11, GFX12.
//===----------------------------------------------------------------------===//
-// Shortcut to default Mnemonic from BUF_Pseudo. Hides the cast to the
-// specific pseudo (bothen in this case) since any of them will work.
-class get_BUF_ps<string name> {
- string Mnemonic = !cast<BUF_Pseudo>(name # "_BOTHEN").Mnemonic;
-}
-
// gfx11 instruction that accept both old and new assembler name.
class Mnem_gfx11_gfx12 <string mnemonic, string real_name> :
AMDGPUMnemonicAlias<mnemonic, real_name> {
@@ -2690,18 +2716,20 @@ multiclass MUBUF_Real_AllAddr_Lds_gfx10<bits<8> op, bit isTFE = 0> {
defm _LDS_BOTHEN : MUBUF_Real_gfx10<op>;
}
}
-multiclass MUBUF_Real_Atomics_RTN_gfx10<bits<8> op> {
- defm _BOTHEN_RTN : MUBUF_Real_gfx10<op>;
- defm _IDXEN_RTN : MUBUF_Real_gfx10<op>;
- defm _OFFEN_RTN : MUBUF_Real_gfx10<op>;
- defm _OFFSET_RTN : MUBUF_Real_gfx10<op>;
+multiclass MUBUF_Real_Atomics_RTN_gfx10<bits<8> op, string psName = NAME,
+ string asmName = !cast<MUBUF_Pseudo>(psName).Mnemonic> {
+ defm _BOTHEN_RTN : MUBUF_Real_gfx10<op, psName#"_BOTHEN_RTN", asmName>;
+ defm _IDXEN_RTN : MUBUF_Real_gfx10<op, psName#"_IDXEN_RTN", asmName>;
+ defm _OFFEN_RTN : MUBUF_Real_gfx10<op, psName#"_OFFEN_RTN", asmName>;
+ defm _OFFSET_RTN : MUBUF_Real_gfx10<op, psName#"_OFFSET_RTN", asmName>;
}
-multiclass MUBUF_Real_Atomics_gfx10<bits<8> op> :
- MUBUF_Real_Atomics_RTN_gfx10<op> {
- defm _BOTHEN : MUBUF_Real_gfx10<op>;
- defm _IDXEN : MUBUF_Real_gfx10<op>;
- defm _OFFEN : MUBUF_Real_gfx10<op>;
- defm _OFFSET : MUBUF_Real_gfx10<op>;
+multiclass MUBUF_Real_Atomics_gfx10<bits<8> op, string psName = NAME,
+ string asmName = get_BUF_ps<psName>.Mnemonic> :
+ MUBUF_Real_Atomics_RTN_gfx10<op, psName, asmName> {
+ defm _BOTHEN : MUBUF_Real_gfx10<op, psName#"_BOTHEN", asmName>;
+ defm _IDXEN : MUBUF_Real_gfx10<op, psName#"_IDXEN", asmName>;
+ defm _OFFEN : MUBUF_Real_gfx10<op, psName#"_OFFEN", asmName>;
+ defm _OFFSET : MUBUF_Real_gfx10<op, psName#"_OFFSET", asmName>;
}
defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x019>;
@@ -2756,18 +2784,18 @@ multiclass MUBUF_Real_AllAddr_Lds_gfx6_gfx7<bits<8> op, bit isTFE = 0> {
defm _LDS_BOTHEN : MUBUF_Real_gfx6_gfx7<op>;
}
}
-multiclass MUBUF_Real_Atomics_gfx6_gfx7<bits<8> op> {
- defm _ADDR64 : MUBUF_Real_gfx6_gfx7<op>;
- defm _BOTHEN : MUBUF_Real_gfx6_gfx7<op>;
- defm _IDXEN : MUBUF_Real_gfx6_gfx7<op>;
- defm _OFFEN : MUBUF_Real_gfx6_gfx7<op>;
- defm _OFFSET : MUBUF_Real_gfx6_gfx7<op>;
+multiclass MUBUF_Real_Atomics_gfx6_gfx7<bits<8> op, string psName, string asmName> {
+ defm _ADDR64 : MUBUF_Real_gfx6_gfx7<op, psName#"_ADDR64", asmName>;
+ defm _BOTHEN : MUBUF_Real_gfx6_gfx7<op, psName#"_BOTHEN", asmName>;
+ defm _IDXEN : MUBUF_Real_gfx6_gfx7<op, psName#"_IDXEN", asmName>;
+ defm _OFFEN : MUBUF_Real_gfx6_gfx7<op, psName#"_OFFEN", asmName>;
+ defm _OFFSET : MUBUF_Real_gfx6_gfx7<op, psName#"_OFFSET", asmName>;
- defm _ADDR64_RTN : MUBUF_Real_gfx6_gfx7<op>;
- defm _BOTHEN_RTN : MUBUF_Real_gfx6_gfx7<op>;
- defm _IDXEN_RTN : MUBUF_Real_gfx6_gfx7<op>;
- defm _OFFEN_RTN : MUBUF_Real_gfx6_gfx7<op>;
- defm _OFFSET_RTN : MUBUF_Real_gfx6_gfx7<op>;
+ defm _ADDR64_RTN : MUBUF_Real_gfx6_gfx7<op, psName#"_ADDR64_RTN", asmName>;
+ defm _BOTHEN_RTN : MUBUF_Real_gfx6_gfx7<op, psName#"_BOTHEN_RTN", asmName>;
+ defm _IDXEN_RTN : MUBUF_Real_gfx6_gfx7<op, psName#"_IDXEN_RTN", asmName>;
+ defm _OFFEN_RTN : MUBUF_Real_gfx6_gfx7<op, psName#"_OFFEN_RTN", asmName>;
+ defm _OFFSET_RTN : MUBUF_Real_gfx6_gfx7<op, psName#"_OFFSET_RTN", asmName>;
}
multiclass MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<bits<8> op> :
@@ -2782,8 +2810,10 @@ multiclass MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<bits<8> op> {
defm _TFE : MUBUF_Real_AllAddr_Lds_Helper_gfx6_gfx7_gfx10<op, 1>;
}
-multiclass MUBUF_Real_Atomics_gfx6_gfx7_gfx10<bits<8> op> :
- MUBUF_Real_Atomics_gfx6_gfx7<op>, MUBUF_Real_Atomics_gfx10<op>;
+multiclass MUBUF_Real_Atomics_gfx6_gfx7_gfx10<bits<8> op, string psName = NAME,
+ string asmName = get_BUF_ps<psName>.Mnemonic> :
+ MUBUF_Real_Atomics_gfx6_gfx7<op, psName, asmName>,
+ MUBUF_Real_Atomics_gfx10<op, psName, asmName>;
// FIXME-GFX6: Following instructions are available only on GFX6.
//defm BUFFER_ATOMIC_RSUB : MUBUF_Real_Atomics_gfx6 <0x034>;
@@ -2843,8 +2873,8 @@ defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05c>;
defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05d>;
// FIXME-GFX7: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on GFX7.
defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>;
-defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>;
-defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>;
+defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f, "BUFFER_ATOMIC_MIN_F64", "buffer_atomic_fmin_x2">;
+defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060, "BUFFER_ATOMIC_MAX_F64", "buffer_atomic_fmax_x2">;
defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomics_gfx10<0x034>;
@@ -3066,9 +3096,9 @@ multiclass MUBUF_Real_vi_gfx90a<bits<7> op, bit isTFE = 0> : MUBUF_Real_vi<op> {
}
if ps.FPAtomic then {
- let SubtargetPredicate = isGFX90AOnly,
- AssemblerPredicate = isGFX90AOnly in
- defm NAME : MUBUF_Real_gfx90a<op, 0>;
+ let AssemblerPredicate = isGFX90AOnly in
+ defm NAME : MUBUF_Real_gfx90a<op, 0>;
+
def _gfx940 : MUBUF_Real_gfx940<op, ps>;
}
}
@@ -3251,10 +3281,7 @@ defm BUFFER_WBINVL1_VOL : MUBUF_Real_vi <0x3f>;
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_vi <0x4e>;
-
-let SubtargetPredicate = HasAtomicFaddNoRtnInsts in {
defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_vi <0x4d>;
-} // End SubtargetPredicate = HasAtomicFaddNoRtnInsts
let SubtargetPredicate = isGFX90APlus in {
defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_vi<0x4f>;
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 19bb4300531c..219246b71fe8 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -965,16 +965,16 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">;
multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
- def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>;
+ def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt)>;
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_"#vt)>;
}
let OtherPredicates = [HasGDS] in {
- def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
+ def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt),
/* complexity */ 0, /* gds */ 1>;
}
}
@@ -983,24 +983,24 @@ multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
def : DSAtomicRetPat<inst, vt,
- !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_m0_"#vt)>;
def : DSAtomicRetPat<noRetInst, vt,
- !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size), /* complexity */ 1>;
+ !cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>;
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_"#vt)>;
def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>;
+ !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>;
}
let OtherPredicates = [HasGDS] in {
def : DSAtomicRetPat<inst, vt,
- !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
+ !cast<PatFrag>(frag#"_region_m0_"#vt),
/* complexity */ 0, /* gds */ 1>;
def : DSAtomicRetPat<noRetInst, vt,
- !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size),
+ !cast<PatFrag>(frag#"_region_m0_noret_"#vt),
/* complexity */ 1, /* gds */ 1>;
}
}
@@ -1019,23 +1019,23 @@ class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag,
multiclass DSAtomicCmpXChgSwapped_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt,
string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
- def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>;
- def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size),
+ def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt)>;
+ def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_local_m0_noret_"#vt),
/* complexity */ 1>;
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_"#vt)>;
def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_noret_"#vt.Size),
+ !cast<PatFrag>(frag#"_local_noret_"#vt),
/* complexity */ 1>;
}
let OtherPredicates = [HasGDS] in {
- def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
+ def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt),
/* complexity */ 0, /* gds */ 1>;
- def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size),
+ def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt),
/* complexity */ 1, /* gds */ 1>;
}
}
@@ -1053,14 +1053,14 @@ class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag,
multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, string frag> {
def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_"#vt)>;
def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>;
+ !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>;
let OtherPredicates = [HasGDS] in {
- def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
+ def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt),
/* complexity */ 0, /* gds */ 1>;
- def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size),
+ def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt),
/* complexity */ 1, /* gds */ 1>;
}
}
@@ -1082,6 +1082,12 @@ defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_U32, DS_MAX_U32, i32, "atomic_load_umax
defm : DSAtomicRetNoRetPat_mc<DS_MIN_RTN_F32, DS_MIN_F32, f32, "atomic_load_fmin">;
defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_F32, DS_MAX_F32, f32, "atomic_load_fmax">;
+
+let SubtargetPredicate = HasAtomicDsPkAdd16Insts in {
+defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">;
+defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">;
+}
+
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
defm : DSAtomicCmpXChgSwapped_mc<DS_CMPST_RTN_B32, DS_CMPST_B32, i32, "atomic_cmp_swap">;
}
@@ -1119,9 +1125,9 @@ defm : DSAtomicCmpXChg_mc<DS_CMPSTORE_RTN_B64, DS_CMPSTORE_B64, i64, "atomic_cmp
} // End SubtargetPredicate = isGFX11Plus
let SubtargetPredicate = HasLdsAtomicAddF64 in {
-def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_64>;
+def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_f64>;
let AddedComplexity = 1 in
-def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_64>;
+def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_f64>;
class DSAtomicRetPatIntrinsic<DS_Pseudo inst, ValueType vt, PatFrag frag,
bit gds=0> : GCNPat <
@@ -1135,18 +1141,7 @@ def : DSAtomicRetPatIntrinsic<DS_ADD_F64, f64, int_amdgcn_flat_atomic_fadd_noret
}
let SubtargetPredicate = HasAtomicDsPkAdd16Insts in {
-def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_32>;
-let AddedComplexity = 1 in
-def : DSAtomicRetPat<DS_PK_ADD_F16, v2f16, atomic_load_fadd_v2f16_local_noret_32>;
-def : GCNPat <
- (v2i16 (int_amdgcn_ds_fadd_v2bf16 i32:$ptr, v2i16:$src)),
- (DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
->;
-let AddedComplexity = 1 in
-def : GCNPat <
- (v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)),
- (DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
->;
+defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">;
} // End SubtargetPredicate = HasAtomicDsPkAdd16Insts
let OtherPredicates = [HasGDS] in
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 05063c6c321a..76a559c9443b 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -21,6 +21,7 @@
#include "SIDefines.h"
#include "SIRegisterInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
+#include "Utils/AMDGPUAsmUtils.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm-c/DisassemblerTypes.h"
#include "llvm/BinaryFormat/ELF.h"
@@ -52,6 +53,13 @@ AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI,
// ToDo: AMDGPUDisassembler supports only VI ISA.
if (!STI.hasFeature(AMDGPU::FeatureGCN3Encoding) && !isGFX10Plus())
report_fatal_error("Disassembly not yet supported for subtarget");
+
+ for (auto [Symbol, Code] : AMDGPU::UCVersion::getGFXVersions())
+ createConstantSymbolExpr(Symbol, Code);
+
+ UCVersionW64Expr = createConstantSymbolExpr("UC_VERSION_W64_BIT", 0x2000);
+ UCVersionW32Expr = createConstantSymbolExpr("UC_VERSION_W32_BIT", 0x4000);
+ UCVersionMDPExpr = createConstantSymbolExpr("UC_VERSION_MDP_BIT", 0x8000);
}
void AMDGPUDisassembler::setABIVersion(unsigned Version) {
@@ -421,6 +429,13 @@ DECODE_SDWA(Src32)
DECODE_SDWA(Src16)
DECODE_SDWA(VopcDst)
+static DecodeStatus decodeVersionImm(MCInst &Inst, unsigned Imm,
+ uint64_t /* Addr */,
+ const MCDisassembler *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+ return addOperand(Inst, DAsm->decodeVersionImm(Imm));
+}
+
#include "AMDGPUGenDisassemblerTables.inc"
//===----------------------------------------------------------------------===//
@@ -1727,6 +1742,41 @@ MCOperand AMDGPUDisassembler::decodeDpp8FI(unsigned Val) const {
return MCOperand::createImm(Val);
}
+MCOperand AMDGPUDisassembler::decodeVersionImm(unsigned Imm) const {
+ using VersionField = AMDGPU::EncodingField<7, 0>;
+ using W64Bit = AMDGPU::EncodingBit<13>;
+ using W32Bit = AMDGPU::EncodingBit<14>;
+ using MDPBit = AMDGPU::EncodingBit<15>;
+ using Encoding = AMDGPU::EncodingFields<VersionField, W64Bit, W32Bit, MDPBit>;
+
+ auto [Version, W64, W32, MDP] = Encoding::decode(Imm);
+
+ // Decode into a plain immediate if any unused bits are raised.
+ if (Encoding::encode(Version, W64, W32, MDP) != Imm)
+ return MCOperand::createImm(Imm);
+
+ const auto &Versions = AMDGPU::UCVersion::getGFXVersions();
+ auto I = find_if(Versions,
+ [Version = Version](const AMDGPU::UCVersion::GFXVersion &V) {
+ return V.Code == Version;
+ });
+ MCContext &Ctx = getContext();
+ const MCExpr *E;
+ if (I == Versions.end())
+ E = MCConstantExpr::create(Version, Ctx);
+ else
+ E = MCSymbolRefExpr::create(Ctx.getOrCreateSymbol(I->Symbol), Ctx);
+
+ if (W64)
+ E = MCBinaryExpr::createOr(E, UCVersionW64Expr, Ctx);
+ if (W32)
+ E = MCBinaryExpr::createOr(E, UCVersionW32Expr, Ctx);
+ if (MDP)
+ E = MCBinaryExpr::createOr(E, UCVersionMDPExpr, Ctx);
+
+ return MCOperand::createExpr(E);
+}
+
bool AMDGPUDisassembler::isVI() const {
return STI.hasFeature(AMDGPU::FeatureVolcanicIslands);
}
@@ -2312,6 +2362,15 @@ Expected<bool> AMDGPUDisassembler::onSymbolStart(SymbolInfoTy &Symbol,
return false;
}
+const MCExpr *AMDGPUDisassembler::createConstantSymbolExpr(StringRef Id,
+ int64_t Val) {
+ MCContext &Ctx = getContext();
+ MCSymbol *Sym = Ctx.getOrCreateSymbol(Id);
+ assert(!Sym->isVariable());
+ Sym->setVariableValue(MCConstantExpr::create(Val, Ctx));
+ return MCSymbolRefExpr::create(Sym, Ctx);
+}
+
//===----------------------------------------------------------------------===//
// AMDGPUSymbolizer
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 2061d83af3da..694cd7a9bfd2 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -102,6 +102,11 @@ private:
mutable bool HasLiteral;
mutable std::optional<bool> EnableWavefrontSize32;
unsigned CodeObjectVersion;
+ const MCExpr *UCVersionW64Expr;
+ const MCExpr *UCVersionW32Expr;
+ const MCExpr *UCVersionMDPExpr;
+
+ const MCExpr *createConstantSymbolExpr(StringRef Id, int64_t Val);
public:
AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
@@ -264,6 +269,8 @@ public:
MCOperand decodeSplitBarrier(unsigned Val) const;
MCOperand decodeDpp8FI(unsigned Val) const;
+ MCOperand decodeVersionImm(unsigned Imm) const;
+
int getTTmpIdx(unsigned Val) const;
const MCInstrInfo *getMCII() const { return MCII.get(); }
diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index 3767dd0b6d47..280def5440c8 100644
--- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -322,25 +322,25 @@ def : EGOrCaymanPat<(i32 (atomic_cmp_swap_global_noret i32:$ptr, i32:$cmp, i32:$
$ptr), sub1)>;
defm AtomicSwapPat : AtomicPat <RAT_ATOMIC_XCHG_INT_NORET,
- atomic_swap_global_noret_32>;
+ atomic_swap_global_noret_i32>;
defm AtomicAddPat : AtomicPat <RAT_ATOMIC_ADD_NORET,
- atomic_load_add_global_noret_32>;
+ atomic_load_add_global_noret_i32>;
defm AtomicSubPat : AtomicPat <RAT_ATOMIC_SUB_NORET,
- atomic_load_sub_global_noret_32>;
+ atomic_load_sub_global_noret_i32>;
defm AtomicMinPat : AtomicPat <RAT_ATOMIC_MIN_INT_NORET,
- atomic_load_min_global_noret_32>;
+ atomic_load_min_global_noret_i32>;
defm AtomicUMinPat : AtomicPat <RAT_ATOMIC_MIN_UINT_NORET,
- atomic_load_umin_global_noret_32>;
+ atomic_load_umin_global_noret_i32>;
defm AtomicMaxPat : AtomicPat <RAT_ATOMIC_MAX_INT_NORET,
- atomic_load_max_global_noret_32>;
+ atomic_load_max_global_noret_i32>;
defm AtomicUMaxPat : AtomicPat <RAT_ATOMIC_MAX_UINT_NORET,
- atomic_load_umax_global_noret_32>;
+ atomic_load_umax_global_noret_i32>;
defm AtomicAndPat : AtomicPat <RAT_ATOMIC_AND_NORET,
- atomic_load_and_global_noret_32>;
+ atomic_load_and_global_noret_i32>;
defm AtomicOrPat : AtomicPat <RAT_ATOMIC_OR_NORET,
- atomic_load_or_global_noret_32>;
+ atomic_load_or_global_noret_i32>;
defm AtomicXorPat : AtomicPat <RAT_ATOMIC_XOR_NORET,
- atomic_load_xor_global_noret_32>;
+ atomic_load_xor_global_noret_i32>;
// Should be predicated on FeatureFP64
// def FMA_64 : R600_3OP <
@@ -712,37 +712,37 @@ def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE",
[(truncstorei16_local i32:$src1, i32:$src0)]
>;
def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD",
- [(set i32:$dst, (atomic_load_add_local_32 i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_add_local_i32 i32:$src0, i32:$src1))]
>;
def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB",
- [(set i32:$dst, (atomic_load_sub_local_32 i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_sub_local_i32 i32:$src0, i32:$src1))]
>;
def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND",
- [(set i32:$dst, (atomic_load_and_local_32 i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_and_local_i32 i32:$src0, i32:$src1))]
>;
def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR",
- [(set i32:$dst, (atomic_load_or_local_32 i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_or_local_i32 i32:$src0, i32:$src1))]
>;
def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR",
- [(set i32:$dst, (atomic_load_xor_local_32 i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_xor_local_i32 i32:$src0, i32:$src1))]
>;
def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT",
- [(set i32:$dst, (atomic_load_min_local_32 i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_min_local_i32 i32:$src0, i32:$src1))]
>;
def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT",
- [(set i32:$dst, (atomic_load_max_local_32 i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_max_local_i32 i32:$src0, i32:$src1))]
>;
def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT",
- [(set i32:$dst, (atomic_load_umin_local_32 i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_umin_local_i32 i32:$src0, i32:$src1))]
>;
def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT",
- [(set i32:$dst, (atomic_load_umax_local_32 i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_umax_local_i32 i32:$src0, i32:$src1))]
>;
def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG",
- [(set i32:$dst, (atomic_swap_local_32 i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_swap_local_i32 i32:$src0, i32:$src1))]
>;
def LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST",
- [(set i32:$dst, (atomic_cmp_swap_local_32 i32:$src0, i32:$src1, i32:$src2))]
+ [(set i32:$dst, (atomic_cmp_swap_local_i32 i32:$src0, i32:$src1, i32:$src2))]
>;
def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET",
[(set (i32 R600_Reg32:$dst), (load_local R600_Reg32:$src0))]
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index aab19b8adc27..98054dde398b 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -752,25 +752,29 @@ defm FLAT_ATOMIC_DEC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_dec_x2",
// GFX7-, GFX10-only flat instructions.
let SubtargetPredicate = isGFX7GFX10 in {
-
defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap_x2",
VReg_64, f64, v2f64, VReg_128>;
+} // End SubtargetPredicate = isGFX7GFX10
-defm FLAT_ATOMIC_FMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmin_x2",
- VReg_64, f64>;
-defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
- VReg_64, f64>;
+// The names may be flat_atomic_fmin_x2 on some subtargets, but we
+// choose this as the canonical name.
+let SubtargetPredicate = HasAtomicFMinFMaxF64FlatInsts in {
+defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo <"flat_atomic_min_f64",
+ VReg_64, f64>;
-} // End SubtargetPredicate = isGFX7GFX10
+defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo <"flat_atomic_max_f64",
+ VReg_64, f64>;
+}
+
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
+defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64>;
+defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>;
+}
let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64>;
- defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64>;
- defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64>;
defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64>;
- defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64>;
- defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>;
} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
let SubtargetPredicate = HasAtomicFlatPkAdd16Insts in {
@@ -972,6 +976,15 @@ defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_s
defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">;
let SubtargetPredicate = isGFX12Plus in {
+ let Uses = [EXEC, M0] in {
+ defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VReg_1024>;
+ defm GLOBAL_STORE_BLOCK : FLAT_Global_Store_Pseudo <"global_store_block", VReg_1024>;
+ }
+ let Uses = [EXEC, FLAT_SCR, M0] in {
+ defm SCRATCH_LOAD_BLOCK : FLAT_Scratch_Load_Pseudo <"scratch_load_block", VReg_1024>;
+ defm SCRATCH_STORE_BLOCK : FLAT_Scratch_Store_Pseudo <"scratch_store_block", VReg_1024>;
+ }
+
let WaveSizePredicate = isWave32 in {
let Mnemonic = "global_load_tr_b128" in
defm GLOBAL_LOAD_TR_B128_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w32", VReg_128>;
@@ -995,10 +1008,6 @@ let SubtargetPredicate = isGFX10Plus in {
FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32>;
defm GLOBAL_ATOMIC_FCMPSWAP_X2 :
FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64, v2f64, VReg_128>;
- defm GLOBAL_ATOMIC_FMIN_X2 :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64>;
- defm GLOBAL_ATOMIC_FMAX_X2 :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>;
} // End SubtargetPredicate = isGFX10Plus
let OtherPredicates = [HasAtomicFaddNoRtnInsts] in
@@ -1105,7 +1114,7 @@ multiclass FlatAtomicNoRtnPatWithAddrSpace<string inst, string node, string addr
multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> :
- FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt.Size), vt, data_vt>;
+ FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt), vt, data_vt>;
multiclass FlatAtomicRtnPatBase <string inst, string node, ValueType vt,
@@ -1123,7 +1132,7 @@ multiclass FlatAtomicRtnPatWithAddrSpace<string inst, string intr, string addrSp
multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> :
- FlatAtomicRtnPatBase<inst, node # !if(isIntr, "", "_"#vt.Size), vt, data_vt>;
+ FlatAtomicRtnPatBase<inst, node # !if(isIntr, "", "_"#vt), vt, data_vt>;
multiclass FlatAtomicPat <string inst, string node, ValueType vt,
@@ -1155,8 +1164,8 @@ class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node,
multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt,
ValueType data_vt = vt, int complexity = 0,
bit isIntr = 0> {
- defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_" # vt.Size));
- defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
+ defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_" # vt));
+ defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt));
let AddedComplexity = complexity in
def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst#"_RTN"), rtnNode, vt, data_vt>;
@@ -1165,21 +1174,6 @@ multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt,
def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), noRtnNode, vt, data_vt>;
}
-multiclass FlatSignedAtomicIntrPat <string inst, string node, ValueType vt,
- ValueType data_vt = vt> {
- defm : FlatSignedAtomicPat<inst, node, vt, data_vt, /* complexity */ 0, /* isIntr */ 1>;
-}
-
-multiclass FlatSignedAtomicPatWithAddrSpace<string inst, string intr, string addrSpaceSuffix,
- ValueType vt, ValueType data_vt = vt> {
- defvar noRtnNode = !cast<PatFrags>(intr # "_noret_" # addrSpaceSuffix);
- defvar rtnNode = !cast<PatFrags>(intr # "_" # addrSpaceSuffix);
-
- let AddedComplexity = 1 in
- def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), noRtnNode, vt, data_vt>;
- def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst#"_RTN"), rtnNode, vt, data_vt>;
-}
-
class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))),
(inst $vaddr, $offset)
@@ -1280,11 +1274,11 @@ multiclass GlobalFLATAtomicPatsRtnBase<string inst, string node, ValueType vt,
multiclass GlobalFLATAtomicPatsNoRtn<string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> :
- GlobalFLATAtomicPatsNoRtnBase<inst, node # "_noret" # !if(isIntr, "", "_" # vt.Size), vt, data_vt>;
+ GlobalFLATAtomicPatsNoRtnBase<inst, node # "_noret" # !if(isIntr, "", "_" # vt), vt, data_vt>;
multiclass GlobalFLATAtomicPatsRtn<string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> :
- GlobalFLATAtomicPatsRtnBase<inst, node # !if(isIntr, "", "_" # vt.Size), vt, data_vt>;
+ GlobalFLATAtomicPatsRtnBase<inst, node # !if(isIntr, "", "_" # vt), vt, data_vt>;
multiclass GlobalFLATAtomicPats<string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> :
@@ -1431,6 +1425,17 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_OR_X2", "atomic_load_or_"#as, i64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_SWAP_X2", "atomic_swap_"#as, i64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_"#as, i64, v2i64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_XOR_X2", "atomic_load_xor_"#as, i64>;
+
+let SubtargetPredicate = HasAtomicFMinFMaxF32FlatInsts in {
+defm : FlatAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_"#as, f32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_"#as, f32>;
+}
+
+let SubtargetPredicate = HasAtomicFMinFMaxF64FlatInsts in {
+defm : FlatAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_"#as, f64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
+}
+
} // end foreach as
let SubtargetPredicate = isGFX12Plus in {
@@ -1592,37 +1597,26 @@ let OtherPredicates = [isGFX12Plus] in {
}
}
-let OtherPredicates = [isGFX10Plus] in {
+let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>;
-}
-
-let OtherPredicates = [isGFX10GFX11] in {
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;
-
-defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin", f32>;
-defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax", f32>;
}
-let OtherPredicates = [isGFX10Only] in {
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", "atomic_load_fmin_global", f64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", "atomic_load_fmax_global", f64>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN_X2", "int_amdgcn_global_atomic_fmin", f64>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX_X2", "int_amdgcn_global_atomic_fmax", f64>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN_X2", "atomic_load_fmin_flat", f64>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX_X2", "atomic_load_fmax_flat", f64>;
-defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN_X2", "int_amdgcn_flat_atomic_fmin", f64>;
-defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX_X2", "int_amdgcn_flat_atomic_fmax", f64>;
+let SubtargetPredicate = HasAtomicFMinFMaxF32FlatInsts in {
+defm : FlatAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>;
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin", f32>;
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax", f32>;
}
let OtherPredicates = [isGFX12Only] in {
+ // FIXME: Remove these intrinsics
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>;
- defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>;
- defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>;
+ defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>;
+ defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>;
}
let OtherPredicates = [HasAtomicFaddNoRtnInsts] in {
@@ -1645,37 +1639,44 @@ defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgc
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
}
-let OtherPredicates = [HasBufferFlatGlobalAtomicsF64] in {
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>;
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>;
-defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f64>;
-defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", "global_addrspace", f64>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_flat", f64>;
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_flat", f64>;
-defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", f64>;
-defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_MIN_F64", "int_amdgcn_flat_atomic_fmin", f64>;
-defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>;
+}
+
+let SubtargetPredicate = HasAtomicFMinFMaxF64FlatInsts in {
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MIN_F64", "int_amdgcn_flat_atomic_fmin", f64>;
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>;
+}
+
+let OtherPredicates = [HasBufferFlatGlobalAtomicsF64] in {
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>;
+defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f64>;
+defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", "global_addrspace", f64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>;
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", f64>;
}
let OtherPredicates = [HasFlatAtomicFaddF32Inst] in {
-defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>;
-defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", f32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>;
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>;
}
let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in {
-defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", v2f16>;
-defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>;
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>;
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>;
}
let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>;
-
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>;
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {
@@ -1745,8 +1746,8 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f
// CI
//===----------------------------------------------------------------------===//
-class FLAT_Real_ci <bits<7> op, FLAT_Pseudo ps> :
- FLAT_Real <op, ps>,
+class FLAT_Real_ci <bits<7> op, FLAT_Pseudo ps, string asmName = ps.Mnemonic> :
+ FLAT_Real <op, ps, asmName>,
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SI> {
let AssemblerPredicate = isGFX7Only;
let DecoderNamespace="GFX7";
@@ -1768,10 +1769,13 @@ def FLAT_STORE_DWORDX2_ci : FLAT_Real_ci <0x1d, FLAT_STORE_DWORDX2>;
def FLAT_STORE_DWORDX4_ci : FLAT_Real_ci <0x1e, FLAT_STORE_DWORDX4>;
def FLAT_STORE_DWORDX3_ci : FLAT_Real_ci <0x1f, FLAT_STORE_DWORDX3>;
-multiclass FLAT_Real_Atomics_ci <bits<7> op> {
- defvar ps = !cast<FLAT_Pseudo>(NAME);
- def _ci : FLAT_Real_ci<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>;
- def _RTN_ci : FLAT_Real_ci<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>;
+multiclass FLAT_Real_Atomics_ci <bits<7> op, string opName = NAME,
+ string asmName = !cast<FLAT_Pseudo>(opName).Mnemonic> {
+ defvar ps = !cast<FLAT_Pseudo>(opName);
+ defvar ps_rtn = !cast<FLAT_Pseudo>(opName#"_RTN");
+
+ def _ci : FLAT_Real_ci<op, ps, asmName>;
+ def _RTN_ci : FLAT_Real_ci<op, ps_rtn, asmName>;
}
defm FLAT_ATOMIC_SWAP : FLAT_Real_Atomics_ci <0x30>;
@@ -1806,8 +1810,8 @@ defm FLAT_ATOMIC_FCMPSWAP : FLAT_Real_Atomics_ci <0x3e>;
defm FLAT_ATOMIC_FMIN : FLAT_Real_Atomics_ci <0x3f>;
defm FLAT_ATOMIC_FMAX : FLAT_Real_Atomics_ci <0x40>;
defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Real_Atomics_ci <0x5e>;
-defm FLAT_ATOMIC_FMIN_X2 : FLAT_Real_Atomics_ci <0x5f>;
-defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_ci <0x60>;
+defm FLAT_ATOMIC_FMIN_X2 : FLAT_Real_Atomics_ci <0x5f, "FLAT_ATOMIC_MIN_F64", "flat_atomic_fmin_x2">;
+defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_ci <0x60, "FLAT_ATOMIC_MAX_F64", "flat_atomic_fmax_x2">;
//===----------------------------------------------------------------------===//
@@ -2089,8 +2093,8 @@ let SubtargetPredicate = isGFX940Plus in {
// GFX10.
//===----------------------------------------------------------------------===//
-class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps> :
- FLAT_Real<op, ps>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX10> {
+class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
+ FLAT_Real<op, ps, opName>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX10> {
let AssemblerPredicate = isGFX10Only;
let DecoderNamespace = "GFX10";
@@ -2102,25 +2106,28 @@ class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps> :
let Inst{55} = 0;
}
-
-multiclass FLAT_Real_Base_gfx10<bits<7> op> {
+multiclass FLAT_Real_Base_gfx10<bits<7> op, string psName = NAME,
+ string asmName = !cast<FLAT_Pseudo>(psName).Mnemonic> {
def _gfx10 :
- FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME)>;
+ FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(psName), asmName>;
}
-multiclass FLAT_Real_RTN_gfx10<bits<7> op> {
+multiclass FLAT_Real_RTN_gfx10<bits<7> op, string psName = NAME,
+ string asmName = !cast<FLAT_Pseudo>(psName).Mnemonic> {
def _RTN_gfx10 :
- FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_RTN")>;
+ FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(psName#"_RTN"), asmName>;
}
-multiclass FLAT_Real_SADDR_gfx10<bits<7> op> {
+multiclass FLAT_Real_SADDR_gfx10<bits<7> op, string psName = NAME,
+ string asmName = !cast<FLAT_Pseudo>(psName#"_SADDR").Mnemonic> {
def _SADDR_gfx10 :
- FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>;
+ FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(psName#"_SADDR"), asmName>;
}
-multiclass FLAT_Real_SADDR_RTN_gfx10<bits<7> op> {
+multiclass FLAT_Real_SADDR_RTN_gfx10<bits<7> op, string psName = NAME,
+ string asmName = !cast<FLAT_Pseudo>(psName#"_SADDR_RTN").Mnemonic> {
def _SADDR_RTN_gfx10 :
- FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN")>;
+ FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(psName#"_SADDR_RTN"), asmName>;
}
multiclass FLAT_Real_ST_gfx10<bits<7> op> {
@@ -2128,22 +2135,25 @@ multiclass FLAT_Real_ST_gfx10<bits<7> op> {
FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_ST")>;
}
-multiclass FLAT_Real_AllAddr_gfx10<bits<7> op> :
- FLAT_Real_Base_gfx10<op>,
- FLAT_Real_SADDR_gfx10<op>;
+multiclass FLAT_Real_AllAddr_gfx10<bits<7> op, string OpName = NAME,
+ string asmName = !cast<FLAT_Pseudo>(OpName).Mnemonic> :
+ FLAT_Real_Base_gfx10<op, OpName, asmName>,
+ FLAT_Real_SADDR_gfx10<op, OpName, asmName>;
-multiclass FLAT_Real_Atomics_gfx10<bits<7> op> :
- FLAT_Real_Base_gfx10<op>,
- FLAT_Real_RTN_gfx10<op>;
+multiclass FLAT_Real_Atomics_gfx10<bits<7> op, string OpName = NAME,
+ string asmName = !cast<FLAT_Pseudo>(OpName).Mnemonic> :
+ FLAT_Real_Base_gfx10<op, OpName, asmName>,
+ FLAT_Real_RTN_gfx10<op, OpName, asmName>;
-multiclass FLAT_Real_GlblAtomics_gfx10<bits<7> op> :
- FLAT_Real_AllAddr_gfx10<op>,
- FLAT_Real_RTN_gfx10<op>,
- FLAT_Real_SADDR_RTN_gfx10<op>;
+multiclass FLAT_Real_GlblAtomics_gfx10<bits<7> op, string OpName = NAME,
+ string asmName = !cast<FLAT_Pseudo>(OpName).Mnemonic> :
+ FLAT_Real_AllAddr_gfx10<op, OpName, asmName>,
+ FLAT_Real_RTN_gfx10<op, OpName, asmName>,
+ FLAT_Real_SADDR_RTN_gfx10<op, OpName, asmName>;
-multiclass FLAT_Real_GlblAtomics_RTN_gfx10<bits<7> op> :
- FLAT_Real_RTN_gfx10<op>,
- FLAT_Real_SADDR_RTN_gfx10<op>;
+multiclass FLAT_Real_GlblAtomics_RTN_gfx10<bits<7> op, string OpName = NAME> :
+ FLAT_Real_RTN_gfx10<op, OpName>,
+ FLAT_Real_SADDR_RTN_gfx10<op, OpName>;
multiclass FLAT_Real_ScratchAllAddr_gfx10<bits<7> op> :
FLAT_Real_Base_gfx10<op>,
@@ -2220,8 +2230,8 @@ defm FLAT_ATOMIC_XOR_X2 : FLAT_Real_Atomics_gfx10<0x05b>;
defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_gfx10<0x05c>;
defm FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_gfx10<0x05d>;
defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Real_Atomics_gfx10<0x05e>;
-defm FLAT_ATOMIC_FMIN_X2 : FLAT_Real_Atomics_gfx10<0x05f>;
-defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_gfx10<0x060>;
+defm FLAT_ATOMIC_FMIN_X2 : FLAT_Real_Atomics_gfx10<0x05f, "FLAT_ATOMIC_MIN_F64", "flat_atomic_fmin_x2">;
+defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_gfx10<0x060, "FLAT_ATOMIC_MAX_F64", "flat_atomic_fmax_x2">;
// ENC_FLAT_GLBL.
@@ -2278,8 +2288,8 @@ defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Real_GlblAtomics_gfx10<0x05b>;
defm GLOBAL_ATOMIC_INC_X2 : FLAT_Real_GlblAtomics_gfx10<0x05c>;
defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Real_GlblAtomics_gfx10<0x05d>;
defm GLOBAL_ATOMIC_FCMPSWAP_X2 : FLAT_Real_GlblAtomics_gfx10<0x05e>;
-defm GLOBAL_ATOMIC_FMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x05f>;
-defm GLOBAL_ATOMIC_FMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x060>;
+defm GLOBAL_ATOMIC_FMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x05f, "GLOBAL_ATOMIC_MIN_F64", "global_atomic_fmin_x2">;
+defm GLOBAL_ATOMIC_FMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x060, "GLOBAL_ATOMIC_MAX_F64", "global_atomic_fmax_x2">;
defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Real_AllAddr_gfx10<0x016>;
defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Real_AllAddr_gfx10<0x017>;
@@ -2671,6 +2681,8 @@ defm GLOBAL_STORE_BYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x024, "global_s
defm GLOBAL_STORE_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x025, "global_store_d16_hi_b16">;
defm GLOBAL_LOAD_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">;
defm GLOBAL_STORE_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">;
+defm GLOBAL_LOAD_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x053>;
+defm GLOBAL_STORE_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x054>;
defm GLOBAL_ATOMIC_SWAP : VGLOBAL_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">;
defm GLOBAL_ATOMIC_CMPSWAP : VGLOBAL_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">;
@@ -2741,3 +2753,6 @@ defm SCRATCH_LOAD_SBYTE_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x22, "scratch_
defm SCRATCH_LOAD_SHORT_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x23, "scratch_load_d16_hi_b16">;
defm SCRATCH_STORE_BYTE_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x24, "scratch_store_d16_hi_b8">;
defm SCRATCH_STORE_SHORT_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x25, "scratch_store_d16_hi_b16">;
+
+defm SCRATCH_LOAD_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x53>;
+defm SCRATCH_STORE_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x54>;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 94d93390d091..217279211531 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -116,31 +116,112 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
<< ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
}
+/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
+/// current register pressure.
+///
+/// This works for the common case, but it has a few exceptions that have been
+/// observed through trial and error:
+/// - Explicit physical register operands
+/// - Subregister definitions
+///
+/// In both of those cases, PressureDiff doesn't represent the actual pressure,
+/// and querying LiveIntervals through the RegPressureTracker is needed to get
+/// an accurate value.
+///
+/// We should eventually only use PressureDiff for maximum performance, but this
+/// already allows 80% of SUs to take the fast path without changing scheduling
+/// at all. Further changes would either change scheduling, or require a lot
+/// more logic to recover an accurate pressure estimate from the PressureDiffs.
+static bool canUsePressureDiffs(const SUnit &SU) {
+ if (!SU.isInstr())
+ return false;
+
+ // Cannot use pressure diffs for subregister defs or with physregs, it's
+ // imprecise in both cases.
+ for (const auto &Op : SU.getInstr()->operands()) {
+ if (!Op.isReg() || Op.isImplicit())
+ continue;
+ if (Op.getReg().isPhysical() ||
+ (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
+ return false;
+ }
+ return true;
+}
+
+static void getRegisterPressures(bool AtTop,
+ const RegPressureTracker &RPTracker, SUnit *SU,
+ std::vector<unsigned> &Pressure,
+ std::vector<unsigned> &MaxPressure) {
+ // getDownwardPressure() and getUpwardPressure() make temporary changes to
+ // the tracker, so we need to pass those function a non-const copy.
+ RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
+ if (AtTop)
+ TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
+ else
+ TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
+}
+
void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop,
const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI,
unsigned SGPRPressure,
- unsigned VGPRPressure) {
+ unsigned VGPRPressure, bool IsBottomUp) {
Cand.SU = SU;
Cand.AtTop = AtTop;
if (!DAG->isTrackingPressure())
return;
- // getDownwardPressure() and getUpwardPressure() make temporary changes to
- // the tracker, so we need to pass those function a non-const copy.
- RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
-
Pressure.clear();
MaxPressure.clear();
- if (AtTop)
- TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
- else {
- // FIXME: I think for bottom up scheduling, the register pressure is cached
- // and can be retrieved by DAG->getPressureDif(SU).
- TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
+ // We try to use the cached PressureDiffs in the ScheduleDAG whenever
+ // possible over querying the RegPressureTracker.
+ //
+ // RegPressureTracker will make a lot of LIS queries which are very
+ // expensive, it is considered a slow function in this context.
+ //
+ // PressureDiffs are precomputed and cached, and getPressureDiff is just a
+ // trivial lookup into an array. It is pretty much free.
+ //
+ // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
+ // PressureDiffs.
+ if (AtTop || !canUsePressureDiffs(*SU)) {
+ getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure);
+ } else {
+ // Reserve 4 slots.
+ Pressure.resize(4, 0);
+ Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
+ Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
+
+ for (const auto &Diff : DAG->getPressureDiff(SU)) {
+ if (!Diff.isValid())
+ continue;
+ // PressureDiffs is always bottom-up so if we're working top-down we need
+ // to invert its sign.
+ Pressure[Diff.getPSet()] +=
+ (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
+ }
+
+#ifdef EXPENSIVE_CHECKS
+ std::vector<unsigned> CheckPressure, CheckMaxPressure;
+ getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure);
+ if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
+ CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
+ Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
+ CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
+ errs() << "Register Pressure is inaccurate when calculated through "
+ "PressureDiff\n"
+ << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
+ << ", expected "
+ << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
+ << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
+ << ", expected "
+ << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
+ report_fatal_error("inaccurate register pressure calculation");
+ }
+#endif
}
unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
@@ -158,7 +239,6 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
-
// FIXME: We have to enter REG-EXCESS before we reach the actual threshold
// to increase the likelihood we don't go over the limits. We should improve
// the analysis to look through dependencies to find the path with the least
@@ -207,7 +287,8 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
- SchedCandidate &Cand) {
+ SchedCandidate &Cand,
+ bool IsBottomUp) {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
unsigned SGPRPressure = 0;
@@ -220,8 +301,8 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
for (SUnit *SU : Q) {
SchedCandidate TryCand(ZonePolicy);
- initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI,
- SGPRPressure, VGPRPressure);
+ initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
+ VGPRPressure, IsBottomUp);
// Pass SchedBoundary only when comparing nodes from the same boundary.
SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
tryCandidate(Cand, TryCand, ZoneArg);
@@ -262,7 +343,8 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
if (!BotCand.isValid() || BotCand.SU->isScheduled ||
BotCand.Policy != BotPolicy) {
BotCand.reset(CandPolicy());
- pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
+ pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
+ /*IsBottomUp=*/true);
assert(BotCand.Reason != NoCand && "failed to find the first candidate");
} else {
LLVM_DEBUG(traceCandidate(BotCand));
@@ -270,7 +352,8 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
if (VerifyScheduling) {
SchedCandidate TCand;
TCand.reset(CandPolicy());
- pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
+ pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
+ /*IsBottomUp=*/true);
assert(TCand.SU == BotCand.SU &&
"Last pick result should correspond to re-picking right now");
}
@@ -282,7 +365,8 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
if (!TopCand.isValid() || TopCand.SU->isScheduled ||
TopCand.Policy != TopPolicy) {
TopCand.reset(CandPolicy());
- pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
+ pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
+ /*IsBottomUp=*/false);
assert(TopCand.Reason != NoCand && "failed to find the first candidate");
} else {
LLVM_DEBUG(traceCandidate(TopCand));
@@ -290,7 +374,8 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
if (VerifyScheduling) {
SchedCandidate TCand;
TCand.reset(CandPolicy());
- pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
+ pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
+ /*IsBottomUp=*/false);
assert(TCand.SU == TopCand.SU &&
"Last pick result should correspond to re-picking right now");
}
@@ -327,7 +412,8 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
if (!SU) {
CandPolicy NoPolicy;
TopCand.reset(NoPolicy);
- pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
+ pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
+ /*IsBottomUp=*/false);
assert(TopCand.Reason != NoCand && "failed to find a candidate");
SU = TopCand.SU;
}
@@ -337,7 +423,8 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
if (!SU) {
CandPolicy NoPolicy;
BotCand.reset(NoPolicy);
- pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
+ pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
+ /*IsBottomUp=*/true);
assert(BotCand.Reason != NoCand && "failed to find a candidate");
SU = BotCand.SU;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 2084aae4128f..f0aea2bc4ab8 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -45,12 +45,12 @@ protected:
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
- SchedCandidate &Cand);
+ SchedCandidate &Cand, bool IsBottomUp);
- void initCandidate(SchedCandidate &Cand, SUnit *SU,
- bool AtTop, const RegPressureTracker &RPTracker,
- const SIRegisterInfo *SRI,
- unsigned SGPRPressure, unsigned VGPRPressure);
+ void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop,
+ const RegPressureTracker &RPTracker,
+ const SIRegisterInfo *SRI, unsigned SGPRPressure,
+ unsigned VGPRPressure, bool IsBottomUp);
std::vector<unsigned> Pressure;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index db5b467f2238..07ff855756ec 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -159,6 +159,10 @@ protected:
bool HasFP8Insts = false;
bool HasFP8ConversionInsts = false;
bool HasPkFmacF16Inst = false;
+ bool HasAtomicFMinFMaxF32GlobalInsts = false;
+ bool HasAtomicFMinFMaxF64GlobalInsts = false;
+ bool HasAtomicFMinFMaxF32FlatInsts = false;
+ bool HasAtomicFMinFMaxF64FlatInsts = false;
bool HasAtomicDsPkAdd16Insts = false;
bool HasAtomicFlatPkAdd16Insts = false;
bool HasAtomicFaddRtnInsts = false;
@@ -167,6 +171,7 @@ protected:
bool HasAtomicBufferGlobalPkAddF16Insts = false;
bool HasAtomicCSubNoRtnInsts = false;
bool HasAtomicGlobalPkAddBF16Inst = false;
+ bool HasAtomicBufferPkAddBF16Inst = false;
bool HasFlatAtomicFaddF32Inst = false;
bool HasDefaultComponentZero = false;
bool HasDefaultComponentBroadcast = false;
@@ -820,6 +825,22 @@ public:
return HasPkFmacF16Inst;
}
+ bool hasAtomicFMinFMaxF32GlobalInsts() const {
+ return HasAtomicFMinFMaxF32GlobalInsts;
+ }
+
+ bool hasAtomicFMinFMaxF64GlobalInsts() const {
+ return HasAtomicFMinFMaxF64GlobalInsts;
+ }
+
+ bool hasAtomicFMinFMaxF32FlatInsts() const {
+ return HasAtomicFMinFMaxF32FlatInsts;
+ }
+
+ bool hasAtomicFMinFMaxF64FlatInsts() const {
+ return HasAtomicFMinFMaxF64FlatInsts;
+ }
+
bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
@@ -844,6 +865,10 @@ public:
return HasAtomicGlobalPkAddBF16Inst;
}
+ bool hasAtomicBufferPkAddBF16Inst() const {
+ return HasAtomicBufferPkAddBF16Inst;
+ }
+
bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
@@ -1547,6 +1572,8 @@ public:
bool hasFlatScratchInit() const { return FlatScratchInit; }
+ bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }
+
unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
@@ -1611,6 +1638,8 @@ private:
bool FlatScratchInit = false;
+ bool PrivateSegmentSize = false;
+
unsigned NumKernargPreloadSGPRs = 0;
unsigned NumUsedUserSGPRs = 0;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 883b6c4407fe..bb5de368810d 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -43,7 +43,6 @@ void AMDGPUInstPrinter::printRegName(raw_ostream &OS, MCRegister Reg) const {
void AMDGPUInstPrinter::printInst(const MCInst *MI, uint64_t Address,
StringRef Annot, const MCSubtargetInfo &STI,
raw_ostream &OS) {
- OS.flush();
printInstruction(MI, Address, STI, OS);
printAnnotation(OS, Annot);
}
@@ -57,9 +56,15 @@ void AMDGPUInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isExpr()) {
+ Op.getExpr()->print(O, &MAI);
+ return;
+ }
+
// It's possible to end up with a 32-bit literal used with a 16-bit operand
// with ignored high bits. Print as 32-bit anyway in that case.
- int64_t Imm = MI->getOperand(OpNo).getImm();
+ int64_t Imm = Op.getImm();
if (isInt<16>(Imm) || isUInt<16>(Imm))
O << formatHex(static_cast<uint64_t>(Imm & 0xffff));
else
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index fb93f45e3e87..b3cca91f6380 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -662,6 +662,11 @@ void AMDGPUMCCodeEmitter::getMachineOpValueT16Lo128(
void AMDGPUMCCodeEmitter::getMachineOpValueCommon(
const MCInst &MI, const MCOperand &MO, unsigned OpNo, APInt &Op,
SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const {
+ int64_t Val;
+ if (MO.isExpr() && MO.getExpr()->evaluateAsAbsolute(Val)) {
+ Op = Val;
+ return;
+ }
if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) {
// FIXME: If this is expression is PCRel or not should not depend on what
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
index 159664faf983..83fbf4ac53d5 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
@@ -21,13 +21,11 @@
using namespace llvm;
using namespace llvm::AMDGPU;
-AMDGPUVariadicMCExpr::AMDGPUVariadicMCExpr(VariadicKind Kind,
- ArrayRef<const MCExpr *> Args,
- MCContext &Ctx)
+AMDGPUMCExpr::AMDGPUMCExpr(VariantKind Kind, ArrayRef<const MCExpr *> Args,
+ MCContext &Ctx)
: Kind(Kind), Ctx(Ctx) {
assert(Args.size() >= 1 && "Needs a minimum of one expression.");
- assert(Kind != AGVK_None &&
- "Cannot construct AMDGPUVariadicMCExpr of kind none.");
+ assert(Kind != AGVK_None && "Cannot construct AMDGPUMCExpr of kind none.");
// Allocating the variadic arguments through the same allocation mechanism
// that the object itself is allocated with so they end up in the same memory.
@@ -40,25 +38,23 @@ AMDGPUVariadicMCExpr::AMDGPUVariadicMCExpr(VariadicKind Kind,
this->Args = ArrayRef<const MCExpr *>(RawArgs, Args.size());
}
-AMDGPUVariadicMCExpr::~AMDGPUVariadicMCExpr() { Ctx.deallocate(RawArgs); }
+AMDGPUMCExpr::~AMDGPUMCExpr() { Ctx.deallocate(RawArgs); }
-const AMDGPUVariadicMCExpr *
-AMDGPUVariadicMCExpr::create(VariadicKind Kind, ArrayRef<const MCExpr *> Args,
- MCContext &Ctx) {
- return new (Ctx) AMDGPUVariadicMCExpr(Kind, Args, Ctx);
+const AMDGPUMCExpr *AMDGPUMCExpr::create(VariantKind Kind,
+ ArrayRef<const MCExpr *> Args,
+ MCContext &Ctx) {
+ return new (Ctx) AMDGPUMCExpr(Kind, Args, Ctx);
}
-const MCExpr *AMDGPUVariadicMCExpr::getSubExpr(size_t Index) const {
- assert(Index < Args.size() &&
- "Indexing out of bounds AMDGPUVariadicMCExpr sub-expr");
+const MCExpr *AMDGPUMCExpr::getSubExpr(size_t Index) const {
+ assert(Index < Args.size() && "Indexing out of bounds AMDGPUMCExpr sub-expr");
return Args[Index];
}
-void AMDGPUVariadicMCExpr::printImpl(raw_ostream &OS,
- const MCAsmInfo *MAI) const {
+void AMDGPUMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
switch (Kind) {
default:
- llvm_unreachable("Unknown AMDGPUVariadicMCExpr kind.");
+ llvm_unreachable("Unknown AMDGPUMCExpr kind.");
case AGVK_Or:
OS << "or(";
break;
@@ -86,21 +82,19 @@ void AMDGPUVariadicMCExpr::printImpl(raw_ostream &OS,
OS << ')';
}
-static int64_t op(AMDGPUVariadicMCExpr::VariadicKind Kind, int64_t Arg1,
- int64_t Arg2) {
+static int64_t op(AMDGPUMCExpr::VariantKind Kind, int64_t Arg1, int64_t Arg2) {
switch (Kind) {
default:
- llvm_unreachable("Unknown AMDGPUVariadicMCExpr kind.");
- case AMDGPUVariadicMCExpr::AGVK_Max:
+ llvm_unreachable("Unknown AMDGPUMCExpr kind.");
+ case AMDGPUMCExpr::AGVK_Max:
return std::max(Arg1, Arg2);
- case AMDGPUVariadicMCExpr::AGVK_Or:
+ case AMDGPUMCExpr::AGVK_Or:
return Arg1 | Arg2;
}
}
-bool AMDGPUVariadicMCExpr::evaluateExtraSGPRs(MCValue &Res,
- const MCAsmLayout *Layout,
- const MCFixup *Fixup) const {
+bool AMDGPUMCExpr::evaluateExtraSGPRs(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
MCValue MCVal;
if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
@@ -112,7 +106,7 @@ bool AMDGPUVariadicMCExpr::evaluateExtraSGPRs(MCValue &Res,
};
assert(Args.size() == 3 &&
- "AMDGPUVariadic Argument count incorrect for ExtraSGPRs");
+ "AMDGPUMCExpr Argument count incorrect for ExtraSGPRs");
const MCSubtargetInfo *STI = Ctx.getSubtargetInfo();
uint64_t VCCUsed = 0, FlatScrUsed = 0, XNACKUsed = 0;
@@ -129,9 +123,8 @@ bool AMDGPUVariadicMCExpr::evaluateExtraSGPRs(MCValue &Res,
return true;
}
-bool AMDGPUVariadicMCExpr::evaluateTotalNumVGPR(MCValue &Res,
- const MCAsmLayout *Layout,
- const MCFixup *Fixup) const {
+bool AMDGPUMCExpr::evaluateTotalNumVGPR(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
MCValue MCVal;
if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
@@ -142,7 +135,7 @@ bool AMDGPUVariadicMCExpr::evaluateTotalNumVGPR(MCValue &Res,
return true;
};
assert(Args.size() == 2 &&
- "AMDGPUVariadic Argument count incorrect for TotalNumVGPRs");
+ "AMDGPUMCExpr Argument count incorrect for TotalNumVGPRs");
const MCSubtargetInfo *STI = Ctx.getSubtargetInfo();
uint64_t NumAGPR = 0, NumVGPR = 0;
@@ -158,9 +151,8 @@ bool AMDGPUVariadicMCExpr::evaluateTotalNumVGPR(MCValue &Res,
return true;
}
-bool AMDGPUVariadicMCExpr::evaluateAlignTo(MCValue &Res,
- const MCAsmLayout *Layout,
- const MCFixup *Fixup) const {
+bool AMDGPUMCExpr::evaluateAlignTo(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
MCValue MCVal;
if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
@@ -172,7 +164,7 @@ bool AMDGPUVariadicMCExpr::evaluateAlignTo(MCValue &Res,
};
assert(Args.size() == 2 &&
- "AMDGPUVariadic Argument count incorrect for AlignTo");
+ "AMDGPUMCExpr Argument count incorrect for AlignTo");
uint64_t Value = 0, Align = 0;
if (!TryGetMCExprValue(Args[0], Value) || !TryGetMCExprValue(Args[1], Align))
return false;
@@ -181,9 +173,8 @@ bool AMDGPUVariadicMCExpr::evaluateAlignTo(MCValue &Res,
return true;
}
-bool AMDGPUVariadicMCExpr::evaluateOccupancy(MCValue &Res,
- const MCAsmLayout *Layout,
- const MCFixup *Fixup) const {
+bool AMDGPUMCExpr::evaluateOccupancy(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
MCValue MCVal;
if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
@@ -194,7 +185,7 @@ bool AMDGPUVariadicMCExpr::evaluateOccupancy(MCValue &Res,
return true;
};
assert(Args.size() == 7 &&
- "AMDGPUVariadic Argument count incorrect for Occupancy");
+ "AMDGPUMCExpr Argument count incorrect for Occupancy");
uint64_t InitOccupancy, MaxWaves, Granule, TargetTotalNumVGPRs, Generation,
NumSGPRs, NumVGPRs;
@@ -226,8 +217,9 @@ bool AMDGPUVariadicMCExpr::evaluateOccupancy(MCValue &Res,
return true;
}
-bool AMDGPUVariadicMCExpr::evaluateAsRelocatableImpl(
- MCValue &Res, const MCAsmLayout *Layout, const MCFixup *Fixup) const {
+bool AMDGPUMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
std::optional<int64_t> Total;
switch (Kind) {
@@ -258,12 +250,12 @@ bool AMDGPUVariadicMCExpr::evaluateAsRelocatableImpl(
return true;
}
-void AMDGPUVariadicMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+void AMDGPUMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
for (const MCExpr *Arg : Args)
Streamer.visitUsedExpr(*Arg);
}
-MCFragment *AMDGPUVariadicMCExpr::findAssociatedFragment() const {
+MCFragment *AMDGPUMCExpr::findAssociatedFragment() const {
for (const MCExpr *Arg : Args) {
if (Arg->findAssociatedFragment())
return Arg->findAssociatedFragment();
@@ -275,18 +267,19 @@ MCFragment *AMDGPUVariadicMCExpr::findAssociatedFragment() const {
/// are unresolvable but needed for further MCExprs). Derived from
/// implementation of IsaInfo::getNumExtraSGPRs in AMDGPUBaseInfo.cpp.
///
-const AMDGPUVariadicMCExpr *
-AMDGPUVariadicMCExpr::createExtraSGPRs(const MCExpr *VCCUsed,
- const MCExpr *FlatScrUsed,
- bool XNACKUsed, MCContext &Ctx) {
+const AMDGPUMCExpr *AMDGPUMCExpr::createExtraSGPRs(const MCExpr *VCCUsed,
+ const MCExpr *FlatScrUsed,
+ bool XNACKUsed,
+ MCContext &Ctx) {
return create(AGVK_ExtraSGPRs,
{VCCUsed, FlatScrUsed, MCConstantExpr::create(XNACKUsed, Ctx)},
Ctx);
}
-const AMDGPUVariadicMCExpr *AMDGPUVariadicMCExpr::createTotalNumVGPR(
- const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx) {
+const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const MCExpr *NumAGPR,
+ const MCExpr *NumVGPR,
+ MCContext &Ctx) {
return create(AGVK_TotalNumVGPRs, {NumAGPR, NumVGPR}, Ctx);
}
@@ -295,10 +288,11 @@ const AMDGPUVariadicMCExpr *AMDGPUVariadicMCExpr::createTotalNumVGPR(
/// Remove dependency on GCNSubtarget and depend only only the necessary values
/// for said occupancy computation. Should match computeOccupancy implementation
/// without passing \p STM on.
-const AMDGPUVariadicMCExpr *
-AMDGPUVariadicMCExpr::createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
- const MCExpr *NumVGPRs,
- const GCNSubtarget &STM, MCContext &Ctx) {
+const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy(unsigned InitOcc,
+ const MCExpr *NumSGPRs,
+ const MCExpr *NumVGPRs,
+ const GCNSubtarget &STM,
+ MCContext &Ctx) {
unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM);
unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
index f92350b59235..207a619d45a1 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@@ -17,7 +17,7 @@ namespace llvm {
class Function;
class GCNSubtarget;
-/// AMDGPU target specific variadic MCExpr operations.
+/// AMDGPU target specific MCExpr operations.
///
/// Takes in a minimum of 1 argument to be used with an operation. The supported
/// operations are:
@@ -27,9 +27,9 @@ class GCNSubtarget;
/// \note If the 'or'/'max' operations are provided only a single argument, the
/// operation will act as a no-op and simply resolve as the provided argument.
///
-class AMDGPUVariadicMCExpr : public MCTargetExpr {
+class AMDGPUMCExpr : public MCTargetExpr {
public:
- enum VariadicKind {
+ enum VariantKind {
AGVK_None,
AGVK_Or,
AGVK_Max,
@@ -40,14 +40,13 @@ public:
};
private:
- VariadicKind Kind;
+ VariantKind Kind;
MCContext &Ctx;
const MCExpr **RawArgs;
ArrayRef<const MCExpr *> Args;
- AMDGPUVariadicMCExpr(VariadicKind Kind, ArrayRef<const MCExpr *> Args,
- MCContext &Ctx);
- ~AMDGPUVariadicMCExpr();
+ AMDGPUMCExpr(VariantKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx);
+ ~AMDGPUMCExpr();
bool evaluateExtraSGPRs(MCValue &Res, const MCAsmLayout *Layout,
const MCFixup *Fixup) const;
@@ -59,40 +58,39 @@ private:
const MCFixup *Fixup) const;
public:
- static const AMDGPUVariadicMCExpr *
- create(VariadicKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx);
+ static const AMDGPUMCExpr *
+ create(VariantKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx);
- static const AMDGPUVariadicMCExpr *createOr(ArrayRef<const MCExpr *> Args,
- MCContext &Ctx) {
- return create(VariadicKind::AGVK_Or, Args, Ctx);
+ static const AMDGPUMCExpr *createOr(ArrayRef<const MCExpr *> Args,
+ MCContext &Ctx) {
+ return create(VariantKind::AGVK_Or, Args, Ctx);
}
- static const AMDGPUVariadicMCExpr *createMax(ArrayRef<const MCExpr *> Args,
- MCContext &Ctx) {
- return create(VariadicKind::AGVK_Max, Args, Ctx);
+ static const AMDGPUMCExpr *createMax(ArrayRef<const MCExpr *> Args,
+ MCContext &Ctx) {
+ return create(VariantKind::AGVK_Max, Args, Ctx);
}
- static const AMDGPUVariadicMCExpr *createExtraSGPRs(const MCExpr *VCCUsed,
- const MCExpr *FlatScrUsed,
- bool XNACKUsed,
- MCContext &Ctx);
+ static const AMDGPUMCExpr *createExtraSGPRs(const MCExpr *VCCUsed,
+ const MCExpr *FlatScrUsed,
+ bool XNACKUsed, MCContext &Ctx);
- static const AMDGPUVariadicMCExpr *createTotalNumVGPR(const MCExpr *NumAGPR,
- const MCExpr *NumVGPR,
- MCContext &Ctx);
+ static const AMDGPUMCExpr *createTotalNumVGPR(const MCExpr *NumAGPR,
+ const MCExpr *NumVGPR,
+ MCContext &Ctx);
- static const AMDGPUVariadicMCExpr *
+ static const AMDGPUMCExpr *
createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx) {
- return create(VariadicKind::AGVK_AlignTo, {Value, Align}, Ctx);
+ return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx);
}
- static const AMDGPUVariadicMCExpr *createOccupancy(unsigned InitOcc,
- const MCExpr *NumSGPRs,
- const MCExpr *NumVGPRs,
- const GCNSubtarget &STM,
- MCContext &Ctx);
+ static const AMDGPUMCExpr *createOccupancy(unsigned InitOcc,
+ const MCExpr *NumSGPRs,
+ const MCExpr *NumVGPRs,
+ const GCNSubtarget &STM,
+ MCContext &Ctx);
- VariadicKind getKind() const { return Kind; }
+ VariantKind getKind() const { return Kind; }
const MCExpr *getSubExpr(size_t Index) const;
void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index e805e964ffe4..531031b58034 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -319,8 +319,9 @@ bool AMDGPUTargetAsmStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
- const MCKernelDescriptor &KD, uint64_t NextVGPR, uint64_t NextSGPR,
- bool ReserveVCC, bool ReserveFlatScr) {
+ const MCKernelDescriptor &KD, const MCExpr *NextVGPR,
+ const MCExpr *NextSGPR, const MCExpr *ReserveVCC,
+ const MCExpr *ReserveFlatScr) {
IsaVersion IVersion = getIsaVersion(STI.getCPU());
const MCAsmInfo *MAI = getContext().getAsmInfo();
@@ -339,16 +340,25 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
OS << '\n';
};
+ auto EmitMCExpr = [&](const MCExpr *Value) {
+ int64_t evaluatableValue;
+ if (Value->evaluateAsAbsolute(evaluatableValue)) {
+ OS << static_cast<uint64_t>(evaluatableValue);
+ } else {
+ Value->print(OS, MAI);
+ }
+ };
+
OS << "\t\t.amdhsa_group_segment_fixed_size ";
- KD.group_segment_fixed_size->print(OS, MAI);
+ EmitMCExpr(KD.group_segment_fixed_size);
OS << '\n';
OS << "\t\t.amdhsa_private_segment_fixed_size ";
- KD.private_segment_fixed_size->print(OS, MAI);
+ EmitMCExpr(KD.private_segment_fixed_size);
OS << '\n';
OS << "\t\t.amdhsa_kernarg_size ";
- KD.kernarg_size->print(OS, MAI);
+ EmitMCExpr(KD.kernarg_size);
OS << '\n';
PrintField(
@@ -433,8 +443,13 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
".amdhsa_system_vgpr_workitem_id");
// These directives are required.
- OS << "\t\t.amdhsa_next_free_vgpr " << NextVGPR << '\n';
- OS << "\t\t.amdhsa_next_free_sgpr " << NextSGPR << '\n';
+ OS << "\t\t.amdhsa_next_free_vgpr ";
+ EmitMCExpr(NextVGPR);
+ OS << '\n';
+
+ OS << "\t\t.amdhsa_next_free_sgpr ";
+ EmitMCExpr(NextSGPR);
+ OS << '\n';
if (AMDGPU::isGFX90A(STI)) {
// MCExpr equivalent of taking the (accum_offset + 1) * 4.
@@ -447,19 +462,19 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
accum_bits = MCBinaryExpr::createMul(
accum_bits, MCConstantExpr::create(4, getContext()), getContext());
OS << "\t\t.amdhsa_accum_offset ";
- int64_t IVal;
- if (accum_bits->evaluateAsAbsolute(IVal)) {
- OS << static_cast<uint64_t>(IVal);
- } else {
- accum_bits->print(OS, MAI);
- }
+ EmitMCExpr(accum_bits);
OS << '\n';
}
- if (!ReserveVCC)
- OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n';
- if (IVersion.Major >= 7 && !ReserveFlatScr && !hasArchitectedFlatScratch(STI))
- OS << "\t\t.amdhsa_reserve_flat_scratch " << ReserveFlatScr << '\n';
+ OS << "\t\t.amdhsa_reserve_vcc ";
+ EmitMCExpr(ReserveVCC);
+ OS << '\n';
+
+ if (IVersion.Major >= 7 && !hasArchitectedFlatScratch(STI)) {
+ OS << "\t\t.amdhsa_reserve_flat_scratch ";
+ EmitMCExpr(ReserveFlatScr);
+ OS << '\n';
+ }
switch (CodeObjectVersion) {
default:
@@ -915,8 +930,9 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
- const MCKernelDescriptor &KernelDescriptor, uint64_t NextVGPR,
- uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) {
+ const MCKernelDescriptor &KernelDescriptor, const MCExpr *NextVGPR,
+ const MCExpr *NextSGPR, const MCExpr *ReserveVCC,
+ const MCExpr *ReserveFlatScr) {
auto &Streamer = getStreamer();
auto &Context = Streamer.getContext();
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index e5c90060cb5d..bf1538c71d15 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -94,8 +94,9 @@ public:
virtual void
EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName,
const AMDGPU::MCKernelDescriptor &KernelDescriptor,
- uint64_t NextVGPR, uint64_t NextSGPR,
- bool ReserveVCC, bool ReserveFlatScr) {}
+ const MCExpr *NextVGPR, const MCExpr *NextSGPR,
+ const MCExpr *ReserveVCC,
+ const MCExpr *ReserveFlatScr) {}
static StringRef getArchNameFromElfMach(unsigned ElfMach);
static unsigned getElfMach(StringRef GPU);
@@ -151,8 +152,9 @@ public:
void
EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName,
const AMDGPU::MCKernelDescriptor &KernelDescriptor,
- uint64_t NextVGPR, uint64_t NextSGPR,
- bool ReserveVCC, bool ReserveFlatScr) override;
+ const MCExpr *NextVGPR, const MCExpr *NextSGPR,
+ const MCExpr *ReserveVCC,
+ const MCExpr *ReserveFlatScr) override;
};
class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
@@ -207,8 +209,9 @@ public:
void
EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName,
const AMDGPU::MCKernelDescriptor &KernelDescriptor,
- uint64_t NextVGPR, uint64_t NextSGPR,
- bool ReserveVCC, bool ReserveFlatScr) override;
+ const MCExpr *NextVGPR, const MCExpr *NextSGPR,
+ const MCExpr *ReserveVCC,
+ const MCExpr *ReserveFlatScr) override;
};
}
#endif
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp
index 22d0594e2b86..56a23e26b8d9 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp
@@ -21,7 +21,6 @@ using namespace llvm;
void R600InstPrinter::printInst(const MCInst *MI, uint64_t Address,
StringRef Annot, const MCSubtargetInfo &STI,
raw_ostream &O) {
- O.flush();
printInstruction(MI, Address, O);
printAnnotation(O, Annot);
}
diff --git a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp
index 0a96c643d9bd..1a73fdf028c9 100644
--- a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp
+++ b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp
@@ -113,8 +113,8 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
- AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addRequired<MachinePostDominatorTreeWrapperPass>();
AU.addRequired<MachineLoopInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -140,9 +140,9 @@ public:
FuncRep = &MF;
MLI = &getAnalysis<MachineLoopInfo>();
LLVM_DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
- MDT = &getAnalysis<MachineDominatorTree>();
- LLVM_DEBUG(MDT->print(dbgs(), (const Module *)nullptr););
- PDT = &getAnalysis<MachinePostDominatorTree>();
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+ LLVM_DEBUG(MDT->print(dbgs()););
+ PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
LLVM_DEBUG(PDT->print(dbgs()););
prepare();
run();
@@ -1629,8 +1629,8 @@ void R600MachineCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
INITIALIZE_PASS_BEGIN(R600MachineCFGStructurizer, "amdgpustructurizer",
"AMDGPU CFG Structurizer", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(R600MachineCFGStructurizer, "amdgpustructurizer",
"AMDGPU CFG Structurizer", false, false)
diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index 77935cb4cde1..8bac570d59d4 100644
--- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -103,8 +103,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.addRequired<MachineLoopInfo>();
AU.addPreserved<MachineLoopInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
diff --git a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
index 59e274787590..64185db02ec1 100644
--- a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -35,8 +35,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.addRequired<MachineLoopInfo>();
AU.addPreserved<MachineLoopInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index a00ca625fc73..68c5f23c8e11 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -162,8 +162,8 @@ public:
StringRef getPassName() const override { return "SI Fix SGPR copies"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -173,7 +173,7 @@ public:
INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
"SI Fix SGPR copies", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
"SI Fix SGPR copies", false, false)
@@ -611,8 +611,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
TRI = ST.getRegisterInfo();
TII = ST.getInstrInfo();
- MDT = &getAnalysis<MachineDominatorTree>();
-
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 5c411a095587..7bf6a635158e 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1519,6 +1519,9 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
case AMDGPU::V_MAX_F64_e64:
case AMDGPU::V_MAX_NUM_F64_e64:
case AMDGPU::V_PK_MAX_F16: {
+ if (MI.mayRaiseFPException())
+ return nullptr;
+
if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
return nullptr;
@@ -1565,6 +1568,9 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
if (TII->getClampMask(*Def) != TII->getClampMask(MI))
return false;
+ if (Def->mayRaiseFPException())
+ return false;
+
MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
if (!DefClamp)
return false;
@@ -1650,7 +1656,9 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
Op == AMDGPU::V_MUL_F16_fake16_e64) &&
- MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
+ MFI->getMode().FP64FP16Denormals.Output !=
+ DenormalMode::PreserveSign) ||
+ MI.mayRaiseFPException())
return std::pair(nullptr, SIOutMods::NONE);
const MachineOperand *RegOp = nullptr;
@@ -1725,6 +1733,9 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
return false;
+ if (Def->mayRaiseFPException())
+ return false;
+
// Clamp is applied after omod. If the source already has clamp set, don't
// fold it.
if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4d8667affdb4..83bfb622ee52 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -791,8 +791,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
// Split vector operations.
setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
- ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX,
- ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
+ ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN,
+ ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
ISD::SSUBSAT},
VT, Custom);
@@ -859,19 +859,22 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
- MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},
+ MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
+ MVT::i8},
Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN,
- {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
- MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
- MVT::i16, MVT::i8, MVT::i128},
+ {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
+ MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
+ MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
+ MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
Custom);
setOperationAction(ISD::INTRINSIC_VOID,
- {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
- MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
- MVT::i8, MVT::i128},
+ {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
+ MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
+ MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
+ MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
Custom);
setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
@@ -942,6 +945,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::ATOMIC_LOAD_UMIN,
ISD::ATOMIC_LOAD_UMAX,
ISD::ATOMIC_LOAD_FADD,
+ ISD::ATOMIC_LOAD_FMIN,
+ ISD::ATOMIC_LOAD_FMAX,
ISD::ATOMIC_LOAD_UINC_WRAP,
ISD::ATOMIC_LOAD_UDEC_WRAP,
ISD::INTRINSIC_VOID,
@@ -1109,29 +1114,33 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}
-static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) {
+static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
+ const DataLayout &DL, Type *Ty,
+ unsigned MaxNumLanes) {
assert(MaxNumLanes != 0);
+ LLVMContext &Ctx = Ty->getContext();
if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
- return EVT::getVectorVT(Ty->getContext(),
- EVT::getEVT(VT->getElementType()),
+ return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
NumElts);
}
- return EVT::getEVT(Ty);
+ return TLI.getValueType(DL, Ty);
}
// Peek through TFE struct returns to only use the data size.
-static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
+static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
+ const DataLayout &DL, Type *Ty,
+ unsigned MaxNumLanes) {
auto *ST = dyn_cast<StructType>(Ty);
if (!ST)
- return memVTFromLoadIntrData(Ty, MaxNumLanes);
+ return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
// TFE intrinsics return an aggregate type.
assert(ST->getNumContainedTypes() == 2 &&
ST->getContainedType(1)->isIntegerTy(32));
- return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
+ return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
}
/// Map address space 7 to MVT::v5i32 because that's its in-memory
@@ -1200,9 +1209,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOVolatile;
Info.flags |= MachineMemOperand::MODereferenceable;
if (ME.onlyReadsMemory()) {
- unsigned MaxNumLanes = 4;
-
if (RsrcIntr->IsImage) {
+ unsigned MaxNumLanes = 4;
+
const AMDGPU::ImageDimIntrinsicInfo *Intr
= AMDGPU::getImageDimIntrinsicInfo(IntrID);
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
@@ -1215,9 +1224,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
= cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
}
- }
- Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
+ Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
+ CI.getType(), MaxNumLanes);
+ } else {
+ Info.memVT =
+ memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
+ std::numeric_limits<unsigned>::max());
+ }
// FIXME: What does alignment mean for an image?
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1229,9 +1243,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
if (RsrcIntr->IsImage) {
unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
- Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes);
+ Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
+ DMaskLanes);
} else
- Info.memVT = EVT::getEVT(DataTy);
+ Info.memVT = getValueType(MF.getDataLayout(), DataTy);
Info.flags |= MachineMemOperand::MOStore;
} else {
@@ -1265,7 +1280,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
switch (IntrID) {
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
- case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1280,19 +1294,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
- case Intrinsic::amdgcn_buffer_atomic_fadd: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
- Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
- Info.align.reset();
- Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
-
- const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
- if (!Vol || !Vol->isZero())
- Info.flags |= MachineMemOperand::MOVolatile;
-
- return true;
- }
case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1449,7 +1450,6 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume:
- case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmax:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_ordered_add:
@@ -1610,6 +1610,16 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
return false;
}
+ if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
+ AM.BaseOffs < 0) {
+ // Scalar (non-buffer) loads can only use a negative offset if
+ // soffset+offset is non-negative. Since the compiler can only prove that
+ // in a few special cases, it is safer to claim that negative offsets are
+ // not supported.
+ return false;
+ }
+
if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
return true;
@@ -2468,6 +2478,12 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
CCInfo.AllocateReg(FlatScratchInitReg);
}
+ if (UserSGPRInfo.hasPrivateSegmentSize()) {
+ Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
+ MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
+ CCInfo.AllocateReg(PrivateSegmentSizeReg);
+ }
+
// TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
// these from the dispatch pointer.
}
@@ -5811,6 +5827,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerTRAP(Op, DAG);
case ISD::DEBUGTRAP:
return lowerDEBUGTRAP(Op, DAG);
+ case ISD::ABS:
case ISD::FABS:
case ISD::FNEG:
case ISD::FCANONICALIZE:
@@ -6097,6 +6114,184 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
}
+static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
+ SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ unsigned ValSize = VT.getSizeInBits();
+ unsigned IID = N->getConstantOperandVal(0);
+ bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
+ IID == Intrinsic::amdgcn_permlanex16;
+ SDLoc SL(N);
+ MVT IntVT = MVT::getIntegerVT(ValSize);
+
+ auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
+ SDValue Src2, MVT ValT) -> SDValue {
+ SmallVector<SDValue, 8> Operands;
+ switch (IID) {
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16:
+ Operands.push_back(N->getOperand(6));
+ Operands.push_back(N->getOperand(5));
+ Operands.push_back(N->getOperand(4));
+ [[fallthrough]];
+ case Intrinsic::amdgcn_writelane:
+ Operands.push_back(Src2);
+ [[fallthrough]];
+ case Intrinsic::amdgcn_readlane:
+ Operands.push_back(Src1);
+ [[fallthrough]];
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_permlane64:
+ Operands.push_back(Src0);
+ break;
+ default:
+ llvm_unreachable("unhandled lane op");
+ }
+
+ Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
+ std::reverse(Operands.begin(), Operands.end());
+
+ if (SDNode *GL = N->getGluedNode()) {
+ assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
+ GL = GL->getOperand(0).getNode();
+ Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
+ SDValue(GL, 0)));
+ }
+
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
+ };
+
+ SDValue Src0 = N->getOperand(1);
+ SDValue Src1, Src2;
+ if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
+ IsPermLane16) {
+ Src1 = N->getOperand(2);
+ if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
+ Src2 = N->getOperand(3);
+ }
+
+ if (ValSize == 32) {
+ // Already legal
+ return SDValue();
+ }
+
+ if (ValSize < 32) {
+ bool IsFloat = VT.isFloatingPoint();
+ Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
+ SL, MVT::i32);
+
+ if (IsPermLane16) {
+ Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
+ SL, MVT::i32);
+ }
+
+ if (IID == Intrinsic::amdgcn_writelane) {
+ Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
+ SL, MVT::i32);
+ }
+
+ SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
+ SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
+ return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
+ }
+
+ if (ValSize % 32 != 0)
+ return SDValue();
+
+ auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
+ EVT VT = N->getValueType(0);
+ unsigned NE = VT.getVectorNumElements();
+ EVT EltVT = VT.getVectorElementType();
+ SmallVector<SDValue, 8> Scalars;
+ unsigned NumOperands = N->getNumOperands();
+ SmallVector<SDValue, 4> Operands(NumOperands);
+ SDNode *GL = N->getGluedNode();
+
+ // only handle convergencectrl_glue
+ assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
+
+ for (unsigned i = 0; i != NE; ++i) {
+ for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
+ ++j) {
+ SDValue Operand = N->getOperand(j);
+ EVT OperandVT = Operand.getValueType();
+ if (OperandVT.isVector()) {
+ // A vector operand; extract a single element.
+ EVT OperandEltVT = OperandVT.getVectorElementType();
+ Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
+ Operand, DAG.getVectorIdxConstant(i, SL));
+ } else {
+ // A scalar operand; just use it as is.
+ Operands[j] = Operand;
+ }
+ }
+
+ if (GL)
+ Operands[NumOperands - 1] =
+ DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
+ SDValue(GL->getOperand(0).getNode(), 0));
+
+ Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
+ }
+
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
+ return DAG.getBuildVector(VecVT, SL, Scalars);
+ };
+
+ if (VT.isVector()) {
+ switch (MVT::SimpleValueType EltTy =
+ VT.getVectorElementType().getSimpleVT().SimpleTy) {
+ case MVT::i32:
+ case MVT::f32: {
+ SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
+ return unrollLaneOp(LaneOp.getNode());
+ }
+ case MVT::i16:
+ case MVT::f16:
+ case MVT::bf16: {
+ MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
+ SmallVector<SDValue, 4> Pieces;
+ SDValue Src0SubVec, Src1SubVec, Src2SubVec;
+ for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
+ Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
+ DAG.getConstant(EltIdx, SL, MVT::i32));
+
+ if (IsPermLane16)
+ Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
+ DAG.getConstant(EltIdx, SL, MVT::i32));
+
+ if (IID == Intrinsic::amdgcn_writelane)
+ Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
+ DAG.getConstant(EltIdx, SL, MVT::i32));
+
+ Pieces.push_back(
+ IsPermLane16
+ ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
+ : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
+ EltIdx += 2;
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
+ }
+ default:
+ // Handle all other cases by bitcasting to i32 vectors
+ break;
+ }
+ }
+
+ MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
+ Src0 = DAG.getBitcast(VecVT, Src0);
+
+ if (IsPermLane16)
+ Src1 = DAG.getBitcast(VecVT, Src1);
+
+ if (IID == Intrinsic::amdgcn_writelane)
+ Src2 = DAG.getBitcast(VecVT, Src2);
+
+ SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
+ SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
+ return DAG.getBitcast(VT, UnrolledLaneOp);
+}
+
void SITargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
@@ -8563,6 +8758,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_addrspacecast_nonnull:
return lowerADDRSPACECAST(Op, DAG);
+ case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_writelane:
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16:
+ case Intrinsic::amdgcn_permlane64:
+ return lowerLaneOp(*this, Op.getNode(), DAG);
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -8609,12 +8811,6 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
M->getMemOperand());
}
-// Return a value to use for the idxen operand by examining the vindex operand.
-static unsigned getIdxEn(SDValue VIndex) {
- // No need to set idxen if vindex is known to be zero.
- return isNullConstant(VIndex) ? 0 : 1;
-}
-
SDValue
SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
unsigned NewOpcode) const {
@@ -8703,78 +8899,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
M->getVTList(), Ops, M->getMemoryVT(),
M->getMemOperand());
}
- case Intrinsic::amdgcn_ds_fadd: {
- MemSDNode *M = cast<MemSDNode>(Op);
- unsigned Opc;
- switch (IntrID) {
- case Intrinsic::amdgcn_ds_fadd:
- Opc = ISD::ATOMIC_LOAD_FADD;
- break;
- }
-
- return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
- M->getOperand(0), M->getOperand(2), M->getOperand(3),
- M->getMemOperand());
- }
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
MemSDNode *M = cast<MemSDNode>(Op);
- unsigned Opc;
- switch (IntrID) {
- case Intrinsic::amdgcn_ds_fmin:
- Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
- break;
- case Intrinsic::amdgcn_ds_fmax:
- Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
- break;
- default:
- llvm_unreachable("Unknown intrinsic!");
- }
- SDValue Ops[] = {
- M->getOperand(0), // Chain
- M->getOperand(2), // Ptr
- M->getOperand(3) // Value
- };
-
- return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
- M->getMemoryVT(), M->getMemOperand());
- }
- case Intrinsic::amdgcn_buffer_load:
- case Intrinsic::amdgcn_buffer_load_format: {
- unsigned Glc = Op.getConstantOperandVal(5);
- unsigned Slc = Op.getConstantOperandVal(6);
- unsigned IdxEn = getIdxEn(Op.getOperand(3));
- SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // rsrc
- Op.getOperand(3), // vindex
- SDValue(), // voffset -- will be set by setBufferOffsets
- SDValue(), // soffset -- will be set by setBufferOffsets
- SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
-
- unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
- AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
-
- EVT VT = Op.getValueType();
- EVT IntVT = VT.changeTypeToInteger();
- auto *M = cast<MemSDNode>(Op);
- EVT LoadVT = Op.getValueType();
-
- if (LoadVT.getScalarType() == MVT::f16)
- return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
- M, DAG, Ops);
-
- // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
- if (LoadVT.getScalarType() == MVT::i8 || LoadVT.getScalarType() == MVT::i16)
- return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops,
- M->getMemOperand());
-
- return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
- M->getMemOperand(), DAG);
+ unsigned Opc = IntrID == Intrinsic::amdgcn_ds_fmin ? ISD::ATOMIC_LOAD_FMIN
+ : ISD::ATOMIC_LOAD_FMAX;
+ return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(), M->getOperand(0),
+ M->getOperand(2), M->getOperand(3),
+ M->getMemOperand());
}
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
@@ -8825,35 +8957,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
}
- case Intrinsic::amdgcn_tbuffer_load: {
- MemSDNode *M = cast<MemSDNode>(Op);
- EVT LoadVT = Op.getValueType();
-
- auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
- unsigned Dfmt = Op.getConstantOperandVal(7);
- unsigned Nfmt = Op.getConstantOperandVal(8);
- unsigned Glc = Op.getConstantOperandVal(9);
- unsigned Slc = Op.getConstantOperandVal(10);
- unsigned IdxEn = getIdxEn(Op.getOperand(3));
- SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // rsrc
- Op.getOperand(3), // vindex
- Op.getOperand(4), // voffset
- SOffset, // soffset
- Op.getOperand(6), // offset
- DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
- DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
- };
-
- if (LoadVT.getScalarType() == MVT::f16)
- return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
- M, DAG, Ops);
- return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
- Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
- DAG);
- }
case Intrinsic::amdgcn_raw_tbuffer_load:
case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
@@ -8908,94 +9011,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
DAG);
}
- case Intrinsic::amdgcn_buffer_atomic_swap:
- case Intrinsic::amdgcn_buffer_atomic_add:
- case Intrinsic::amdgcn_buffer_atomic_sub:
- case Intrinsic::amdgcn_buffer_atomic_csub:
- case Intrinsic::amdgcn_buffer_atomic_smin:
- case Intrinsic::amdgcn_buffer_atomic_umin:
- case Intrinsic::amdgcn_buffer_atomic_smax:
- case Intrinsic::amdgcn_buffer_atomic_umax:
- case Intrinsic::amdgcn_buffer_atomic_and:
- case Intrinsic::amdgcn_buffer_atomic_or:
- case Intrinsic::amdgcn_buffer_atomic_xor:
- case Intrinsic::amdgcn_buffer_atomic_fadd: {
- unsigned Slc = Op.getConstantOperandVal(6);
- unsigned IdxEn = getIdxEn(Op.getOperand(4));
- SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // vdata
- Op.getOperand(3), // rsrc
- Op.getOperand(4), // vindex
- SDValue(), // voffset -- will be set by setBufferOffsets
- SDValue(), // soffset -- will be set by setBufferOffsets
- SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
-
- EVT VT = Op.getValueType();
-
- auto *M = cast<MemSDNode>(Op);
- unsigned Opcode = 0;
-
- switch (IntrID) {
- case Intrinsic::amdgcn_buffer_atomic_swap:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
- break;
- case Intrinsic::amdgcn_buffer_atomic_add:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
- break;
- case Intrinsic::amdgcn_buffer_atomic_sub:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
- break;
- case Intrinsic::amdgcn_buffer_atomic_csub:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB;
- break;
- case Intrinsic::amdgcn_buffer_atomic_smin:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
- break;
- case Intrinsic::amdgcn_buffer_atomic_umin:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
- break;
- case Intrinsic::amdgcn_buffer_atomic_smax:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
- break;
- case Intrinsic::amdgcn_buffer_atomic_umax:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
- break;
- case Intrinsic::amdgcn_buffer_atomic_and:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
- break;
- case Intrinsic::amdgcn_buffer_atomic_or:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
- break;
- case Intrinsic::amdgcn_buffer_atomic_xor:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
- break;
- case Intrinsic::amdgcn_buffer_atomic_fadd:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD;
- break;
- default:
- llvm_unreachable("unhandled atomic opcode");
- }
-
- return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
- M->getMemOperand());
- }
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
- case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
- return lowerRawBufferAtomicIntrin(Op, DAG,
- AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
- case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
- return lowerStructBufferAtomicIntrin(Op, DAG,
- AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
@@ -9092,29 +9113,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return lowerStructBufferAtomicIntrin(Op, DAG,
AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
- case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
- unsigned Slc = Op.getConstantOperandVal(7);
- unsigned IdxEn = getIdxEn(Op.getOperand(5));
- SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // src
- Op.getOperand(3), // cmp
- Op.getOperand(4), // rsrc
- Op.getOperand(5), // vindex
- SDValue(), // voffset -- will be set by setBufferOffsets
- SDValue(), // soffset -- will be set by setBufferOffsets
- SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
-
- EVT VT = Op.getValueType();
- auto *M = cast<MemSDNode>(Op);
-
- return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
- Op->getVTList(), Ops, VT, M->getMemOperand());
- }
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
@@ -9313,22 +9311,21 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmin_num: {
- Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
+ Opcode = ISD::ATOMIC_LOAD_FMIN;
break;
}
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmax_num: {
- Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
+ Opcode = ISD::ATOMIC_LOAD_FMAX;
break;
}
default:
llvm_unreachable("unhandled atomic opcode");
}
- return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op),
- M->getVTList(), Ops, M->getMemoryVT(),
- M->getMemOperand());
+ return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
+ Ops, M->getMemOperand());
}
case Intrinsic::amdgcn_s_get_barrier_state: {
SDValue Chain = Op->getOperand(0);
@@ -9557,34 +9554,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return SDValue();
};
- case Intrinsic::amdgcn_tbuffer_store: {
- SDValue VData = Op.getOperand(2);
- bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
- if (IsD16)
- VData = handleD16VData(VData, DAG);
- unsigned Dfmt = Op.getConstantOperandVal(8);
- unsigned Nfmt = Op.getConstantOperandVal(9);
- unsigned Glc = Op.getConstantOperandVal(10);
- unsigned Slc = Op.getConstantOperandVal(11);
- unsigned IdxEn = getIdxEn(Op.getOperand(4));
- SDValue Ops[] = {
- Chain,
- VData, // vdata
- Op.getOperand(3), // rsrc
- Op.getOperand(4), // vindex
- Op.getOperand(5), // voffset
- Op.getOperand(6), // soffset
- Op.getOperand(7), // offset
- DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
- DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
- AMDGPUISD::TBUFFER_STORE_FORMAT;
- MemSDNode *M = cast<MemSDNode>(Op);
- return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
- M->getMemoryVT(), M->getMemOperand());
- }
case Intrinsic::amdgcn_struct_tbuffer_store:
case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
@@ -9642,42 +9611,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
M->getMemoryVT(), M->getMemOperand());
}
- case Intrinsic::amdgcn_buffer_store:
- case Intrinsic::amdgcn_buffer_store_format: {
- SDValue VData = Op.getOperand(2);
- bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
- if (IsD16)
- VData = handleD16VData(VData, DAG);
- unsigned Glc = Op.getConstantOperandVal(6);
- unsigned Slc = Op.getConstantOperandVal(7);
- unsigned IdxEn = getIdxEn(Op.getOperand(4));
- SDValue Ops[] = {
- Chain,
- VData,
- Op.getOperand(3), // rsrc
- Op.getOperand(4), // vindex
- SDValue(), // voffset -- will be set by setBufferOffsets
- SDValue(), // soffset -- will be set by setBufferOffsets
- SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
-
- unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
- AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
- Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
- MemSDNode *M = cast<MemSDNode>(Op);
-
- // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
- EVT VDataType = VData.getValueType().getScalarType();
- if (VDataType == MVT::i8 || VDataType == MVT::i16)
- return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
-
- return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
- M->getMemoryVT(), M->getMemOperand());
- }
-
case Intrinsic::amdgcn_raw_buffer_store:
case Intrinsic::amdgcn_raw_ptr_buffer_store:
case Intrinsic::amdgcn_raw_buffer_store_format:
@@ -10083,8 +10016,8 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
return {N0, SDValue(C1, 0)};
}
-// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
-// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
+// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
+// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
SelectionDAG &DAG, SDValue *Offsets,
@@ -10215,7 +10148,7 @@ SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
EVT VDataType, SDLoc DL,
SDValue Ops[],
MemSDNode *M) const {
- if (VDataType == MVT::f16)
+ if (VDataType == MVT::f16 || VDataType == MVT::bf16)
Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
@@ -16063,8 +15996,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
case ISD::INTRINSIC_W_CHAIN:
return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
case AMDGPUISD::ATOMIC_CMP_SWAP:
- case AMDGPUISD::ATOMIC_LOAD_FMIN:
- case AMDGPUISD::ATOMIC_LOAD_FMAX:
case AMDGPUISD::BUFFER_ATOMIC_SWAP:
case AMDGPUISD::BUFFER_ATOMIC_ADD:
case AMDGPUISD::BUFFER_ATOMIC_SUB:
@@ -16080,7 +16011,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
case AMDGPUISD::BUFFER_ATOMIC_CSUB:
case AMDGPUISD::BUFFER_ATOMIC_FADD:
- case AMDGPUISD::BUFFER_ATOMIC_FADD_BF16:
case AMDGPUISD::BUFFER_ATOMIC_FMIN:
case AMDGPUISD::BUFFER_ATOMIC_FMAX:
// Target-specific read-modify-write atomics are sources of divergence.
@@ -16173,6 +16103,26 @@ static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
<< " operation at memory scope " << MemScope;
}
+static bool isHalf2OrBFloat2(Type *Ty) {
+ if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
+ Type *EltTy = VT->getElementType();
+ return VT->getNumElements() == 2 &&
+ (EltTy->isHalfTy() || EltTy->isBFloatTy());
+ }
+
+ return false;
+}
+
+static bool isHalf2(Type *Ty) {
+ FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
+ return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
+}
+
+static bool isBFloat2(Type *Ty) {
+ FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
+ return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
+}
+
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
unsigned AS = RMW->getPointerAddressSpace();
@@ -16231,7 +16181,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
: AtomicExpansionKind::CmpXChg;
}
- // TODO: Handle v2f16/v2bf16 cases for gfx940
+ if (Subtarget->hasAtomicDsPkAdd16Insts() && isHalf2OrBFloat2(Ty))
+ return AtomicExpansionKind::None;
+
return AtomicExpansionKind::CmpXChg;
}
@@ -16239,10 +16191,36 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
AS != AMDGPUAS::BUFFER_FAT_POINTER)
return AtomicExpansionKind::CmpXChg;
- // TODO: gfx940 supports v2f16 and v2bf16
if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
return AtomicExpansionKind::None;
+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
+ // gfx940, gfx12
+ // FIXME: Needs to account for no fine-grained memory
+ if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
+ return AtomicExpansionKind::None;
+ } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
+ // gfx90a, gfx940, gfx12
+ // FIXME: Needs to account for no fine-grained memory
+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
+ return AtomicExpansionKind::None;
+
+ // gfx940, gfx12
+ // FIXME: Needs to account for no fine-grained memory
+ if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
+ return AtomicExpansionKind::None;
+ } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
+ // gfx90a, gfx940, gfx12
+ // FIXME: Needs to account for no fine-grained memory
+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
+ return AtomicExpansionKind::None;
+
+ // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
+ // buffer. gfx12 does have the buffer version.
+ if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
+ return AtomicExpansionKind::None;
+ }
+
if (unsafeFPAtomicsDisabled(RMW->getFunction()))
return AtomicExpansionKind::CmpXChg;
@@ -16284,17 +16262,51 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
return AtomicExpansionKind::CmpXChg;
}
case AtomicRMWInst::FMin:
- case AtomicRMWInst::FMax:
+ case AtomicRMWInst::FMax: {
+ Type *Ty = RMW->getType();
+
+ // LDS float and double fmin/fmax were always supported.
+ if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
+ return AtomicExpansionKind::None;
+
+ if (unsafeFPAtomicsDisabled(RMW->getFunction()))
+ return AtomicExpansionKind::CmpXChg;
+
+ // Always expand system scope fp atomics.
+ if (HasSystemScope)
+ return AtomicExpansionKind::CmpXChg;
+
+ // For flat and global cases:
+ // float, double in gfx7. Manual claims denormal support.
+ // Removed in gfx8.
+ // float, double restored in gfx10.
+ // double removed again in gfx11, so only f32 for gfx11/gfx12.
+ //
+ // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no
+ // f32.
+ //
+ // FIXME: Check scope and fine grained memory
+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
+ if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
+ AS == AMDGPUAS::BUFFER_FAT_POINTER) {
+ if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ }
+
+ return AtomicExpansionKind::CmpXChg;
+ }
case AtomicRMWInst::Min:
case AtomicRMWInst::Max:
case AtomicRMWInst::UMin:
case AtomicRMWInst::UMax: {
if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
- if (RMW->getType()->isFloatTy() &&
- unsafeFPAtomicsDisabled(RMW->getFunction()))
- return AtomicExpansionKind::CmpXChg;
-
// Always expand system scope min/max atomics.
if (HasSystemScope)
return AtomicExpansionKind::CmpXChg;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 4c02bb1b306e..1f198a92c0fa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -253,9 +253,9 @@ public:
bool shouldExpandVectorDynExt(SDNode *N) const;
private:
- // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
- // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
- // pointed to by Offsets.
+ // Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
+ // the three offsets (voffset, soffset and instoffset) into the SDValue[3]
+ // array pointed to by Offsets.
void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
SDValue *Offsets, Align Alignment = Align(4)) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 230443313d72..4c53a081cdb2 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -641,7 +641,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<MachineLoopInfo>();
- AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequired<MachinePostDominatorTreeWrapperPass>();
AU.addUsedIfAvailable<AAResultsWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
@@ -1118,7 +1118,7 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
false)
@@ -2398,7 +2398,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MLI = &getAnalysis<MachineLoopInfo>();
- PDT = &getAnalysis<MachinePostDominatorTree>();
+ PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
AA = &AAR->getAAResults();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d8e21da8019a..cc1b9ac0c9ec 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2519,12 +2519,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
- case AMDGPU::ENTER_PSEUDO_WM:
- case AMDGPU::EXIT_PSEUDO_WM: {
- // These do nothing.
- MI.eraseFromParent();
- break;
- }
case AMDGPU::SI_RETURN: {
const MachineFunction *MF = MBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
@@ -3978,7 +3972,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.add(*Dst)
.add(*Src0)
.add(*Src1)
- .addImm(Imm);
+ .addImm(Imm)
+ .setMIFlags(MI.getFlags());
updateLiveVariables(LV, MI, *MIB);
if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *MIB);
@@ -3997,7 +3992,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.add(*Dst)
.add(*Src0)
.addImm(Imm)
- .add(*Src2);
+ .add(*Src2)
+ .setMIFlags(MI.getFlags());
updateLiveVariables(LV, MI, *MIB);
if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *MIB);
@@ -4018,7 +4014,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.add(*Dst)
.add(*Src1)
.addImm(Imm)
- .add(*Src2);
+ .add(*Src2)
+ .setMIFlags(MI.getFlags());
updateLiveVariables(LV, MI, *MIB);
if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *MIB);
@@ -4054,7 +4051,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.addImm(Src2Mods ? Src2Mods->getImm() : 0)
.add(*Src2)
.addImm(Clamp ? Clamp->getImm() : 0)
- .addImm(Omod ? Omod->getImm() : 0);
+ .addImm(Omod ? Omod->getImm() : 0)
+ .setMIFlags(MI.getFlags());
if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
MIB.addImm(OpSel ? OpSel->getImm() : 0);
updateLiveVariables(LV, MI, *MIB);
@@ -5657,24 +5655,9 @@ unsigned SIInstrInfo::buildExtractSubReg(
DebugLoc DL = MI->getDebugLoc();
Register SubReg = MRI.createVirtualRegister(SubRC);
- if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
- BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
- .addReg(SuperReg.getReg(), 0, SubIdx);
- return SubReg;
- }
-
- // Just in case the super register is itself a sub-register, copy it to a new
- // value so we don't need to worry about merging its subreg index with the
- // SubIdx passed to this function. The register coalescer should be able to
- // eliminate this extra copy.
- Register NewSuperReg = MRI.createVirtualRegister(SuperRC);
-
- BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
- .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
-
+ unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
- .addReg(NewSuperReg, 0, SubIdx);
-
+ .addReg(SuperReg.getReg(), 0, NewSubIdx);
return SubReg;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 40289f2addfd..c64b3a7c356f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -72,14 +72,6 @@ def SDTAtomic2_f32 : SDTypeProfile<1, 2, [
SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
]>;
-def SIatomic_fmin : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMIN", SDTAtomic2_f32,
- [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
->;
-
-def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32,
- [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
->;
-
// load_d16_{lo|hi} ptr, tied_input
def SIload_d16 : SDTypeProfile<1, 2, [
SDTCisPtrTy<1>,
@@ -222,7 +214,6 @@ defm SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">;
defm SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">;
defm SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">;
defm SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">;
-defm SIbuffer_atomic_fadd_bf16 : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD_BF16">;
defm SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">;
defm SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">;
defm SIbuffer_atomic_cond_sub_u32 : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32">;
@@ -315,13 +306,6 @@ class isIntType<ValueType SrcVT> {
}
//===----------------------------------------------------------------------===//
-// PatFrags for global memory operations
-//===----------------------------------------------------------------------===//
-
-defm atomic_load_fmin : binary_atomic_op_all_as<SIatomic_fmin, 0>;
-defm atomic_load_fmax : binary_atomic_op_all_as<SIatomic_fmax, 0>;
-
-//===----------------------------------------------------------------------===//
// SDNodes PatFrags for loads/stores with a glue input.
// This is for SDNodes and PatFrag for local loads and stores to
// enable s_mov_b32 m0, -1 to be glued to the memory instructions.
@@ -709,15 +693,24 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
>;
let AddressSpaces = StoreAddress_local.AddrSpaces in {
- defm _local_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>;
- defm _local_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"),
- IsInt>;
+
+ if IsInt then {
+ defm _local_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
+ defm _local_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
+ } else {
+ defm _local_m0 : binary_atomic_op_fp <!cast<SDNode>(NAME#"_glue")>;
+ defm _local_m0 : noret_binary_atomic_op_fp <!cast<SDNode>(NAME#"_glue")>;
+ }
}
let AddressSpaces = StoreAddress_region.AddrSpaces in {
- defm _region_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>;
- defm _region_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"),
- IsInt>;
+ if IsInt then {
+ defm _region_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
+ defm _region_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
+ } else {
+ defm _region_m0 : binary_atomic_op_fp <!cast<SDNode>(NAME#"_glue")>;
+ defm _region_m0 : noret_binary_atomic_op_fp <!cast<SDNode>(NAME#"_glue")>;
+ }
}
}
@@ -734,8 +727,8 @@ defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
defm atomic_swap : SIAtomicM0Glue2 <"SWAP">;
defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32, 0>;
-defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32, 0>;
-defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32, 0>;
+defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 0, SDTAtomic2_f32, 0>;
+defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 0, SDTAtomic2_f32, 0>;
def as_i1timm : SDNodeXForm<timm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1);
@@ -2233,13 +2226,12 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
// Return an AGPR+VGPR operand class for the given VGPR register class.
class getLdStRegisterOperand<RegisterClass RC> {
RegisterOperand ret =
- !if(!eq(RC.Size, 32), AVLdSt_32,
- !if(!eq(RC.Size, 64), AVLdSt_64,
- !if(!eq(RC.Size, 96), AVLdSt_96,
- !if(!eq(RC.Size, 128), AVLdSt_128,
- !if(!eq(RC.Size, 160), AVLdSt_160,
- RegisterOperand<VReg_1> // invalid register
- )))));
+ !cond(!eq(RC.Size, 32) : AVLdSt_32,
+ !eq(RC.Size, 64) : AVLdSt_64,
+ !eq(RC.Size, 96) : AVLdSt_96,
+ !eq(RC.Size, 128) : AVLdSt_128,
+ !eq(RC.Size, 160) : AVLdSt_160,
+ !eq(RC.Size, 1024) : AVLdSt_1024);
}
class getHasVOP3DPP <ValueType DstVT = i32, ValueType Src0VT = i32,
@@ -2271,6 +2263,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit EnableClamp = _EnableClamp;
field bit IsTrue16 = 0;
field bit IsRealTrue16 = 0;
+ field bit IsInvalidSingleUseConsumer = 0;
+ field bit IsInvalidSingleUseProducer = 0;
field ValueType DstVT = ArgVT[0];
field ValueType Src0VT = ArgVT[1];
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index c1b844f844c3..835f44f9d0d6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -217,21 +217,6 @@ def S_INVERSE_BALLOT_U32 : SPseudoInstSI <(outs SReg_32:$sdst), (ins SSrc_b32:$m
def S_INVERSE_BALLOT_U64 : SPseudoInstSI <(outs SReg_64:$sdst), (ins SSrc_b64:$mask)>;
} // End usesCustomInserter = 1
-// PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes.
-def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
- let Uses = [EXEC];
- let Defs = [EXEC];
- let hasSideEffects = 0;
- let mayLoad = 0;
- let mayStore = 0;
-}
-
-def EXIT_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
- let hasSideEffects = 0;
- let mayLoad = 0;
- let mayStore = 0;
-}
-
// Pseudo instructions used for @llvm.fptrunc.round upward
// and @llvm.fptrunc.round downward.
// These intrinsics will be legalized to G_FPTRUNC_ROUND_UPWARD
@@ -252,16 +237,22 @@ def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
// restoring it after we're done.
let Defs = [SCC], isConvergent = 1 in {
def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
- (ins VSrc_b32: $src, VSrc_b32:$inactive),
- [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
-}
+ (ins VSrc_b32: $src, VSrc_b32:$inactive), []>;
def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
- (ins VSrc_b64: $src, VSrc_b64:$inactive),
- [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
-}
+ (ins VSrc_b64: $src, VSrc_b64:$inactive), []>;
} // End Defs = [SCC]
+foreach vt = Reg32Types.types in {
+def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
+ (V_SET_INACTIVE_B32 VSrc_b32:$src, VSrc_b32:$inactive)>;
+}
+
+foreach vt = Reg64Types.types in {
+def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
+ (V_SET_INACTIVE_B64 VSrc_b64:$src, VSrc_b64:$inactive)>;
+}
+
def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>;
@@ -3398,7 +3389,7 @@ def : GCNPat<
// FIXME: Should also do this for readlane, but tablegen crashes on
// the ignored src1.
def : GCNPat<
- (int_amdgcn_readfirstlane (i32 imm:$src)),
+ (i32 (int_amdgcn_readfirstlane (i32 imm:$src))),
(S_MOV_B32 SReg_32:$src)
>;
@@ -3872,11 +3863,6 @@ def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction {
let mayStore = 1;
}
-let Namespace = "AMDGPU" in {
-def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP;
-def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP;
-}
-
class BufferAtomicGenericInstruction : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
@@ -3901,7 +3887,6 @@ def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction;
-def G_AMDGPU_BUFFER_ATOMIC_FADD_BF16 : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FMIN : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FMAX : BufferAtomicGenericInstruction;
diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
index abb72e8e63c3..afc6353ec811 100644
--- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
@@ -48,8 +48,8 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
@@ -60,7 +60,7 @@ char SILateBranchLowering::ID = 0;
INITIALIZE_PASS_BEGIN(SILateBranchLowering, DEBUG_TYPE,
"SI insert s_cbranch_execz instructions", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SILateBranchLowering, DEBUG_TYPE,
"SI insert s_cbranch_execz instructions", false, false)
@@ -149,7 +149,7 @@ bool SILateBranchLowering::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
- MDT = &getAnalysis<MachineDominatorTree>();
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 5dc3457b5bfa..75a1575f2180 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -149,7 +149,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addUsedIfAvailable<LiveIntervals>();
// Should preserve the same set that TwoAddressInstructions does.
- AU.addPreserved<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveIntervals>();
AU.addPreservedID(LiveVariablesID);
@@ -764,7 +764,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
LIS = getAnalysisIfAvailable<LiveIntervals>();
// This doesn't actually need LiveVariables, but we can preserve them.
LV = getAnalysisIfAvailable<LiveVariables>();
- MDT = getAnalysisIfAvailable<MachineDominatorTree>();
+ auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
+ MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
MRI = &MF.getRegInfo();
BoolRC = TRI->getBoolRC();
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 32dad0c425c0..a9ee74dec120 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -51,8 +51,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<MachineDominatorTree>();
- AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addRequired<MachinePostDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
@@ -399,8 +399,8 @@ private:
INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
false)
@@ -445,8 +445,9 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
MachineFunctionProperties::Property::Selected))
return false;
- Vreg1LoweringHelper Helper(&TheMF, &getAnalysis<MachineDominatorTree>(),
- &getAnalysis<MachinePostDominatorTree>());
+ Vreg1LoweringHelper Helper(
+ &TheMF, &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(),
+ &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree());
bool Changed = false;
Changed |= Helper.lowerCopiesFromI1();
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 072c5aedc220..d9db0f7a4f53 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -83,7 +83,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
if (CC != CallingConv::AMDGPU_Gfx)
ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
- // TODO: Pick a high register, and shift down, similar to a kernel.
FrameOffsetReg = AMDGPU::SGPR33;
StackPtrOffsetReg = AMDGPU::SGPR32;
@@ -233,6 +232,12 @@ Register SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
return ArgInfo.FlatScratchInit.getRegister();
}
+Register SIMachineFunctionInfo::addPrivateSegmentSize(const SIRegisterInfo &TRI) {
+ ArgInfo.PrivateSegmentSize = ArgDescriptor::createRegister(getNextUserSGPR());
+ NumUserSGPRs += 1;
+ return ArgInfo.PrivateSegmentSize.getRegister();
+}
+
Register SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 9fe02e24c8a1..7af5e7388f84 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -752,6 +752,7 @@ public:
Register addKernargSegmentPtr(const SIRegisterInfo &TRI);
Register addDispatchID(const SIRegisterInfo &TRI);
Register addFlatScratchInit(const SIRegisterInfo &TRI);
+ Register addPrivateSegmentSize(const SIRegisterInfo &TRI);
Register addImplicitBufferPtr(const SIRegisterInfo &TRI);
Register addLDSKernelId();
SmallVectorImpl<MCRegister> *
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
index 8204a70e72d9..18d66e419152 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
@@ -148,10 +148,10 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LiveVariables>();
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
AU.addRequired<MachineLoopInfo>();
AU.addPreserved<LiveVariables>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.addPreserved<MachineLoopInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -618,7 +618,7 @@ char SIOptimizeVGPRLiveRange::ID = 0;
INITIALIZE_PASS_BEGIN(SIOptimizeVGPRLiveRange, DEBUG_TYPE,
"SI Optimize VGPR LiveRange", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(LiveVariables)
INITIALIZE_PASS_END(SIOptimizeVGPRLiveRange, DEBUG_TYPE,
@@ -635,7 +635,7 @@ bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
- MDT = &getAnalysis<MachineDominatorTree>();
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
Loops = &getAnalysis<MachineLoopInfo>();
LV = &getAnalysis<LiveVariables>();
MRI = &MF.getRegInfo();
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 1fadd8ce45b1..f47731bf6aac 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -37,20 +37,22 @@ STATISTIC(NumSDWAInstructionsPeepholed,
namespace {
+bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
+ const SIInstrInfo *TII);
class SDWAOperand;
class SDWADstOperand;
-class SIPeepholeSDWA : public MachineFunctionPass {
-public:
- using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
+using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
+using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>;
+class SIPeepholeSDWA : public MachineFunctionPass {
private:
MachineRegisterInfo *MRI;
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
- MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches;
+ SDWAOperandsMap PotentialMatches;
SmallVector<MachineInstr *, 8> ConvertedInstructions;
std::optional<int64_t> foldToImm(const MachineOperand &Op) const;
@@ -65,7 +67,6 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
void matchSDWAOperands(MachineBasicBlock &MBB);
std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
- bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
void pseudoOpConvertToVOP2(MachineInstr &MI,
const GCNSubtarget &ST) const;
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
@@ -93,7 +94,9 @@ public:
virtual ~SDWAOperand() = default;
- virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
+ virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches = nullptr) = 0;
virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
MachineOperand *getTargetOperand() const { return Target; }
@@ -126,7 +129,9 @@ public:
: SDWAOperand(TargetOp, ReplacedOp),
SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}
- MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+ MachineInstr *potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches = nullptr) override;
bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
SdwaSel getSrcSel() const { return SrcSel; }
@@ -153,7 +158,9 @@ public:
SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
: SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
- MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+ MachineInstr *potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches = nullptr) override;
bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
SdwaSel getDstSel() const { return DstSel; }
@@ -327,7 +334,33 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
return Mods;
}
-MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
+MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches) {
+ if (PotentialMatches != nullptr) {
+ // Fill out the map for all uses if all can be converted
+ MachineOperand *Reg = getReplacedOperand();
+ if (!Reg->isReg() || !Reg->isDef())
+ return nullptr;
+
+ for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
+ // Check that all instructions that use Reg can be converted
+ if (!isConvertibleToSDWA(UseMI, ST, TII))
+ return nullptr;
+
+ // Now that it's guaranteed all uses are legal, iterate over the uses again
+ // to add them for later conversion.
+ for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
+ // Should not get a subregister here
+ assert(isSameReg(UseMO, *Reg));
+
+ SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
+ MachineInstr *UseMI = UseMO.getParent();
+ potentialMatchesMap[UseMI].push_back(this);
+ }
+ return nullptr;
+ }
+
// For SDWA src operand potential instruction is one that use register
// defined by parent instruction
MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
@@ -420,7 +453,9 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
return true;
}
-MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
+MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches) {
// For SDWA dst operand potential instruction is one that defines register
// that this operand uses
MachineRegisterInfo *MRI = getMRI();
@@ -919,8 +954,10 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}
-bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
- const GCNSubtarget &ST) const {
+namespace {
+bool isConvertibleToSDWA(MachineInstr &MI,
+ const GCNSubtarget &ST,
+ const SIInstrInfo* TII) {
// Check if this is already an SDWA instruction
unsigned Opc = MI.getOpcode();
if (TII->isSDWA(Opc))
@@ -980,6 +1017,7 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
return true;
}
+} // namespace
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
const SDWAOperandsVector &SDWAOperands) {
@@ -1215,7 +1253,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
matchSDWAOperands(MBB);
for (const auto &OperandPair : SDWAOperands) {
const auto &Operand = OperandPair.second;
- MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+ MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
if (PotentialMI &&
(PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
@@ -1228,8 +1266,8 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
for (const auto &OperandPair : SDWAOperands) {
const auto &Operand = OperandPair.second;
- MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
- if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
+ MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST, &PotentialMatches);
+ if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII)) {
PotentialMatches[PotentialMI].push_back(Operand.get());
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index 398f870a9f53..5837dbeb3f98 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -165,19 +165,15 @@ SIPreAllocateWWMRegs::printWWMInfo(const MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
- if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM ||
- Opc == AMDGPU::ENTER_PSEUDO_WM) {
+ if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM) {
dbgs() << "Entering ";
} else {
- assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM ||
- Opc == AMDGPU::EXIT_PSEUDO_WM);
+ assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM);
dbgs() << "Exiting ";
}
if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WWM) {
dbgs() << "Strict WWM ";
- } else if (Opc == AMDGPU::ENTER_PSEUDO_WM || Opc == AMDGPU::EXIT_PSEUDO_WM) {
- dbgs() << "Pseudo WWM/WQM ";
} else {
assert(Opc == AMDGPU::ENTER_STRICT_WQM || Opc == AMDGPU::EXIT_STRICT_WQM);
dbgs() << "Strict WQM ";
@@ -230,16 +226,14 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
}
if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM ||
- MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM ||
- MI.getOpcode() == AMDGPU::ENTER_PSEUDO_WM) {
+ MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM) {
LLVM_DEBUG(printWWMInfo(MI));
InWWM = true;
continue;
}
if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM ||
- MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM ||
- MI.getOpcode() == AMDGPU::EXIT_PSEUDO_WM) {
+ MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM) {
LLVM_DEBUG(printWWMInfo(MI));
InWWM = false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index 0d40816cdd4b..212edff09783 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -161,45 +161,6 @@ static const MCExpr *MaskShift(const MCExpr *Val, uint32_t Mask, uint32_t Shift,
return Val;
}
-uint64_t SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST) const {
- int64_t VBlocks, SBlocks;
- VGPRBlocks->evaluateAsAbsolute(VBlocks);
- SGPRBlocks->evaluateAsAbsolute(SBlocks);
-
- uint64_t Reg = S_00B848_VGPRS(static_cast<uint64_t>(VBlocks)) |
- S_00B848_SGPRS(static_cast<uint64_t>(SBlocks)) |
- getComputePGMRSrc1Reg(*this, ST);
-
- return Reg;
-}
-
-uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
- const GCNSubtarget &ST) const {
- if (AMDGPU::isCompute(CC)) {
- return getComputePGMRSrc1(ST);
- }
- int64_t VBlocks, SBlocks;
- VGPRBlocks->evaluateAsAbsolute(VBlocks);
- SGPRBlocks->evaluateAsAbsolute(SBlocks);
-
- return getPGMRSrc1Reg(*this, CC, ST) |
- S_00B848_VGPRS(static_cast<uint64_t>(VBlocks)) |
- S_00B848_SGPRS(static_cast<uint64_t>(SBlocks));
-}
-
-uint64_t SIProgramInfo::getComputePGMRSrc2() const {
- int64_t ScratchEn;
- ScratchEnable->evaluateAsAbsolute(ScratchEn);
- return ScratchEn | getComputePGMRSrc2Reg(*this);
-}
-
-uint64_t SIProgramInfo::getPGMRSrc2(CallingConv::ID CC) const {
- if (AMDGPU::isCompute(CC))
- return getComputePGMRSrc2();
-
- return 0;
-}
-
const MCExpr *SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST,
MCContext &Ctx) const {
uint64_t Reg = getComputePGMRSrc1Reg(*this, ST);
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index e66e5a194c8b..c358a2d9db10 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -98,16 +98,12 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
void reset(const MachineFunction &MF);
/// Compute the value of the ComputePGMRsrc1 register.
- uint64_t getComputePGMRSrc1(const GCNSubtarget &ST) const;
- uint64_t getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST) const;
const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST,
MCContext &Ctx) const;
const MCExpr *getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST,
MCContext &Ctx) const;
/// Compute the value of the ComputePGMRsrc2 register.
- uint64_t getComputePGMRSrc2() const;
- uint64_t getPGMRSrc2(CallingConv::ID CC) const;
const MCExpr *getComputePGMRSrc2(MCContext &Ctx) const;
const MCExpr *getPGMRSrc2(CallingConv::ID CC, MCContext &Ctx) const;
};
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 4b5f9bdd82b8..4c5e60c873bb 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3157,7 +3157,7 @@ MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
MachineInstr &Use,
MachineRegisterInfo &MRI,
LiveIntervals *LIS) const {
- auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
+ auto &MDT = LIS->getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
SlotIndex UseIdx = LIS->getInstructionIndex(Use);
SlotIndex DefIdx;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index caac7126068e..f1d9aec16363 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -586,7 +586,9 @@ class RegisterTypes<list<ValueType> reg_types> {
def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>;
-def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0]>;
+def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0, v4i16, v4f16, v4bf16]>;
+def Reg96Types : RegisterTypes<[v3i32, v3f32]>;
+def Reg128Types : RegisterTypes<[v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16]>;
let HasVGPR = 1 in {
// VOP3 and VINTERP can access 256 lo and 256 hi registers.
@@ -744,7 +746,7 @@ def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16,
let BaseClassOrder = 10000;
}
-def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16, v8bf16], 32,
+def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", Reg128Types.types, 32,
(add PRIVATE_RSRC_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
@@ -815,7 +817,7 @@ def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v
let HasSGPR = 1;
}
-def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16, v4bf16], 32,
+def SGPR_64 : SIRegisterClass<"AMDGPU", Reg64Types.types, 32,
(add SGPR_64Regs)> {
let CopyCost = 1;
let AllocationPriority = 1;
@@ -905,8 +907,8 @@ multiclass SRegClass<int numRegs,
}
}
-defm "" : SRegClass<3, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>;
-defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], SGPR_128Regs, TTMP_128Regs>;
+defm "" : SRegClass<3, Reg96Types.types, SGPR_96Regs, TTMP_96Regs>;
+defm "" : SRegClass<4, Reg128Types.types, SGPR_128Regs, TTMP_128Regs>;
defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
@@ -958,8 +960,8 @@ multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4bf16, v4i16, p0, p1, p4],
(add VGPR_64)>;
-defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
-defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], (add VGPR_128)>;
+defm VReg_96 : VRegClass<3, Reg96Types.types, (add VGPR_96)>;
+defm VReg_128 : VRegClass<4, Reg128Types.types, (add VGPR_128)>;
defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;
@@ -1342,6 +1344,7 @@ def AVLdSt_64 : AVLdStOperand<AV_64, "OPW64">;
def AVLdSt_96 : AVLdStOperand<AV_96, "OPW96">;
def AVLdSt_128 : AVLdStOperand<AV_128, "OPW128">;
def AVLdSt_160 : AVLdStOperand<AV_160, "OPW160">;
+def AVLdSt_1024 : AVLdStOperand<AV_1024, "OPW1024">;
//===----------------------------------------------------------------------===//
// ACSrc_* Operands with an AGPR or an inline constant
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 647fae904d39..79bcf5e8cd30 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -45,7 +45,6 @@ public:
bool isKImmOperand(const MachineOperand &Src) const;
bool isKUImmOperand(const MachineOperand &Src) const;
bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
- bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const;
void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
void shrinkScalarCompare(MachineInstr &MI) const;
void shrinkMIMG(MachineInstr &MI) const;
@@ -183,15 +182,36 @@ bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
return false;
}
-/// \returns true if the constant in \p Src should be replaced with a bitreverse
-/// of an inline immediate.
-bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src,
- int32_t &ReverseImm) const {
- if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
- return false;
+/// \returns the opcode of an instruction a move immediate of the constant \p
+/// Src can be replaced with if the constant is replaced with \p ModifiedImm.
+/// i.e.
+///
+/// If the bitreverse of a constant is an inline immediate, reverse the
+/// immediate and return the bitreverse opcode.
+///
+/// If the bitwise negation of a constant is an inline immediate, reverse the
+/// immediate and return the bitwise not opcode.
+static unsigned canModifyToInlineImmOp32(const SIInstrInfo *TII,
+ const MachineOperand &Src,
+ int32_t &ModifiedImm, bool Scalar) {
+ if (TII->isInlineConstant(Src))
+ return 0;
+ int32_t SrcImm = static_cast<int32_t>(Src.getImm());
+
+ if (!Scalar) {
+ // We could handle the scalar case with here, but we would need to check
+ // that SCC is not live as S_NOT_B32 clobbers it. It's probably not worth
+ // it, as the reasonable values are already covered by s_movk_i32.
+ ModifiedImm = ~SrcImm;
+ if (TII->isInlineConstant(APInt(32, ModifiedImm)))
+ return AMDGPU::V_NOT_B32_e32;
+ }
+
+ ModifiedImm = reverseBits<int32_t>(SrcImm);
+ if (TII->isInlineConstant(APInt(32, ModifiedImm)))
+ return Scalar ? AMDGPU::S_BREV_B32 : AMDGPU::V_BFREV_B32_e32;
- ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
- return ReverseImm >= -16 && ReverseImm <= 64;
+ return 0;
}
/// Copy implicit register operands from specified instruction to this
@@ -801,10 +821,12 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// XXX - not exactly a check for post-regalloc run.
MachineOperand &Src = MI.getOperand(1);
if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
- int32_t ReverseImm;
- if (isReverseInlineImm(Src, ReverseImm)) {
- MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
- Src.setImm(ReverseImm);
+ int32_t ModImm;
+ unsigned ModOpcode =
+ canModifyToInlineImmOp32(TII, Src, ModImm, /*Scalar=*/false);
+ if (ModOpcode != 0) {
+ MI.setDesc(TII->get(ModOpcode));
+ Src.setImm(static_cast<int64_t>(ModImm));
continue;
}
}
@@ -863,13 +885,15 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
MachineOperand &Src = MI.getOperand(1);
if (Src.isImm() && Dst.getReg().isPhysical()) {
- int32_t ReverseImm;
+ unsigned ModOpc;
+ int32_t ModImm;
if (isKImmOperand(Src)) {
MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
Src.setImm(SignExtend64(Src.getImm(), 32));
- } else if (isReverseInlineImm(Src, ReverseImm)) {
- MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
- Src.setImm(ReverseImm);
+ } else if ((ModOpc = canModifyToInlineImmOp32(TII, Src, ModImm,
+ /*Scalar=*/true))) {
+ MI.setDesc(TII->get(ModOpc));
+ Src.setImm(static_cast<int64_t>(ModImm));
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 913942dda19d..742fd397ff9e 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -215,8 +215,6 @@ private:
MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
bool IsWQM);
MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
- void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry,
- MachineInstr *Exit);
void lowerBlock(MachineBasicBlock &MBB);
void processBlock(MachineBasicBlock &MBB, bool IsEntry);
@@ -241,8 +239,8 @@ public:
AU.addRequired<LiveIntervals>();
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveIntervals>();
- AU.addPreserved<MachineDominatorTree>();
- AU.addPreserved<MachinePostDominatorTree>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -259,8 +257,8 @@ char SIWholeQuadMode::ID = 0;
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
false)
@@ -785,7 +783,7 @@ MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
if (MDT)
MDT->getBase().applyUpdates(DTUpdates);
if (PDT)
- PDT->getBase().applyUpdates(DTUpdates);
+ PDT->applyUpdates(DTUpdates);
// Link blocks
MachineInstr *MI =
@@ -1025,31 +1023,6 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
return NewTerm;
}
-// Convert a strict mode transition to a pseudo transition.
-// This still pre-allocates registers to prevent clobbering,
-// but avoids any EXEC mask changes.
-void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB,
- MachineInstr *Entry,
- MachineInstr *Exit) {
- assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM);
- assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM);
-
- Register SaveOrig = Entry->getOperand(0).getReg();
-
- MachineInstr *NewEntry =
- BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM));
- MachineInstr *NewExit =
- BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM));
-
- LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit);
- Exit->eraseFromParent();
-
- LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry);
- Entry->eraseFromParent();
-
- LIS->removeInterval(SaveOrig);
-}
-
// Replace (or supplement) instructions accessing live mask.
// This can only happen once all the live mask registers have been created
// and the execute state (WQM/StrictWWM/Exact) of instructions is known.
@@ -1066,12 +1039,9 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
SmallVector<MachineInstr *, 4> SplitPoints;
char State = BI.InitialState;
- MachineInstr *StrictEntry = nullptr;
for (MachineInstr &MI : llvm::make_early_inc_range(
llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
- char PreviousState = State;
-
if (StateTransition.count(&MI))
State = StateTransition[&MI];
@@ -1084,20 +1054,6 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
SplitPoint = lowerKillF32(MBB, MI);
break;
- case AMDGPU::ENTER_STRICT_WQM:
- StrictEntry = PreviousState == StateWQM ? &MI : nullptr;
- break;
- case AMDGPU::EXIT_STRICT_WQM:
- if (State == StateWQM && StrictEntry) {
- // Transition WQM -> StrictWQM -> WQM detected.
- lowerPseudoStrictMode(MBB, StrictEntry, &MI);
- }
- StrictEntry = nullptr;
- break;
- case AMDGPU::ENTER_STRICT_WWM:
- case AMDGPU::EXIT_STRICT_WWM:
- StrictEntry = nullptr;
- break;
default:
break;
}
@@ -1251,11 +1207,6 @@ void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
}
LIS->InsertMachineInstrInMaps(*MI);
StateTransition[MI] = StrictStateNeeded;
-
- // Mark block as needing lower so it will be checked for unnecessary transitions.
- auto BII = Blocks.find(&MBB);
- if (BII != Blocks.end())
- BII->second.NeedsLowering = true;
}
void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
@@ -1687,8 +1638,11 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();
- MDT = getAnalysisIfAvailable<MachineDominatorTree>();
- PDT = getAnalysisIfAvailable<MachinePostDominatorTree>();
+ auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
+ MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
+ auto *PDTWrapper =
+ getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
+ PDT = PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
if (ST->isWave32()) {
AndOpc = AMDGPU::S_AND_B32;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index aee518680a60..64f33199545a 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -215,6 +215,11 @@ let isMoveImm = 1 in {
} // End Uses = [SCC]
} // End isMoveImm = 1
+// Variant of S_MOV_B32 used for reading from volatile registers like
+// SRC_POPS_EXITING_WAVE_ID.
+let hasSideEffects = 1 in
+def S_MOV_B32_sideeffects : SOP1_32 <"s_mov_b32">;
+
let Defs = [SCC] in {
def S_NOT_B32 : SOP1_32 <"s_not_b32",
[(set i32:$sdst, (UniformUnaryFrag<not> i32:$src0))]
@@ -1196,11 +1201,15 @@ let SubtargetPredicate = isGFX9Plus in {
}
} // End SubtargetPredicate = isGFX9Plus
+def VersionImm : S16ImmOperand {
+ let DecoderMethod = "decodeVersionImm";
+}
+
let SubtargetPredicate = isGFX10Plus in {
def S_VERSION : SOPK_Pseudo<
"s_version",
(outs),
- (ins s16imm:$simm16),
+ (ins VersionImm:$simm16),
"$simm16"> {
let has_sdst = 0;
}
@@ -1876,6 +1885,12 @@ let SubtargetPredicate = isNotGFX9Plus in {
def : GetFPModePat<fpmode_mask_gfx6plus>;
}
+let SubtargetPredicate = isGFX9GFX10 in
+def : GCNPat<
+ (int_amdgcn_pops_exiting_wave_id),
+ (S_MOV_B32_sideeffects (i32 SRC_POPS_EXITING_WAVE_ID))
+>;
+
//===----------------------------------------------------------------------===//
// SOP2 Patterns
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 2e1db1665b9c..3af536dac473 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -669,5 +669,20 @@ const char* const IdSymbolic[] = {
} // namespace VGPRIndexMode
+namespace UCVersion {
+
+ArrayRef<GFXVersion> getGFXVersions() {
+ // GFX6, GFX8 and GFX9 don't support s_version and there are no
+ // UC_VERSION_GFX* codes for them.
+ static const GFXVersion Versions[] = {{"UC_VERSION_GFX7", 0},
+ {"UC_VERSION_GFX10", 4},
+ {"UC_VERSION_GFX11", 6},
+ {"UC_VERSION_GFX12", 9}};
+
+ return Versions;
+}
+
+} // namespace UCVersion
+
} // namespace AMDGPU
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
index 069134a7ae7f..c84c1a7dc18c 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -116,6 +116,17 @@ extern const char* const IdSymbolic[];
} // namespace VGPRIndexMode
+namespace UCVersion {
+
+struct GFXVersion {
+ StringLiteral Symbol;
+ unsigned Code;
+};
+
+ArrayRef<GFXVersion> getGFXVersions();
+
+} // namespace UCVersion
+
} // namespace AMDGPU
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 4b34fb27632a..9886235121d2 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -159,6 +159,12 @@ namespace llvm {
namespace AMDGPU {
+/// \returns true if the target supports signed immediate offset for SMRD
+/// instructions.
+bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) {
+ return isGFX9Plus(ST);
+}
+
/// \returns True if \p STI is AMDHSA.
bool isHsaAbi(const MCSubtargetInfo &STI) {
return STI.getTargetTriple().getOS() == Triple::AMDHSA;
@@ -373,10 +379,18 @@ struct VOPTrue16Info {
bool IsTrue16;
};
+struct SingleUseExceptionInfo {
+ uint16_t Opcode;
+ bool IsInvalidSingleUseConsumer;
+ bool IsInvalidSingleUseProducer;
+};
+
#define GET_MTBUFInfoTable_DECL
#define GET_MTBUFInfoTable_IMPL
#define GET_MUBUFInfoTable_DECL
#define GET_MUBUFInfoTable_IMPL
+#define GET_SingleUseExceptionTable_DECL
+#define GET_SingleUseExceptionTable_IMPL
#define GET_SMInfoTable_DECL
#define GET_SMInfoTable_IMPL
#define GET_VOP1InfoTable_DECL
@@ -582,9 +596,7 @@ bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) {
}
bool isGenericAtomic(unsigned Opc) {
- return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN ||
- Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX ||
- Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP ||
+ return Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP ||
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD ||
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB ||
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN ||
@@ -608,6 +620,16 @@ bool isTrue16Inst(unsigned Opc) {
return Info ? Info->IsTrue16 : false;
}
+bool isInvalidSingleUseConsumerInst(unsigned Opc) {
+ const SingleUseExceptionInfo *Info = getSingleUseExceptionHelper(Opc);
+ return Info && Info->IsInvalidSingleUseConsumer;
+}
+
+bool isInvalidSingleUseProducerInst(unsigned Opc) {
+ const SingleUseExceptionInfo *Info = getSingleUseExceptionHelper(Opc);
+ return Info && Info->IsInvalidSingleUseProducer;
+}
+
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {
const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc);
return Info ? Info->Opcode3Addr : ~0u;
@@ -2803,10 +2825,6 @@ static bool hasSMEMByteOffset(const MCSubtargetInfo &ST) {
return isGCN3Encoding(ST) || isGFX10Plus(ST);
}
-static bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) {
- return isGFX9Plus(ST);
-}
-
bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
int64_t EncodedOffset) {
if (isGFX12Plus(ST))
@@ -2841,7 +2859,14 @@ uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST,
}
std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST,
- int64_t ByteOffset, bool IsBuffer) {
+ int64_t ByteOffset, bool IsBuffer,
+ bool HasSOffset) {
+ // For unbuffered smem loads, it is illegal for the Immediate Offset to be
+ // negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
+ // Handle case where SOffset is not present.
+ if (!IsBuffer && !HasSOffset && ByteOffset < 0 && hasSMRDSignedImmOffset(ST))
+ return std::nullopt;
+
if (isGFX12Plus(ST)) // 24 bit signed offsets
return isInt<24>(ByteOffset) ? std::optional<int64_t>(ByteOffset)
: std::nullopt;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index cf8236b8e23b..af2f0bc1a630 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -360,6 +360,10 @@ struct EncodingField {
static ValueType decode(uint64_t Encoded) { return Encoded; }
};
+// Represents a single bit in an encoded value.
+template <unsigned Bit, unsigned D = 0>
+using EncodingBit = EncodingField<Bit, Bit, D>;
+
// A helper for encoding and decoding multiple fields.
template <typename... Fields> struct EncodingFields {
static constexpr uint64_t encode(Fields... Values) {
@@ -857,6 +861,12 @@ LLVM_READONLY
bool isTrue16Inst(unsigned Opc);
LLVM_READONLY
+bool isInvalidSingleUseConsumerInst(unsigned Opc);
+
+LLVM_READONLY
+bool isInvalidSingleUseProducerInst(unsigned Opc);
+
+LLVM_READONLY
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc);
LLVM_READONLY
@@ -1297,6 +1307,7 @@ bool hasVOPD(const MCSubtargetInfo &STI);
bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI);
int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR);
unsigned hasKernargPreload(const MCSubtargetInfo &STI);
+bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST);
/// Is Reg - scalar register
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
@@ -1469,7 +1480,8 @@ uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset);
/// S_LOAD instructions have a signed offset, on other subtargets it is
/// unsigned. S_BUFFER has an unsigned offset for all subtargets.
std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST,
- int64_t ByteOffset, bool IsBuffer);
+ int64_t ByteOffset, bool IsBuffer,
+ bool HasSOffset = false);
/// \return The encoding that can be used for a 32-bit literal offset in an SMRD
/// instruction. This is only useful on CI.s
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.cpp
new file mode 100644
index 000000000000..a4f4a9ed5da4
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.cpp
@@ -0,0 +1,61 @@
+//===- AMDGPUDelayedMCExpr.cpp - Delayed MCExpr resolve ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUDelayedMCExpr.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+
+using namespace llvm;
+
+static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type,
+ MCValue Val) {
+ msgpack::Document *Doc = DN.getDocument();
+ switch (Type) {
+ default:
+ return Doc->getEmptyNode();
+ case msgpack::Type::Int:
+ return Doc->getNode(static_cast<int64_t>(Val.getConstant()));
+ case msgpack::Type::UInt:
+ return Doc->getNode(static_cast<uint64_t>(Val.getConstant()));
+ case msgpack::Type::Boolean:
+ return Doc->getNode(static_cast<bool>(Val.getConstant()));
+ }
+}
+
+void DelayedMCExprs::assignDocNode(msgpack::DocNode &DN, msgpack::Type Type,
+ const MCExpr *ExprValue) {
+ MCValue Res;
+ if (ExprValue->evaluateAsRelocatable(Res, nullptr, nullptr)) {
+ if (Res.isAbsolute()) {
+ DN = getNode(DN, Type, Res);
+ return;
+ }
+ }
+
+ DelayedExprs.push_back(Expr{DN, Type, ExprValue});
+}
+
+bool DelayedMCExprs::resolveDelayedExpressions() {
+ while (!DelayedExprs.empty()) {
+ Expr DE = DelayedExprs.front();
+ MCValue Res;
+
+ if (!DE.ExprValue->evaluateAsRelocatable(Res, nullptr, nullptr) ||
+ !Res.isAbsolute())
+ return false;
+
+ DelayedExprs.pop_front();
+ DE.DN = getNode(DE.DN, DE.Type, Res);
+ }
+
+ return true;
+}
+
+void DelayedMCExprs::clear() { DelayedExprs.clear(); }
+
+bool DelayedMCExprs::empty() { return DelayedExprs.empty(); }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.h
new file mode 100644
index 000000000000..8c9cda3a1bdd
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.h
@@ -0,0 +1,39 @@
+//===- AMDGPUDelayedMCExpr.h - Delayed MCExpr resolve -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUDELAYEDMCEXPR_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUDELAYEDMCEXPR_H
+
+#include "llvm/BinaryFormat/MsgPackDocument.h"
+#include <deque>
+
+namespace llvm {
+class MCExpr;
+
+class DelayedMCExprs {
+ struct Expr {
+ msgpack::DocNode &DN;
+ msgpack::Type Type;
+ const MCExpr *ExprValue;
+ Expr(msgpack::DocNode &DN, msgpack::Type Type, const MCExpr *ExprValue)
+ : DN(DN), Type(Type), ExprValue(ExprValue) {}
+ };
+
+ std::deque<Expr> DelayedExprs;
+
+public:
+ bool resolveDelayedExpressions();
+ void assignDocNode(msgpack::DocNode &DN, msgpack::Type Type,
+ const MCExpr *ExprValue);
+ void clear();
+ bool empty();
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUDELAYEDMCEXPR_H
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index 0fa67c559cb2..a53bf70d7771 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -20,6 +20,7 @@
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Module.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/EndianStream.h"
@@ -137,12 +138,22 @@ void AMDGPUPALMetadata::setRsrc1(CallingConv::ID CC, unsigned Val) {
setRegister(getRsrc1Reg(CC), Val);
}
+void AMDGPUPALMetadata::setRsrc1(CallingConv::ID CC, const MCExpr *Val,
+ MCContext &Ctx) {
+ setRegister(getRsrc1Reg(CC), Val, Ctx);
+}
+
// Set the rsrc2 register in the metadata for a particular shader stage.
// In fact this ORs the value into any previous setting of the register.
void AMDGPUPALMetadata::setRsrc2(CallingConv::ID CC, unsigned Val) {
setRegister(getRsrc1Reg(CC) + 1, Val);
}
+void AMDGPUPALMetadata::setRsrc2(CallingConv::ID CC, const MCExpr *Val,
+ MCContext &Ctx) {
+ setRegister(getRsrc1Reg(CC) + 1, Val, Ctx);
+}
+
// Set the SPI_PS_INPUT_ENA register in the metadata.
// In fact this ORs the value into any previous setting of the register.
void AMDGPUPALMetadata::setSpiPsInputEna(unsigned Val) {
@@ -182,6 +193,40 @@ void AMDGPUPALMetadata::setRegister(unsigned Reg, unsigned Val) {
N = N.getDocument()->getNode(Val);
}
+// Set a register in the metadata.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setRegister(unsigned Reg, const MCExpr *Val,
+ MCContext &Ctx) {
+ if (!isLegacy()) {
+ // In the new MsgPack format, ignore register numbered >= 0x10000000. It
+ // is a PAL ABI pseudo-register in the old non-MsgPack format.
+ if (Reg >= 0x10000000)
+ return;
+ }
+ auto &N = getRegisters()[MsgPackDoc.getNode(Reg)];
+ auto ExprIt = REM.find(Reg);
+
+ if (ExprIt != REM.end()) {
+ Val = MCBinaryExpr::createOr(Val, ExprIt->getSecond(), Ctx);
+ // This conditional may be redundant most of the time, but the alternate
+ // setRegister(unsigned, unsigned) could've been called while the
+ // conditional returns true (i.e., Reg exists in REM).
+ if (N.getKind() == msgpack::Type::UInt) {
+ const MCExpr *NExpr = MCConstantExpr::create(N.getUInt(), Ctx);
+ Val = MCBinaryExpr::createOr(Val, NExpr, Ctx);
+ }
+ ExprIt->getSecond() = Val;
+ } else if (N.getKind() == msgpack::Type::UInt) {
+ const MCExpr *NExpr = MCConstantExpr::create(N.getUInt(), Ctx);
+ Val = MCBinaryExpr::createOr(Val, NExpr, Ctx);
+ int64_t Unused;
+ if (!Val->evaluateAsAbsolute(Unused))
+ REM[Reg] = Val;
+ (void)Unused;
+ }
+ DelayedExprs.assignDocNode(N, msgpack::Type::UInt, Val);
+}
+
// Set the entry point name for one shader.
void AMDGPUPALMetadata::setEntryPoint(unsigned CC, StringRef Name) {
if (isLegacy())
@@ -207,11 +252,29 @@ void AMDGPUPALMetadata::setNumUsedVgprs(CallingConv::ID CC, unsigned Val) {
getHwStage(CC)[".vgpr_count"] = MsgPackDoc.getNode(Val);
}
+void AMDGPUPALMetadata::setNumUsedVgprs(CallingConv::ID CC, const MCExpr *Val,
+ MCContext &Ctx) {
+ if (isLegacy()) {
+ // Old non-msgpack format.
+ unsigned NumUsedVgprsKey = getScratchSizeKey(CC) +
+ PALMD::Key::VS_NUM_USED_VGPRS -
+ PALMD::Key::VS_SCRATCH_SIZE;
+ setRegister(NumUsedVgprsKey, Val, Ctx);
+ return;
+ }
+ // Msgpack format.
+ setHwStage(CC, ".vgpr_count", msgpack::Type::UInt, Val);
+}
+
// Set the number of used agprs in the metadata.
void AMDGPUPALMetadata::setNumUsedAgprs(CallingConv::ID CC, unsigned Val) {
getHwStage(CC)[".agpr_count"] = Val;
}
+void AMDGPUPALMetadata::setNumUsedAgprs(unsigned CC, const MCExpr *Val) {
+ setHwStage(CC, ".agpr_count", msgpack::Type::UInt, Val);
+}
+
// Set the number of used sgprs in the metadata. This is an optional advisory
// record for logging etc; wave dispatch actually uses the rsrc1 register for
// the shader stage to determine the number of sgprs to allocate.
@@ -228,6 +291,20 @@ void AMDGPUPALMetadata::setNumUsedSgprs(CallingConv::ID CC, unsigned Val) {
getHwStage(CC)[".sgpr_count"] = MsgPackDoc.getNode(Val);
}
+void AMDGPUPALMetadata::setNumUsedSgprs(unsigned CC, const MCExpr *Val,
+ MCContext &Ctx) {
+ if (isLegacy()) {
+ // Old non-msgpack format.
+ unsigned NumUsedSgprsKey = getScratchSizeKey(CC) +
+ PALMD::Key::VS_NUM_USED_SGPRS -
+ PALMD::Key::VS_SCRATCH_SIZE;
+ setRegister(NumUsedSgprsKey, Val, Ctx);
+ return;
+ }
+ // Msgpack format.
+ setHwStage(CC, ".sgpr_count", msgpack::Type::UInt, Val);
+}
+
// Set the scratch size in the metadata.
void AMDGPUPALMetadata::setScratchSize(CallingConv::ID CC, unsigned Val) {
if (isLegacy()) {
@@ -239,6 +316,17 @@ void AMDGPUPALMetadata::setScratchSize(CallingConv::ID CC, unsigned Val) {
getHwStage(CC)[".scratch_memory_size"] = MsgPackDoc.getNode(Val);
}
+void AMDGPUPALMetadata::setScratchSize(unsigned CC, const MCExpr *Val,
+ MCContext &Ctx) {
+ if (isLegacy()) {
+ // Old non-msgpack format.
+ setRegister(getScratchSizeKey(CC), Val, Ctx);
+ return;
+ }
+ // Msgpack format.
+ setHwStage(CC, ".scratch_memory_size", msgpack::Type::UInt, Val);
+}
+
// Set the stack frame size of a function in the metadata.
void AMDGPUPALMetadata::setFunctionScratchSize(StringRef FnName, unsigned Val) {
auto Node = getShaderFunction(FnName);
@@ -259,6 +347,12 @@ void AMDGPUPALMetadata::setFunctionNumUsedVgprs(StringRef FnName,
Node[".vgpr_count"] = MsgPackDoc.getNode(Val);
}
+void AMDGPUPALMetadata::setFunctionNumUsedVgprs(StringRef FnName,
+ const MCExpr *Val) {
+ auto Node = getShaderFunction(FnName);
+ DelayedExprs.assignDocNode(Node[".vgpr_count"], msgpack::Type::UInt, Val);
+}
+
// Set the number of used vgprs in the metadata.
void AMDGPUPALMetadata::setFunctionNumUsedSgprs(StringRef FnName,
unsigned Val) {
@@ -266,6 +360,12 @@ void AMDGPUPALMetadata::setFunctionNumUsedSgprs(StringRef FnName,
Node[".sgpr_count"] = MsgPackDoc.getNode(Val);
}
+void AMDGPUPALMetadata::setFunctionNumUsedSgprs(StringRef FnName,
+ const MCExpr *Val) {
+ auto Node = getShaderFunction(FnName);
+ DelayedExprs.assignDocNode(Node[".sgpr_count"], msgpack::Type::UInt, Val);
+}
+
// Set the hardware register bit in PAL metadata to enable wave32 on the
// shader of the given calling convention.
void AMDGPUPALMetadata::setWave32(unsigned CC) {
@@ -662,6 +762,7 @@ void AMDGPUPALMetadata::toString(std::string &String) {
String.clear();
if (!BlobType)
return;
+ ResolvedAll = DelayedExprs.resolveDelayedExpressions();
raw_string_ostream Stream(String);
if (isLegacy()) {
if (MsgPackDoc.getRoot().getKind() == msgpack::Type::Nil)
@@ -711,6 +812,7 @@ void AMDGPUPALMetadata::toString(std::string &String) {
// a .note record of the specified AMD type. Returns an empty blob if
// there is no PAL metadata,
void AMDGPUPALMetadata::toBlob(unsigned Type, std::string &Blob) {
+ ResolvedAll = DelayedExprs.resolveDelayedExpressions();
if (Type == ELF::NT_AMD_PAL_METADATA)
toLegacyBlob(Blob);
else if (Type)
@@ -906,11 +1008,17 @@ void AMDGPUPALMetadata::setLegacy() {
// Erase all PAL metadata.
void AMDGPUPALMetadata::reset() {
MsgPackDoc.clear();
+ REM.clear();
+ DelayedExprs.clear();
Registers = MsgPackDoc.getEmptyNode();
HwStages = MsgPackDoc.getEmptyNode();
ShaderFunctions = MsgPackDoc.getEmptyNode();
}
+bool AMDGPUPALMetadata::resolvedAllMCExpr() {
+ return ResolvedAll && DelayedExprs.empty();
+}
+
unsigned AMDGPUPALMetadata::getPALVersion(unsigned idx) {
assert(idx < 2 &&
"illegal index to PAL version - should be 0 (major) or 1 (minor)");
@@ -942,6 +1050,11 @@ void AMDGPUPALMetadata::setHwStage(unsigned CC, StringRef field, bool Val) {
getHwStage(CC)[field] = Val;
}
+void AMDGPUPALMetadata::setHwStage(unsigned CC, StringRef field,
+ msgpack::Type Type, const MCExpr *Val) {
+ DelayedExprs.assignDocNode(getHwStage(CC)[field], Type, Val);
+}
+
void AMDGPUPALMetadata::setComputeRegisters(StringRef field, unsigned Val) {
getComputeRegisters()[field] = Val;
}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
index 158f766d0485..e05532afed2f 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -13,7 +13,10 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H
+#include "AMDGPUDelayedMCExpr.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/BinaryFormat/MsgPackDocument.h"
+#include "llvm/MC/MCContext.h"
namespace llvm {
@@ -21,6 +24,10 @@ class Module;
class StringRef;
class AMDGPUPALMetadata {
+public:
+ using RegisterExprMap = DenseMap<unsigned, const MCExpr *>;
+
+private:
unsigned BlobType = 0;
msgpack::Document MsgPackDoc;
msgpack::DocNode Registers;
@@ -32,6 +39,10 @@ class AMDGPUPALMetadata {
msgpack::DocNode ComputeRegisters;
msgpack::DocNode GraphicsRegisters;
+ DelayedMCExprs DelayedExprs;
+ RegisterExprMap REM;
+ bool ResolvedAll = true;
+
public:
// Read the amdgpu.pal.metadata supplied by the frontend, ready for
// per-function modification.
@@ -45,10 +56,12 @@ public:
// Set the rsrc1 register in the metadata for a particular shader stage.
// In fact this ORs the value into any previous setting of the register.
void setRsrc1(unsigned CC, unsigned Val);
+ void setRsrc1(unsigned CC, const MCExpr *Val, MCContext &Ctx);
// Set the rsrc2 register in the metadata for a particular shader stage.
// In fact this ORs the value into any previous setting of the register.
void setRsrc2(unsigned CC, unsigned Val);
+ void setRsrc2(unsigned CC, const MCExpr *Val, MCContext &Ctx);
// Set the SPI_PS_INPUT_ENA register in the metadata.
// In fact this ORs the value into any previous setting of the register.
@@ -64,6 +77,7 @@ public:
// Set a register in the metadata.
// In fact this ORs the value into any previous setting of the register.
void setRegister(unsigned Reg, unsigned Val);
+ void setRegister(unsigned Reg, const MCExpr *Val, MCContext &Ctx);
// Set the entry point name for one shader.
void setEntryPoint(unsigned CC, StringRef Name);
@@ -72,18 +86,22 @@ public:
// record for logging etc; wave dispatch actually uses the rsrc1 register for
// the shader stage to determine the number of vgprs to allocate.
void setNumUsedVgprs(unsigned CC, unsigned Val);
+ void setNumUsedVgprs(unsigned CC, const MCExpr *Val, MCContext &Ctx);
// Set the number of used agprs in the metadata. This is an optional advisory
// record for logging etc;
void setNumUsedAgprs(unsigned CC, unsigned Val);
+ void setNumUsedAgprs(unsigned CC, const MCExpr *Val);
// Set the number of used sgprs in the metadata. This is an optional advisory
// record for logging etc; wave dispatch actually uses the rsrc1 register for
// the shader stage to determine the number of sgprs to allocate.
void setNumUsedSgprs(unsigned CC, unsigned Val);
+ void setNumUsedSgprs(unsigned CC, const MCExpr *Val, MCContext &Ctx);
// Set the scratch size in the metadata.
void setScratchSize(unsigned CC, unsigned Val);
+ void setScratchSize(unsigned CC, const MCExpr *Val, MCContext &Ctx);
// Set the stack frame size of a function in the metadata.
void setFunctionScratchSize(StringRef FnName, unsigned Val);
@@ -97,11 +115,13 @@ public:
// record for logging etc; wave dispatch actually uses the rsrc1 register for
// the shader stage to determine the number of vgprs to allocate.
void setFunctionNumUsedVgprs(StringRef FnName, unsigned Val);
+ void setFunctionNumUsedVgprs(StringRef FnName, const MCExpr *Val);
// Set the number of used sgprs in the metadata. This is an optional advisory
// record for logging etc; wave dispatch actually uses the rsrc1 register for
// the shader stage to determine the number of sgprs to allocate.
void setFunctionNumUsedSgprs(StringRef FnName, unsigned Val);
+ void setFunctionNumUsedSgprs(StringRef FnName, const MCExpr *Val);
// Set the hardware register bit in PAL metadata to enable wave32 on the
// shader of the given calling convention.
@@ -138,6 +158,8 @@ public:
void setHwStage(unsigned CC, StringRef field, unsigned Val);
void setHwStage(unsigned CC, StringRef field, bool Val);
+ void setHwStage(unsigned CC, StringRef field, msgpack::Type Type,
+ const MCExpr *Val);
void setComputeRegisters(StringRef field, unsigned Val);
void setComputeRegisters(StringRef field, bool Val);
@@ -156,6 +178,8 @@ public:
// Erase all PAL metadata.
void reset();
+ bool resolvedAllMCExpr();
+
private:
// Return whether the blob type is legacy PAL metadata.
bool isLegacy() const;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
index eaee1a2a9739..720d5a1853db 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
@@ -14,6 +14,7 @@
#include "AMDKernelCodeT.h"
#include "SIDefines.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/SIDefinesUtils.h"
#include "llvm/ADT/IndexedMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCContext.h"
@@ -220,43 +221,6 @@ static int get_amd_kernel_code_t_FieldIndex(StringRef name) {
return map.lookup(name) - 1; // returns -1 if not found
}
-static constexpr std::pair<unsigned, unsigned> getShiftMask(unsigned Value) {
- unsigned Shift = 0;
- unsigned Mask = 0;
-
- Mask = ~Value;
- for (; !(Mask & 1); Shift++, Mask >>= 1) {
- }
-
- return std::make_pair(Shift, Mask);
-}
-
-static const MCExpr *MaskShiftSet(const MCExpr *Val, uint32_t Mask,
- uint32_t Shift, MCContext &Ctx) {
- if (Mask) {
- const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx);
- Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx);
- }
- if (Shift) {
- const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx);
- Val = MCBinaryExpr::createShl(Val, ShiftExpr, Ctx);
- }
- return Val;
-}
-
-static const MCExpr *MaskShiftGet(const MCExpr *Val, uint32_t Mask,
- uint32_t Shift, MCContext &Ctx) {
- if (Shift) {
- const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx);
- Val = MCBinaryExpr::createLShr(Val, ShiftExpr, Ctx);
- }
- if (Mask) {
- const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx);
- Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx);
- }
- return Val;
-}
-
class PrintField {
public:
template <typename T, T AMDGPUMCKernelCodeT::*ptr,
@@ -305,10 +269,10 @@ static ArrayRef<PrintFx> getPrinterTable() {
const MCExpr *Value; \
if (PGMType == 0) { \
Value = \
- MaskShiftGet(C.compute_pgm_resource1_registers, Mask, Shift, Ctx); \
+ maskShiftGet(C.compute_pgm_resource1_registers, Mask, Shift, Ctx); \
} else { \
Value = \
- MaskShiftGet(C.compute_pgm_resource2_registers, Mask, Shift, Ctx); \
+ maskShiftGet(C.compute_pgm_resource2_registers, Mask, Shift, Ctx); \
} \
int64_t Val; \
if (Value->evaluateAsAbsolute(Val)) \
@@ -392,7 +356,7 @@ static ArrayRef<ParseFx> getParserTable() {
if (!parseExpr(MCParser, Value, Err)) \
return false; \
auto [Shift, Mask] = getShiftMask(Complement); \
- Value = MaskShiftSet(Value, Mask, Shift, Ctx); \
+ Value = maskShiftSet(Value, Mask, Shift, Ctx); \
const MCExpr *Compl = MCConstantExpr::create(Complement, Ctx); \
if (PGMType == 0) { \
C.compute_pgm_resource1_registers = MCBinaryExpr::createAnd( \
@@ -542,7 +506,7 @@ void AMDGPUMCKernelCodeT::EmitKernelCodeT(MCStreamer &OS, MCContext &Ctx) {
const MCExpr *CodeProps = MCConstantExpr::create(code_properties, Ctx);
CodeProps = MCBinaryExpr::createOr(
CodeProps,
- MaskShiftSet(is_dynamic_callstack,
+ maskShiftSet(is_dynamic_callstack,
(1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1,
AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT, Ctx),
Ctx);
diff --git a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
index 2f4ce8eaf1d6..09b8da9f5dd4 100644
--- a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -1,6 +1,7 @@
add_llvm_component_library(LLVMAMDGPUUtils
AMDGPUAsmUtils.cpp
AMDGPUBaseInfo.cpp
+ AMDGPUDelayedMCExpr.cpp
AMDGPUMemoryUtils.cpp
AMDGPUPALMetadata.cpp
AMDKernelCodeTUtils.cpp
diff --git a/llvm/lib/Target/AMDGPU/Utils/SIDefinesUtils.h b/llvm/lib/Target/AMDGPU/Utils/SIDefinesUtils.h
new file mode 100644
index 000000000000..64d21de12c26
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/SIDefinesUtils.h
@@ -0,0 +1,79 @@
+//===-- SIDefines.h - SI Helper Functions -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+/// \file - utility functions for the SIDefines and its common uses.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_SIDEFINESUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_SIDEFINESUTILS_H
+
+#include "llvm/MC/MCExpr.h"
+#include <utility>
+
+namespace llvm {
+class MCContext;
+namespace AMDGPU {
+
+/// Deduce the least significant bit aligned shift and mask values for a binary
+/// Complement \p Value (as they're defined in SIDefines.h as C_*) as a returned
+/// pair<shift, mask>. That is to say \p Value == ~(mask << shift)
+///
+/// For example, given C_00B848_FWD_PROGRESS (i.e., 0x7FFFFFFF) from
+/// SIDefines.h, this will return the pair as (31,1).
+constexpr std::pair<unsigned, unsigned> getShiftMask(unsigned Value) {
+ unsigned Shift = 0;
+ unsigned Mask = 0;
+
+ Mask = ~Value;
+ for (; !(Mask & 1); Shift++, Mask >>= 1) {
+ }
+
+ return std::make_pair(Shift, Mask);
+}
+
+/// Provided with the MCExpr * \p Val, uint32 \p Mask and \p Shift, will return
+/// the masked and left shifted, in said order of operations, MCExpr * created
+/// within the MCContext \p Ctx.
+///
+/// For example, given MCExpr *Val, Mask == 0xf, Shift == 6 the returned MCExpr
+/// * will be the equivalent of (Val & 0xf) << 6
+inline const MCExpr *maskShiftSet(const MCExpr *Val, uint32_t Mask,
+ uint32_t Shift, MCContext &Ctx) {
+ if (Mask) {
+ const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx);
+ Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx);
+ }
+ if (Shift) {
+ const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx);
+ Val = MCBinaryExpr::createShl(Val, ShiftExpr, Ctx);
+ }
+ return Val;
+}
+
+/// Provided with the MCExpr * \p Val, uint32 \p Mask and \p Shift, will return
+/// the right shifted and masked, in said order of operations, MCExpr * created
+/// within the MCContext \p Ctx.
+///
+/// For example, given MCExpr *Val, Mask == 0xf, Shift == 6 the returned MCExpr
+/// * will be the equivalent of (Val >> 6) & 0xf
+inline const MCExpr *maskShiftGet(const MCExpr *Val, uint32_t Mask,
+ uint32_t Shift, MCContext &Ctx) {
+ if (Shift) {
+ const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx);
+ Val = MCBinaryExpr::createLShr(Val, ShiftExpr, Ctx);
+ }
+ if (Mask) {
+ const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx);
+ Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx);
+ }
+ return Val;
+}
+
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_SIDEFINESUTILS_H
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index b96c41c1e12a..2c0d61ee4afa 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -112,7 +112,7 @@ class getVOP1Pat <SDPatternOperator node, VOPProfile P> : LetDummies {
!if(P.HasOMod,
[(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OMods P.Src0VT:$src0,
i1:$clamp, i32:$omod))))],
- [(set P.DstVT:$vdst, (node P.Src0RC32:$src0))]
+ [(set P.DstVT:$vdst, (node (P.Src0VT P.Src0RC32:$src0)))]
)
);
}
@@ -249,9 +249,15 @@ def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> {
// FIXME: Specify SchedRW for READFIRSTLANE_B32
// TODO: There is VOP3 encoding also
def V_READFIRSTLANE_B32 : VOP1_Pseudo <"v_readfirstlane_b32", VOP_READFIRSTLANE,
- getVOP1Pat<int_amdgcn_readfirstlane,
- VOP_READFIRSTLANE>.ret, 1> {
+ [], 1> {
let isConvergent = 1;
+ let IsInvalidSingleUseConsumer = 1;
+}
+
+foreach vt = Reg32Types.types in {
+ def : GCNPat<(vt (int_amdgcn_readfirstlane (vt VRegOrLdsSrc_32:$src0))),
+ (V_READFIRSTLANE_B32 (vt VRegOrLdsSrc_32:$src0))
+ >;
}
let isReMaterializable = 1 in {
@@ -362,6 +368,7 @@ defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT<VOP_NONE>>;
def VOP_MOVRELS : VOPProfile<[i32, i32, untyped, untyped]> {
let Src0RC32 = VRegSrc_32;
let Src0RC64 = VRegSrc_32;
+ let IsInvalidSingleUseConsumer = 1;
}
// Special case because there are no true output operands. Hack vdst
@@ -405,8 +412,12 @@ class VOP_MOVREL<RegisterOperand Src1RC> : VOPProfile<[untyped, i32, untyped, un
let EmitDst = 1; // force vdst emission
}
-def VOP_MOVRELD : VOP_MOVREL<VSrc_b32>;
-def VOP_MOVRELSD : VOP_MOVREL<VRegSrc_32>;
+let IsInvalidSingleUseProducer = 1 in {
+ def VOP_MOVRELD : VOP_MOVREL<VSrc_b32>;
+ def VOP_MOVRELSD : VOP_MOVREL<VRegSrc_32> {
+ let IsInvalidSingleUseConsumer = 1;
+ }
+}
let SubtargetPredicate = HasMovrel, Uses = [M0, EXEC] in {
// v_movreld_b32 is a special case because the destination output
@@ -535,6 +546,7 @@ let SubtargetPredicate = isGFX9Plus in {
let Constraints = "$vdst = $src1, $vdst1 = $src0";
let DisableEncoding = "$vdst1,$src1";
let SchedRW = [Write64Bit, Write64Bit];
+ let IsInvalidSingleUseConsumer = 1;
}
let isReMaterializable = 1 in
@@ -699,6 +711,8 @@ let SubtargetPredicate = isGFX10Plus in {
let Constraints = "$vdst = $src1, $vdst1 = $src0";
let DisableEncoding = "$vdst1,$src1";
let SchedRW = [Write64Bit, Write64Bit];
+ let IsInvalidSingleUseConsumer = 1;
+ let IsInvalidSingleUseProducer = 1;
}
} // End Uses = [M0]
} // End SubtargetPredicate = isGFX10Plus
@@ -718,15 +732,22 @@ def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1
let SubtargetPredicate = isGFX11Plus in {
// Restrict src0 to be VGPR
def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
- getVOP1Pat<int_amdgcn_permlane64,
- VOP_MOVRELS>.ret,
- /*VOP1Only=*/ 1>;
+ [], /*VOP1Only=*/ 1> {
+ let IsInvalidSingleUseConsumer = 1;
+ let IsInvalidSingleUseProducer = 1;
+ }
defm V_MOV_B16_t16 : VOP1Inst<"v_mov_b16_t16", VOPProfile_True16<VOP_I16_I16>>;
defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>;
defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>;
defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>;
} // End SubtargetPredicate = isGFX11Plus
+foreach vt = Reg32Types.types in {
+ def : GCNPat<(int_amdgcn_permlane64 (vt VRegSrc_32:$src0)),
+ (vt (V_PERMLANE64_B32 (vt VRegSrc_32:$src0)))
+ >;
+}
+
//===----------------------------------------------------------------------===//
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index ccb5b33dbdc4..9989752c2f6b 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -779,15 +779,25 @@ defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag,
} // End isCommutable = 1
// These are special and do not read the exec mask.
-let isConvergent = 1, Uses = []<Register> in {
-def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
- [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))]>;
+let isConvergent = 1, Uses = []<Register>, IsInvalidSingleUseConsumer = 1 in {
+def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, []>;
let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
-def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
- [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))]>;
+def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, []> {
+ let IsInvalidSingleUseProducer = 1;
+ }
} // End IsNeverUniform, $vdst = $vdst_in, DisableEncoding $vdst_in
} // End isConvergent = 1
+foreach vt = Reg32Types.types in {
+ def : GCNPat<(vt (int_amdgcn_readlane vt:$src0, i32:$src1)),
+ (V_READLANE_B32 VRegOrLdsSrc_32:$src0, SCSrc_b32:$src1)
+ >;
+
+ def : GCNPat<(vt (int_amdgcn_writelane vt:$src0, i32:$src1, vt:$src2)),
+ (V_WRITELANE_B32 SCSrc_b32:$src0, SCSrc_b32:$src1, VGPR_32:$src2)
+ >;
+}
+
let isReMaterializable = 1 in {
defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>;
defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32, add_ctpop>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 479c0aaf0174..efa8e9c74d44 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -13,9 +13,11 @@ def VOP_F32_F32_F32_F32_VCC : VOPProfile<[f32, f32, f32, f32]> {
let Outs64 = (outs DstRC.RegClass:$vdst);
let HasExtVOP3DPP = 0;
let HasExtDPP = 0;
+ let IsSingle = 1;
}
def VOP_F64_F64_F64_F64_VCC : VOPProfile<[f64, f64, f64, f64]> {
let Outs64 = (outs DstRC.RegClass:$vdst);
+ let IsSingle = 1;
}
}
@@ -105,7 +107,7 @@ class getInterp16Ins <bit HasSrc2, bit HasOMod,
}
class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> {
-
+ let IsSingle = 1;
let HasOMod = !ne(DstVT.Value, f16.Value);
let HasHigh = 1;
@@ -155,12 +157,12 @@ defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_l
} // End SubtargetPredicate = isNotGFX12Plus
} // End SchedRW = [WriteDoubleAdd]
-let SchedRW = [WriteIntMul] in {
+let SchedRW = [WriteIntMul], IsInvalidSingleUseConsumer = 1 in {
defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", V_MUL_PROF<VOP_I32_I32_I32>, DivergentBinFrag<mul>>;
defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", V_MUL_PROF<VOP_I32_I32_I32>, mulhu>;
defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", V_MUL_PROF<VOP_I32_I32_I32>>;
defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", V_MUL_PROF<VOP_I32_I32_I32>, mulhs>;
-} // End SchedRW = [WriteIntMul]
+} // End SchedRW = [WriteIntMul], IsInvalidSingleUseConsumer = 1
let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
defm V_MINIMUM_F32 : VOP3Inst <"v_minimum_f32", VOP3_Profile<VOP_F32_F32_F32>, DivergentBinFrag<fminimum>>;
@@ -258,9 +260,9 @@ let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it d
let isReMaterializable = 1 in
defm V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-let Constraints = "@earlyclobber $vdst" in {
+let Constraints = "@earlyclobber $vdst", IsInvalidSingleUseConsumer = 1 in {
defm V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
-} // End Constraints = "@earlyclobber $vdst"
+} // End Constraints = "@earlyclobber $vdst", IsInvalidSingleUseConsumer = 1
let isReMaterializable = 1 in {
@@ -275,14 +277,16 @@ let SchedRW = [Write64Bit] in {
defm V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, csra_64>;
} // End SubtargetPredicate = isGFX6GFX7
+ let IsInvalidSingleUseConsumer = 1 in {
let SubtargetPredicate = isGFX8Plus in {
defm V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>, clshr_rev_64>;
defm V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, cashr_rev_64>;
- } // End SubtargetPredicate = isGFX8Plus
+ } // End SubtargetPredicate = isGFX8Plus, , IsInvalidSingleUseConsumer = 1
let SubtargetPredicate = isGFX8GFX9GFX10GFX11 in {
defm V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, clshl_rev_64>;
} // End SubtargetPredicate = isGFX8GFX9GFX10GFX11
+ } // End IsInvalidSingleUseConsumer = 1
} // End SchedRW = [Write64Bit]
} // End isReMaterializable = 1
@@ -307,14 +311,14 @@ def VOPProfileMQSAD : VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP> {
let HasModifiers = 0;
}
-let SubtargetPredicate = isGFX7Plus in {
+let SubtargetPredicate = isGFX7Plus, IsInvalidSingleUseConsumer = 1 in {
let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in {
defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOPProfileMQSAD>;
} // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32]
-} // End SubtargetPredicate = isGFX7Plus
+} // End SubtargetPredicate = isGFX7Plus, IsInvalidSingleUseConsumer = 1
-let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in {
+let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU], IsInvalidSingleUseConsumer = 1 in {
let SubtargetPredicate = isGFX7Plus, OtherPredicates = [HasNotMADIntraFwdBug] in {
defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
@@ -324,7 +328,7 @@ let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in {
defm V_MAD_U64_U32_gfx11 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
defm V_MAD_I64_I32_gfx11 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
}
-} // End isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU]
+} // End isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU], IsInvalidSingleUseConsumer = 1
let FPDPRounding = 1 in {
@@ -838,9 +842,9 @@ def gi_opsel_i1timm : GICustomOperandRenderer<"renderOpSelTImm">,
GISDNodeXFormEquiv<opsel_i1timm>;
class PermlanePat<SDPatternOperator permlane,
- Instruction inst> : GCNPat<
- (permlane i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2,
- timm:$fi, timm:$bc),
+ Instruction inst, ValueType vt> : GCNPat<
+ (vt (permlane vt:$vdst_in, vt:$src0, i32:$src1, i32:$src2,
+ timm:$fi, timm:$bc)),
(inst (opsel_i1timm $fi), VGPR_32:$src0, (opsel_i1timm $bc),
SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in)
>;
@@ -859,13 +863,15 @@ let SubtargetPredicate = isGFX10Plus in {
} // End isCommutable = 1, isReMaterializable = 1
def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32_e64>;
- let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+ let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in", IsInvalidSingleUseConsumer = 1, IsInvalidSingleUseProducer = 1 in {
defm V_PERMLANE16_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>;
defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
- } // End $vdst = $vdst_in, DisableEncoding $vdst_in
+ } // End $vdst = $vdst_in, DisableEncoding $vdst_in, IsInvalidSingleUseConsumer = 1, IsInvalidSingleUseProducer = 1
- def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64>;
- def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64>;
+ foreach vt = Reg32Types.types in {
+ def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64, vt>;
+ def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64, vt>;
+ }
defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, add>;
defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, sub>;
@@ -1275,11 +1281,12 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
}
} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
-defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>;
-
-let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in {
- defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx10<0x361>;
-} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in)
+let IsInvalidSingleUseConsumer = 1 in {
+ defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>;
+ let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in), IsInvalidSingleUseProducer = 1 in {
+ defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx10<0x361>;
+ } // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32: $src1, VGPR_32:$vdst_in), IsInvalidSingleUseProducer = 1
+} // End IsInvalidSingleUseConsumer = 1
let SubtargetPredicate = isGFX10Before1030 in {
defm V_MUL_LO_I32 : VOP3_Real_gfx10<0x16b>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 4c78bd94458d..4cab15435199 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -90,7 +90,7 @@ multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> {
let isReMaterializable = 1 in {
let isCommutable = 1 in {
defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
-defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
+defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>, imad>;
let FPDPRounding = 1 in {
defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>;
@@ -382,15 +382,19 @@ defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
AMDGPUfdot2, 1/*ExplicitClamp*/>;
let OtherPredicates = [HasDot7Insts] in {
-defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
- VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
+let IsInvalidSingleUseConsumer = 1 in {
+ defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
+ VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
+}
defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4",
VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
} // End OtherPredicates = [HasDot7Insts]
let OtherPredicates = [HasDot1Insts] in {
-defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8",
- VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
+let IsInvalidSingleUseConsumer = 1 in {
+ defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8",
+ VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
+}
defm V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4",
VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>;
} // End OtherPredicates = [HasDot1Insts]
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 372c4f533629..3bcee28a2cb7 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -435,8 +435,10 @@ multiclass VOPC_I16 <string opName, SDPatternOperator cond = COND_NULL,
multiclass VOPC_I32 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
VOPC_Pseudos <opName, VOPC_I1_I32_I32, cond, revOp, 0>;
-multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
- VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
+let IsInvalidSingleUseConsumer = 1 in {
+ multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
+ VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
+}
multiclass VOPCX_F16<string opName, string revOp = opName> {
let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
@@ -465,8 +467,10 @@ multiclass VOPCX_I16<string opName, string revOp = opName> {
multiclass VOPCX_I32 <string opName, string revOp = opName> :
VOPCX_Pseudos <opName, VOPC_I1_I32_I32, VOPC_I32_I32, COND_NULL, revOp>;
-multiclass VOPCX_I64 <string opName, string revOp = opName> :
- VOPCX_Pseudos <opName, VOPC_I1_I64_I64, VOPC_I64_I64, COND_NULL, revOp>;
+let IsInvalidSingleUseConsumer = 1 in {
+ multiclass VOPCX_I64 <string opName, string revOp = opName> :
+ VOPCX_Pseudos <opName, VOPC_I1_I64_I64, VOPC_I64_I64, COND_NULL, revOp>;
+}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 5d1573d8dec1..2b05165cc94b 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -17,6 +17,8 @@ class LetDummies {
bit isReMaterializable;
bit isAsCheapAsAMove;
bit FPDPRounding;
+ bit IsInvalidSingleUseConsumer;
+ bit IsInvalidSingleUseProducer;
Predicate SubtargetPredicate;
string Constraints;
string DisableEncoding;
@@ -81,6 +83,8 @@ class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins,
string Mnemonic = opName;
Instruction Opcode = !cast<Instruction>(NAME);
bit IsTrue16 = P.IsTrue16;
+ bit IsInvalidSingleUseConsumer = P.IsInvalidSingleUseConsumer;
+ bit IsInvalidSingleUseProducer = P.IsInvalidSingleUseProducer;
VOPProfile Pfl = P;
string AsmOperands;
@@ -175,6 +179,8 @@ class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
class VOP_Real<VOP_Pseudo ps> {
Instruction Opcode = !cast<Instruction>(NAME);
bit IsSingle = ps.Pfl.IsSingle;
+ bit IsInvalidSingleUseConsumer = ps.Pfl.IsInvalidSingleUseConsumer;
+ bit IsInvalidSingleUseProducer = ps.Pfl.IsInvalidSingleUseProducer;
}
class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemonic> :
@@ -823,17 +829,11 @@ class VOP3P_DPPe_Common<bits<7> op, VOPProfile P> : VOP3P_DPPe_Common_Base<op, P
class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
dag Ins = P.InsDPP, string asmOps = P.AsmDPP> :
- InstSI <P.OutsDPP, Ins, OpName#asmOps, pattern>,
- VOP <OpName>,
- SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE> {
-
- let isPseudo = 1;
- let isCodeGenOnly = 1;
+ VOP_Pseudo<OpName, "_dpp", P, P.OutsDPP, Ins, asmOps, pattern> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let UseNamedOperandTable = 1;
let VALU = 1;
let DPP = 1;
@@ -846,7 +846,6 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
let isConvergent = 1;
- string Mnemonic = OpName;
string AsmOperands = asmOps;
let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", "");
@@ -857,7 +856,8 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
let DecoderNamespace = "GFX8";
- VOPProfile Pfl = P;
+ let IsInvalidSingleUseConsumer = !not(VINTERP);
+ let IsInvalidSingleUseProducer = !not(VINTERP);
}
class VOP3_DPP_Pseudo <string OpName, VOPProfile P> :
@@ -1725,3 +1725,12 @@ def VOPTrue16Table : GenericTable {
let PrimaryKey = ["Opcode"];
let PrimaryKeyName = "getTrue16OpcodeHelper";
}
+
+def SingleUseExceptionTable : GenericTable {
+ let FilterClass = "VOP_Pseudo";
+ let CppTypeName = "SingleUseExceptionInfo";
+ let Fields = ["Opcode", "IsInvalidSingleUseConsumer", "IsInvalidSingleUseProducer"];
+
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "getSingleUseExceptionHelper";
+}
diff --git a/llvm/lib/Target/ARC/ARCBranchFinalize.cpp b/llvm/lib/Target/ARC/ARCBranchFinalize.cpp
index 0e3e4d34aa6a..9d616e103f17 100644
--- a/llvm/lib/Target/ARC/ARCBranchFinalize.cpp
+++ b/llvm/lib/Target/ARC/ARCBranchFinalize.cpp
@@ -61,7 +61,7 @@ char ARCBranchFinalize::ID = 0;
INITIALIZE_PASS_BEGIN(ARCBranchFinalize, "arc-branch-finalize",
"ARC finalize branches", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(ARCBranchFinalize, "arc-branch-finalize",
"ARC finalize branches", false, false)
diff --git a/llvm/lib/Target/ARC/ARCOptAddrMode.cpp b/llvm/lib/Target/ARC/ARCOptAddrMode.cpp
index e7a0b352db8d..36f811c0aa00 100644
--- a/llvm/lib/Target/ARC/ARCOptAddrMode.cpp
+++ b/llvm/lib/Target/ARC/ARCOptAddrMode.cpp
@@ -60,8 +60,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -119,7 +119,7 @@ private:
char ARCOptAddrMode::ID = 0;
INITIALIZE_PASS_BEGIN(ARCOptAddrMode, OPTADDRMODE_NAME, OPTADDRMODE_DESC, false,
false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(ARCOptAddrMode, OPTADDRMODE_NAME, OPTADDRMODE_DESC, false,
false)
@@ -508,7 +508,7 @@ bool ARCOptAddrMode::runOnMachineFunction(MachineFunction &MF) {
AST = &MF.getSubtarget<ARCSubtarget>();
AII = AST->getInstrInfo();
MRI = &MF.getRegInfo();
- MDT = &getAnalysis<MachineDominatorTree>();
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
bool Changed = false;
for (auto &MBB : MF)
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 627148b73c4f..e81e6bb69758 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -6060,6 +6060,8 @@ ARMBaseInstrInfo::getOutliningCandidateInfo(
RepeatedSequenceLocs.size() * Costs.CallDefault) {
RepeatedSequenceLocs = CandidatesWithoutStackFixups;
FrameID = MachineOutlinerNoLRSave;
+ if (RepeatedSequenceLocs.size() < 2)
+ return std::nullopt;
} else
SetCandidateCallInfo(MachineOutlinerDefault, Costs.CallDefault);
}
diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
index 9579053943f9..90f5c6c40b49 100644
--- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -229,7 +229,7 @@ namespace {
bool runOnMachineFunction(MachineFunction &MF) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -399,7 +399,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
isPositionIndependentOrROPI =
STI->getTargetLowering()->isPositionIndependent() || STI->isROPI();
AFI = MF->getInfo<ARMFunctionInfo>();
- DT = &getAnalysis<MachineDominatorTree>();
+ DT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
isThumb = AFI->isThumbFunction();
isThumb1 = AFI->isThumb1OnlyFunction();
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index e3270471981c..4a7da3bf9744 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -156,6 +156,17 @@ static const MCPhysReg GPRArgRegs[] = {
ARM::R0, ARM::R1, ARM::R2, ARM::R3
};
+static SDValue handleCMSEValue(const SDValue &Value, const ISD::InputArg &Arg,
+ SelectionDAG &DAG, const SDLoc &DL) {
+ assert(Arg.ArgVT.isScalarInteger());
+ assert(Arg.ArgVT.bitsLT(MVT::i32));
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, Arg.ArgVT, Value);
+ SDValue Ext =
+ DAG.getNode(Arg.Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
+ MVT::i32, Trunc);
+ return Ext;
+}
+
void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
if (VT != PromotedLdStVT) {
setOperationAction(ISD::LOAD, VT, Promote);
@@ -365,6 +376,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::FSQRT, VT, Expand);
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FTAN, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FLOG, VT, Expand);
setOperationAction(ISD::FLOG2, VT, Expand);
@@ -875,6 +887,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
+ setOperationAction(ISD::FTAN, MVT::v2f64, Expand);
setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
@@ -897,6 +910,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
+ setOperationAction(ISD::FTAN, MVT::v4f32, Expand);
setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
@@ -914,6 +928,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
+ setOperationAction(ISD::FTAN, MVT::v2f32, Expand);
setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
@@ -1540,6 +1555,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
setOperationAction(ISD::FSIN, MVT::f16, Promote);
setOperationAction(ISD::FCOS, MVT::f16, Promote);
+ setOperationAction(ISD::FTAN, MVT::f16, Promote);
setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
setOperationAction(ISD::FPOWI, MVT::f16, Promote);
setOperationAction(ISD::FPOW, MVT::f16, Promote);
@@ -1578,6 +1594,20 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
}
}
+ // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
+ // it, but it's just a wrapper around ldexp.
+ if (Subtarget->isTargetWindows()) {
+ for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
+ if (isOperationExpand(Op, MVT::f32))
+ setOperationAction(Op, MVT::f32, Promote);
+ }
+
+ // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
+ // isn't legal.
+ for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
+ if (isOperationExpand(Op, MVT::f16))
+ setOperationAction(Op, MVT::f16, Promote);
+
// We have target-specific dag combine patterns for the following nodes:
// ARMISD::VMOVRRD - No need to call setTargetDAGCombine
setTargetDAGCombine(
@@ -1710,7 +1740,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(ARMISD::BCC_i64)
MAKE_CASE(ARMISD::FMSTAT)
MAKE_CASE(ARMISD::CMOV)
- MAKE_CASE(ARMISD::SUBS)
MAKE_CASE(ARMISD::SSAT)
MAKE_CASE(ARMISD::USAT)
MAKE_CASE(ARMISD::ASRL)
@@ -2189,7 +2218,7 @@ SDValue ARMTargetLowering::LowerCallResult(
SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
- SDValue ThisVal) const {
+ SDValue ThisVal, bool isCmseNSCall) const {
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
@@ -2267,6 +2296,15 @@ SDValue ARMTargetLowering::LowerCallResult(
(VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
+ // On CMSE Non-secure Calls, call results (returned values) whose bitwidth
+ // is less than 32 bits must be sign- or zero-extended after the call for
+ // security reasons. Although the ABI mandates an extension done by the
+ // callee, the latter cannot be trusted to follow the rules of the ABI.
+ const ISD::InputArg &Arg = Ins[VA.getValNo()];
+ if (isCmseNSCall && Arg.ArgVT.isScalarInteger() &&
+ VA.getLocVT().isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
+ Val = handleCMSEValue(Val, Arg, DAG, dl);
+
InVals.push_back(Val);
}
@@ -2878,7 +2916,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// return.
return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
InVals, isThisReturn,
- isThisReturn ? OutVals[0] : SDValue());
+ isThisReturn ? OutVals[0] : SDValue(), isCmseNSCall);
}
/// HandleByVal - Every parameter *after* a byval parameter is passed
@@ -4481,8 +4519,6 @@ SDValue ARMTargetLowering::LowerFormalArguments(
*DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
- SmallVector<SDValue, 16> ArgValues;
- SDValue ArgValue;
Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
unsigned CurArgIdx = 0;
@@ -4537,6 +4573,7 @@ SDValue ARMTargetLowering::LowerFormalArguments(
// Arguments stored in registers.
if (VA.isRegLoc()) {
EVT RegVT = VA.getLocVT();
+ SDValue ArgValue;
if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
// f64 and vector types are split up into multiple registers or
@@ -4600,16 +4637,6 @@ SDValue ARMTargetLowering::LowerFormalArguments(
case CCValAssign::BCvt:
ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
break;
- case CCValAssign::SExt:
- ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
- DAG.getValueType(VA.getValVT()));
- ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
- break;
- case CCValAssign::ZExt:
- ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
- DAG.getValueType(VA.getValVT()));
- ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
- break;
}
// f16 arguments have their size extended to 4 bytes and passed as if they
@@ -4619,6 +4646,15 @@ SDValue ARMTargetLowering::LowerFormalArguments(
(VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
+ // On CMSE Entry Functions, formal integer arguments whose bitwidth is
+ // less than 32 bits must be sign- or zero-extended in the callee for
+ // security reasons. Although the ABI mandates an extension done by the
+ // caller, the latter cannot be trusted to follow the rules of the ABI.
+ const ISD::InputArg &Arg = Ins[VA.getValNo()];
+ if (AFI->isCmseNSEntryFunction() && Arg.ArgVT.isScalarInteger() &&
+ RegVT.isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
+ ArgValue = handleCMSEValue(ArgValue, Arg, DAG, dl);
+
InVals.push_back(ArgValue);
} else { // VA.isRegLoc()
// Only arguments passed on the stack should make it here.
@@ -13709,7 +13745,7 @@ static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
// t2: i64 = build_pair t1, t1:1
// t3: i64 = add t2, y
// Otherwise we try to push the add up above VADDLVAx, to potentially allow
- // the add to be simplified seperately.
+ // the add to be simplified separately.
// We also need to check for sext / zext and commutitive adds.
auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
SDValue NB) {
@@ -18470,9 +18506,9 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
} else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
(!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
// This seems pointless but will allow us to combine it further below.
- // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
+ // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
SDValue Sub =
- DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
+ DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
Sub.getValue(1), SDValue());
Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
@@ -18484,9 +18520,9 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
(!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
// This seems pointless but will allow us to combine it further below
// Note that we change == for != as this is the dual for the case above.
- // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
+ // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
SDValue Sub =
- DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
+ DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
Sub.getValue(1), SDValue());
Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
@@ -18498,21 +18534,21 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
// On Thumb1, the DAG above may be further combined if z is a power of 2
// (z == 2 ^ K).
- // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 ->
+ // CMOV (SUBC x, y), z, !=, (SUBC x, y):1 ->
// t1 = (USUBO (SUB x, y), 1)
// t2 = (USUBO_CARRY (SUB x, y), t1:0, t1:1)
// Result = if K != 0 then (SHL t2:0, K) else t2:0
//
// This also handles the special case of comparing against zero; it's
- // essentially, the same pattern, except there's no SUBS:
+ // essentially, the same pattern, except there's no SUBC:
// CMOV x, z, !=, (CMPZ x, 0) ->
// t1 = (USUBO x, 1)
// t2 = (USUBO_CARRY x, t1:0, t1:1)
// Result = if K != 0 then (SHL t2:0, K) else t2:0
const APInt *TrueConst;
if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
- ((FalseVal.getOpcode() == ARMISD::SUBS &&
- FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) ||
+ ((FalseVal.getOpcode() == ARMISD::SUBC && FalseVal.getOperand(0) == LHS &&
+ FalseVal.getOperand(1) == RHS) ||
(FalseVal == LHS && isNullConstant(RHS))) &&
(TrueConst = isPowerOf2Constant(TrueVal))) {
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index ed4df7edd16e..a255e9b6fc36 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -95,7 +95,6 @@ class VectorType;
FMSTAT, // ARM fmstat instruction.
CMOV, // ARM conditional move instructions.
- SUBS, // Flag-setting subtraction.
SSAT, // Signed saturation
USAT, // Unsigned saturation
@@ -244,7 +243,7 @@ class VectorType;
VADDLVAps, // Same as VADDLVp[su] but with a v4i1 predicate mask
VADDLVApu,
VMLAVs, // sign- or zero-extend the elements of two vectors to i32, multiply
- VMLAVu, // them and add the results together, returning an i32 of their sum
+ VMLAVu, // them and add the results together, returning an i32 of the sum
VMLAVps, // Same as VMLAV[su] with a v4i1 predicate mask
VMLAVpu,
VMLALVs, // Same as VMLAV but with i64, returning the low and
@@ -895,7 +894,7 @@ class VectorType;
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
- SDValue ThisVal) const;
+ SDValue ThisVal, bool isCmseNSCall) const;
bool supportSplitCSR(MachineFunction *MF) const override {
return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 1f7bd8dd3121..c6bcad8e2a82 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -160,7 +160,6 @@ def ARMintretglue : SDNode<"ARMISD::INTRET_GLUE", SDT_ARMcall,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
[SDNPInGlue]>;
-def ARMsubs : SDNode<"ARMISD::SUBS", SDTIntBinOp, [SDNPOutGlue]>;
def ARMssat : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>;
@@ -3879,14 +3878,6 @@ let isAdd = 1 in
defm ADDS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMaddc, 1>;
defm SUBS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMsubc>;
-def : ARMPat<(ARMsubs GPR:$Rn, mod_imm:$imm), (SUBSri $Rn, mod_imm:$imm)>;
-def : ARMPat<(ARMsubs GPR:$Rn, GPR:$Rm), (SUBSrr $Rn, $Rm)>;
-def : ARMPat<(ARMsubs GPR:$Rn, so_reg_imm:$shift),
- (SUBSrsi $Rn, so_reg_imm:$shift)>;
-def : ARMPat<(ARMsubs GPR:$Rn, so_reg_reg:$shift),
- (SUBSrsr $Rn, so_reg_reg:$shift)>;
-
-
let isAdd = 1 in
defm ADC : AI1_adde_sube_irs<0b0101, "adc", ARMadde, 1>;
defm SBC : AI1_adde_sube_irs<0b0110, "sbc", ARMsube>;
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb.td b/llvm/lib/Target/ARM/ARMInstrThumb.td
index e7f405993513..2ad78f8cd8d4 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -1400,12 +1400,6 @@ let hasPostISelHook = 1, Defs = [CPSR] in {
Sched<[WriteALU]>;
}
-
-def : T1Pat<(ARMsubs tGPR:$Rn, tGPR:$Rm), (tSUBSrr $Rn, $Rm)>;
-def : T1Pat<(ARMsubs tGPR:$Rn, imm0_7:$imm3), (tSUBSi3 $Rn, imm0_7:$imm3)>;
-def : T1Pat<(ARMsubs tGPR:$Rn, imm0_255:$imm8), (tSUBSi8 $Rn, imm0_255:$imm8)>;
-
-
// Sign-extend byte
def tSXTB : // A8.6.222
T1pIMiscEncode<{0,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td
index f227d68deeb8..e133dbeba365 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -2438,12 +2438,6 @@ defm t2SUB : T2I_bin_ii12rs<0b101, "sub", sub>;
defm t2ADDS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi, ARMaddc, 1>;
defm t2SUBS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi, ARMsubc>;
-def : T2Pat<(ARMsubs GPRnopc:$Rn, t2_so_imm:$imm),
- (t2SUBSri $Rn, t2_so_imm:$imm)>;
-def : T2Pat<(ARMsubs GPRnopc:$Rn, rGPR:$Rm), (t2SUBSrr $Rn, $Rm)>;
-def : T2Pat<(ARMsubs GPRnopc:$Rn, t2_so_reg:$ShiftedRm),
- (t2SUBSrs $Rn, t2_so_reg:$ShiftedRm)>;
-
defm t2ADC : T2I_adde_sube_irs<0b1010, "adc", ARMadde, 1, 1>;
defm t2SBC : T2I_adde_sube_irs<0b1011, "sbc", ARMsube, 0, 1>;
diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
index 00a29f8ecb23..660f351bae64 100644
--- a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -157,8 +157,10 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
.legalForCartesianProduct({s32, s64}, {s32});
- getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).legalFor({s32});
+ getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV, G_GET_FPMODE})
+ .legalFor({s32});
getActionDefinitionsBuilder(G_RESET_FPENV).alwaysLegal();
+ getActionDefinitionsBuilder(G_SET_FPMODE).customFor({s32});
} else {
getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV})
.libcallFor({s32, s64});
@@ -187,6 +189,8 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV, G_RESET_FPENV})
.libcall();
+ getActionDefinitionsBuilder({G_GET_FPMODE, G_SET_FPMODE, G_RESET_FPMODE})
+ .libcall();
}
// Just expand whatever loads and stores are left.
@@ -439,6 +443,21 @@ bool ARMLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI,
*ConstantInt::get(Ctx, AsInteger));
break;
}
+ case G_SET_FPMODE: {
+ // New FPSCR = (FPSCR & FPStatusBits) | (Modes & ~FPStatusBits)
+ LLT FPEnvTy = LLT::scalar(32);
+ auto FPEnv = MRI.createGenericVirtualRegister(FPEnvTy);
+ Register Modes = MI.getOperand(0).getReg();
+ MIRBuilder.buildGetFPEnv(FPEnv);
+ auto StatusBitMask = MIRBuilder.buildConstant(FPEnvTy, ARM::FPStatusBits);
+ auto StatusBits = MIRBuilder.buildAnd(FPEnvTy, FPEnv, StatusBitMask);
+ auto NotStatusBitMask =
+ MIRBuilder.buildConstant(FPEnvTy, ~ARM::FPStatusBits);
+ auto FPModeBits = MIRBuilder.buildAnd(FPEnvTy, Modes, NotStatusBitMask);
+ auto NewFPSCR = MIRBuilder.buildOr(FPEnvTy, StatusBits, FPModeBits);
+ MIRBuilder.buildSetFPEnv(NewFPSCR);
+ break;
+ }
}
MI.eraseFromParent();
diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 4a5b672f862b..e5e817f1ed9a 100644
--- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -2161,8 +2161,8 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -2186,7 +2186,7 @@ char ARMPreAllocLoadStoreOpt::ID = 0;
INITIALIZE_PASS_BEGIN(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
@@ -2204,7 +2204,7 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
TII = STI->getInstrInfo();
TRI = STI->getRegisterInfo();
MRI = &Fn.getRegInfo();
- DT = &getAnalysis<MachineDominatorTree>();
+ DT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
MF = &Fn;
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
diff --git a/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp b/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
index a6fdece10ba4..9234881c9407 100644
--- a/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -466,6 +466,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case G_GET_FPENV:
case G_SET_FPENV:
+ case G_GET_FPMODE:
OperandsMapping =
getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr});
break;
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
index a8c6cd99633f..b66a41d06062 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -68,9 +68,6 @@ static cl::opt<bool>
ForceFastISel("arm-force-fast-isel",
cl::init(false), cl::Hidden);
-static cl::opt<bool> EnableSubRegLiveness("arm-enable-subreg-liveness",
- cl::init(false), cl::Hidden);
-
/// initializeSubtargetDependencies - Initializes using a CPU and feature string
/// so that we can use initializer lists for subtarget initialization.
ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU,
@@ -385,8 +382,6 @@ bool ARMSubtarget::enableMachineScheduler() const {
}
bool ARMSubtarget::enableSubRegLiveness() const {
- if (EnableSubRegLiveness.getNumOccurrences())
- return EnableSubRegLiveness;
// Enable SubRegLiveness for MVE to better optimize s subregs for mqpr regs
// and q subregs for qqqqpr regs.
return hasMVEIntegerOps();
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 7db2e8ee7e6f..4dc4d28724ef 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -199,17 +199,21 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
PatternMatch::m_Value(ArgArg)))) {
return IC.replaceInstUsesWith(II, ArgArg);
}
- if (!II.getMetadata(LLVMContext::MD_range)) {
- Type *IntTy32 = Type::getInt32Ty(II.getContext());
- Metadata *M[] = {
- ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
- ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
- II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
- II.setMetadata(LLVMContext::MD_noundef,
- MDNode::get(II.getContext(), std::nullopt));
- return &II;
+
+ if (II.getMetadata(LLVMContext::MD_range))
+ break;
+
+ ConstantRange Range(APInt(32, 0), APInt(32, 0x10000));
+
+ if (auto CurrentRange = II.getRange()) {
+ Range = Range.intersectWith(*CurrentRange);
+ if (Range == CurrentRange)
+ break;
}
- break;
+
+ II.addRangeRetAttr(Range);
+ II.addRetAttr(Attribute::NoUndef);
+ return &II;
}
case Intrinsic::arm_mve_vadc:
case Intrinsic::arm_mve_vadc_predicated: {
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index de7449a400a7..50a59ce76763 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -328,8 +328,9 @@ void ARMELFObjectWriter::addTargetSectionFlags(MCContext &Ctx,
// execute-only section in the object.
MCSectionELF *TextSection =
static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection());
- if (Sec.getKind().isExecuteOnly() && !TextSection->hasInstructions()) {
- for (auto &F : TextSection->getFragmentList())
+ bool IsExecOnly = Sec.getFlags() & ELF::SHF_ARM_PURECODE;
+ if (IsExecOnly && !TextSection->hasInstructions()) {
+ for (auto &F : *TextSection)
if (auto *DF = dyn_cast<MCDataFragment>(&F))
if (!DF->getContents().empty())
return;
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index afd7dccbeca9..31b577b9c301 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -426,6 +426,8 @@ private:
// Reset state between object emissions
void reset() override;
+ void finish() override;
+
public:
ARMTargetELFStreamer(MCStreamer &S)
: ARMTargetStreamer(S), CurrentVendor("aeabi") {}
@@ -459,8 +461,6 @@ public:
~ARMELFStreamer() override = default;
- void finishImpl() override;
-
// ARM exception handling directives
void emitFnStart();
void emitFnEnd();
@@ -479,7 +479,7 @@ public:
MCObjectStreamer::emitFill(NumBytes, FillValue, Loc);
}
- void changeSection(MCSection *Section, const MCExpr *Subsection) override {
+ void changeSection(MCSection *Section, uint32_t Subsection) override {
LastMappingSymbols[getCurrentSection().first] = std::move(LastEMSInfo);
MCELFStreamer::changeSection(Section, Subsection);
auto LastMappingSymbol = LastMappingSymbols.find(Section);
@@ -487,7 +487,7 @@ public:
LastEMSInfo = std::move(LastMappingSymbol->second);
return;
}
- LastEMSInfo.reset(new ElfMappingSymbolInfo(SMLoc(), nullptr, 0));
+ LastEMSInfo.reset(new ElfMappingSymbolInfo);
}
/// This function is the one used to emit instruction data into the ELF
@@ -555,7 +555,7 @@ public:
if (!LastEMSInfo->hasInfo())
return;
ElfMappingSymbolInfo *EMS = LastEMSInfo.get();
- EmitMappingSymbol("$d", EMS->Loc, EMS->F, EMS->Offset);
+ emitMappingSymbol("$d", *EMS->F, EMS->Offset);
EMS->resetInfo();
}
@@ -625,17 +625,14 @@ private:
};
struct ElfMappingSymbolInfo {
- explicit ElfMappingSymbolInfo(SMLoc Loc, MCFragment *F, uint64_t O)
- : Loc(Loc), F(F), Offset(O), State(EMS_None) {}
void resetInfo() {
F = nullptr;
Offset = 0;
}
bool hasInfo() { return F != nullptr; }
- SMLoc Loc;
- MCFragment *F;
- uint64_t Offset;
- ElfMappingSymbol State;
+ MCDataFragment *F = nullptr;
+ uint64_t Offset = 0;
+ ElfMappingSymbol State = EMS_None;
};
void emitDataMappingSymbol() {
@@ -648,8 +645,7 @@ private:
auto *DF = dyn_cast_or_null<MCDataFragment>(getCurrentFragment());
if (!DF)
return;
- EMS->Loc = SMLoc();
- EMS->F = getCurrentFragment();
+ EMS->F = DF;
EMS->Offset = DF->getContents().size();
LastEMSInfo->State = EMS_Data;
return;
@@ -683,11 +679,10 @@ private:
Symbol->setBinding(ELF::STB_LOCAL);
}
- void EmitMappingSymbol(StringRef Name, SMLoc Loc, MCFragment *F,
- uint64_t Offset) {
+ void emitMappingSymbol(StringRef Name, MCDataFragment &F, uint64_t Offset) {
auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
Name + "." + Twine(MappingSymbolCounter++)));
- emitLabelAtPos(Symbol, Loc, F, Offset);
+ emitLabelAtPos(Symbol, SMLoc(), F, Offset);
Symbol->setType(ELF::STT_NOTYPE);
Symbol->setBinding(ELF::STB_LOCAL);
}
@@ -1118,12 +1113,9 @@ void ARMTargetELFStreamer::emitInst(uint32_t Inst, char Suffix) {
void ARMTargetELFStreamer::reset() { AttributeSection = nullptr; }
-void ARMELFStreamer::finishImpl() {
- MCTargetStreamer &TS = *getTargetStreamer();
- ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
- ATS.finishAttributeSection();
-
- MCELFStreamer::finishImpl();
+void ARMTargetELFStreamer::finish() {
+ ARMTargetStreamer::finish();
+ finishAttributeSection();
}
void ARMELFStreamer::reset() {
diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
index c9bbc41ac13b..4882e8533caf 100644
--- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
+++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
@@ -59,8 +59,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineLoopInfo>();
AU.addPreserved<MachineLoopInfo>();
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -93,7 +93,7 @@ INITIALIZE_PASS_BEGIN(MVETPAndVPTOptimisations, DEBUG_TYPE,
"ARM MVE TailPred and VPT Optimisations pass", false,
false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(MVETPAndVPTOptimisations, DEBUG_TYPE,
"ARM MVE TailPred and VPT Optimisations pass", false, false)
@@ -1065,7 +1065,8 @@ bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
MRI = &Fn.getRegInfo();
MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfo>();
- MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+ MachineDominatorTree *DT =
+ &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n"
<< "********** Function: " << Fn.getName() << '\n');
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
index 7732cb08e24d..8346b1c119ee 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -1346,11 +1346,11 @@ static void analyzeReturnValues(const SmallVectorImpl<ArgT> &Args,
ArrayRef<MCPhysReg> RegList8;
ArrayRef<MCPhysReg> RegList16;
if (Tiny) {
- RegList8 = ArrayRef(RegList8Tiny, std::size(RegList8Tiny));
- RegList16 = ArrayRef(RegList16Tiny, std::size(RegList16Tiny));
+ RegList8 = ArrayRef(RegList8Tiny);
+ RegList16 = ArrayRef(RegList16Tiny);
} else {
- RegList8 = ArrayRef(RegList8AVR, std::size(RegList8AVR));
- RegList16 = ArrayRef(RegList16AVR, std::size(RegList16AVR));
+ RegList8 = ArrayRef(RegList8AVR);
+ RegList16 = ArrayRef(RegList16AVR);
}
// GCC-ABI says that the size is rounded up to the next even number,
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td
index 88b1989ef917..6cfbf9c83dc3 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.td
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.td
@@ -761,28 +761,16 @@ let isBranch = 1, isTerminator = 1 in {
// Conditional skipping on GPR register bits, and
// conditional skipping on IO register bits.
let isBarrier = 1 in {
- def SBRCRrB : FRdB<0b10, (outs),
- (ins GPR8
- : $rd, i8imm
- : $b),
- "sbrc\t$rd, $b", []>;
+ def SBRCRrB : FRdB<0b10, (outs), (ins GPR8:$rd, i8imm:$b), "sbrc\t$rd, $b",
+ []>;
- def SBRSRrB : FRdB<0b11, (outs),
- (ins GPR8
- : $rd, i8imm
- : $b),
- "sbrs\t$rd, $b", []>;
-
- def SBICAb : FIOBIT<0b01, (outs),
- (ins imm_port5
- : $addr, i8imm
- : $b),
+ def SBRSRrB : FRdB<0b11, (outs), (ins GPR8:$rd, i8imm:$b), "sbrs\t$rd, $b",
+ []>;
+
+ def SBICAb : FIOBIT<0b01, (outs), (ins imm_port5:$addr, i8imm:$b),
"sbic\t$addr, $b", []>;
- def SBISAb : FIOBIT<0b11, (outs),
- (ins imm_port5
- : $addr, i8imm
- : $b),
+ def SBISAb : FIOBIT<0b11, (outs), (ins imm_port5:$addr, i8imm:$b),
"sbis\t$addr, $b", []>;
}
@@ -790,18 +778,12 @@ let isBranch = 1, isTerminator = 1 in {
let Uses = [SREG] in {
// BRBS s, k
// Branch if `s` flag in status register is set.
- def BRBSsk : FSK<0, (outs),
- (ins i8imm
- : $s, relbrtarget_7
- : $k),
+ def BRBSsk : FSK<0, (outs), (ins i8imm:$s, relbrtarget_7:$k),
"brbs\t$s, $k", []>;
// BRBC s, k
// Branch if `s` flag in status register is clear.
- def BRBCsk : FSK<1, (outs),
- (ins i8imm
- : $s, relbrtarget_7
- : $k),
+ def BRBCsk : FSK<1, (outs), (ins i8imm:$s, relbrtarget_7:$k),
"brbc\t$s, $k", []>;
}
}
@@ -852,53 +834,29 @@ def : InstAlias<"brid\t$k", (BRBCsk 7, relbrtarget_7 : $k)>;
// Based on status register. We cannot simplify these into instruction aliases
// because we also need to be able to specify a pattern to match for ISel.
let isBranch = 1, isTerminator = 1, Uses = [SREG] in {
- def BREQk : FBRsk<0, 0b001, (outs),
- (ins relbrtarget_7
- : $k),
- "breq\t$k", [(AVRbrcond bb
- : $k, AVR_COND_EQ)]>;
-
- def BRNEk : FBRsk<1, 0b001, (outs),
- (ins relbrtarget_7
- : $k),
- "brne\t$k", [(AVRbrcond bb
- : $k, AVR_COND_NE)]>;
-
- def BRSHk : FBRsk<1, 0b000, (outs),
- (ins relbrtarget_7
- : $k),
- "brsh\t$k", [(AVRbrcond bb
- : $k, AVR_COND_SH)]>;
-
- def BRLOk : FBRsk<0, 0b000, (outs),
- (ins relbrtarget_7
- : $k),
- "brlo\t$k", [(AVRbrcond bb
- : $k, AVR_COND_LO)]>;
-
- def BRMIk : FBRsk<0, 0b010, (outs),
- (ins relbrtarget_7
- : $k),
- "brmi\t$k", [(AVRbrcond bb
- : $k, AVR_COND_MI)]>;
-
- def BRPLk : FBRsk<1, 0b010, (outs),
- (ins relbrtarget_7
- : $k),
- "brpl\t$k", [(AVRbrcond bb
- : $k, AVR_COND_PL)]>;
-
- def BRGEk : FBRsk<1, 0b100, (outs),
- (ins relbrtarget_7
- : $k),
- "brge\t$k", [(AVRbrcond bb
- : $k, AVR_COND_GE)]>;
-
- def BRLTk : FBRsk<0, 0b100, (outs),
- (ins relbrtarget_7
- : $k),
- "brlt\t$k", [(AVRbrcond bb
- : $k, AVR_COND_LT)]>;
+ def BREQk : FBRsk<0, 0b001, (outs), (ins relbrtarget_7:$k), "breq\t$k",
+ [(AVRbrcond bb:$k, AVR_COND_EQ)]>;
+
+ def BRNEk : FBRsk<1, 0b001, (outs), (ins relbrtarget_7:$k), "brne\t$k",
+ [(AVRbrcond bb:$k, AVR_COND_NE)]>;
+
+ def BRSHk : FBRsk<1, 0b000, (outs), (ins relbrtarget_7:$k), "brsh\t$k",
+ [(AVRbrcond bb:$k, AVR_COND_SH)]>;
+
+ def BRLOk : FBRsk<0, 0b000, (outs), (ins relbrtarget_7:$k), "brlo\t$k",
+ [(AVRbrcond bb:$k, AVR_COND_LO)]>;
+
+ def BRMIk : FBRsk<0, 0b010, (outs), (ins relbrtarget_7:$k), "brmi\t$k",
+ [(AVRbrcond bb:$k, AVR_COND_MI)]>;
+
+ def BRPLk : FBRsk<1, 0b010, (outs), (ins relbrtarget_7:$k), "brpl\t$k",
+ [(AVRbrcond bb:$k, AVR_COND_PL)]>;
+
+ def BRGEk : FBRsk<1, 0b100, (outs), (ins relbrtarget_7:$k), "brge\t$k",
+ [(AVRbrcond bb:$k, AVR_COND_GE)]>;
+
+ def BRLTk : FBRsk<0, 0b100, (outs), (ins relbrtarget_7:$k), "brlt\t$k",
+ [(AVRbrcond bb:$k, AVR_COND_LT)]>;
}
//===----------------------------------------------------------------------===//
@@ -906,62 +864,37 @@ let isBranch = 1, isTerminator = 1, Uses = [SREG] in {
//===----------------------------------------------------------------------===//
// 8 and 16-bit register move instructions.
let hasSideEffects = 0 in {
- def MOVRdRr : FRdRr<0b0010, 0b11,
- (outs GPR8
- : $rd),
- (ins GPR8
- : $rr),
+ def MOVRdRr : FRdRr<0b0010, 0b11, (outs GPR8:$rd), (ins GPR8:$rr),
"mov\t$rd, $rr", []>;
- def MOVWRdRr : FMOVWRdRr<(outs DREGS
- : $rd),
- (ins DREGS
- : $rr),
- "movw\t$rd, $rr", []>,
+ def MOVWRdRr : FMOVWRdRr<(outs DREGS:$rd), (ins DREGS:$rr), "movw\t$rd, $rr",
+ []>,
Requires<[HasMOVW]>;
}
// Load immediate values into registers.
let isReMaterializable = 1 in {
- def LDIRdK : FRdK<0b1110,
- (outs LD8
- : $rd),
- (ins imm_ldi8
- : $k),
- "ldi\t$rd, $k", [(set i8
- : $rd, imm
- : $k)]>;
+ def LDIRdK : FRdK<0b1110, (outs LD8:$rd), (ins imm_ldi8:$k), "ldi\t$rd, $k",
+ [(set i8:$rd, imm:$k)]>;
// LDIW Rd+1:Rd, K+1:K
//
// Expands to:
// ldi Rd, K
// ldi Rd+1, K+1
- def LDIWRdK : Pseudo<(outs DLDREGS
- : $dst),
- (ins i16imm
- : $src),
- "ldiw\t$dst, $src", [(set i16
- : $dst, imm
- : $src)]>;
+ def LDIWRdK : Pseudo<(outs DLDREGS:$dst), (ins i16imm:$src),
+ "ldiw\t$dst, $src", [(set i16:$dst, imm:$src)]>;
}
// Load from data space into register.
let canFoldAsLoad = 1, isReMaterializable = 1 in {
- def LDSRdK : F32DM<0b0,
- (outs GPR8
- : $rd),
- (ins imm16
- : $k),
- "lds\t$rd, $k", [(set i8
- : $rd, (load imm
- : $k))]>,
+ def LDSRdK : F32DM<0b0, (outs GPR8:$rd), (ins imm16:$k), "lds\t$rd, $k",
+ [(set i8:$rd, (load imm:$k))]>,
Requires<[HasSRAM, HasNonTinyEncoding]>;
// Load from data space into register, which is only available on AVRTiny.
def LDSRdKTiny : FLDSSTSTINY<0b0, (outs LD8:$rd), (ins imm7tiny:$k),
- "lds\t$rd, $k",
- [(set i8:$rd, (load imm:$k))]>,
+ "lds\t$rd, $k", [(set i8:$rd, (load imm:$k))]>,
Requires<[HasSRAM, HasTinyEncoding]>;
// LDSW Rd+1:Rd, K+1:K
@@ -969,26 +902,16 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in {
// Expands to:
// lds Rd, (K+1:K)
// lds Rd+1 (K+1:K) + 1
- def LDSWRdK : Pseudo<(outs DREGS
- : $dst),
- (ins i16imm
- : $src),
- "ldsw\t$dst, $src", [(set i16
- : $dst, (load imm
- : $src))]>,
+ def LDSWRdK : Pseudo<(outs DREGS:$dst), (ins i16imm:$src), "ldsw\t$dst, $src",
+ [(set i16:$dst, (load imm:$src))]>,
Requires<[HasSRAM, HasNonTinyEncoding]>;
}
// Indirect loads.
let canFoldAsLoad = 1, isReMaterializable = 1 in {
- def LDRdPtr : FSTLD<0, 0b00,
- (outs GPR8
- : $reg),
- (ins LDSTPtrReg
- : $ptrreg),
- "ld\t$reg, $ptrreg", [(set GPR8
- : $reg, (load i16
- : $ptrreg))]>,
+ def LDRdPtr : FSTLD<0, 0b00, (outs GPR8:$reg), (ins LDSTPtrReg:$ptrreg),
+ "ld\t$reg, $ptrreg",
+ [(set GPR8:$reg, (load i16:$ptrreg))]>,
Requires<[HasSRAM]>;
// LDW Rd+1:Rd, P
@@ -1001,13 +924,8 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in {
// ld Rd+1, P+
// subiw P, 2
let Constraints = "@earlyclobber $reg" in def LDWRdPtr
- : Pseudo<(outs DREGS
- : $reg),
- (ins PTRDISPREGS
- : $ptrreg),
- "ldw\t$reg, $ptrreg", [(set i16
- : $reg, (load i16
- : $ptrreg))]>,
+ : Pseudo<(outs DREGS:$reg), (ins PTRDISPREGS:$ptrreg),
+ "ldw\t$reg, $ptrreg", [(set i16:$reg, (load i16:$ptrreg))]>,
Requires<[HasSRAM]>;
}
@@ -1027,21 +945,12 @@ let mayLoad = 1, hasSideEffects = 0,
// Expands to:
// ld Rd, P+
// ld Rd+1, P+
- def LDWRdPtrPi : Pseudo<(outs DREGS
- : $reg, PTRREGS
- : $base_wb),
- (ins PTRREGS
- : $ptrreg),
- "ldw\t$reg, $ptrreg+", []>,
+ def LDWRdPtrPi : Pseudo<(outs DREGS:$reg, PTRREGS:$base_wb),
+ (ins PTRREGS:$ptrreg), "ldw\t$reg, $ptrreg+", []>,
Requires<[HasSRAM]>;
- def LDRdPtrPd : FSTLD<0, 0b10,
- (outs GPR8
- : $reg, PTRREGS
- : $base_wb),
- (ins LDSTPtrReg
- : $ptrreg),
- "ld\t$reg, -$ptrreg", []>,
+ def LDRdPtrPd : FSTLD<0, 0b10, (outs GPR8:$reg, PTRREGS:$base_wb),
+ (ins LDSTPtrReg:$ptrreg), "ld\t$reg, -$ptrreg", []>,
Requires<[HasSRAM]>;
// LDW Rd+1:Rd, -P
@@ -1049,27 +958,17 @@ let mayLoad = 1, hasSideEffects = 0,
// Expands to:
// ld Rd+1, -P
// ld Rd, -P
- def LDWRdPtrPd : Pseudo<(outs DREGS
- : $reg, PTRREGS
- : $base_wb),
- (ins PTRREGS
- : $ptrreg),
- "ldw\t$reg, -$ptrreg", []>,
+ def LDWRdPtrPd : Pseudo<(outs DREGS:$reg, PTRREGS:$base_wb),
+ (ins PTRREGS:$ptrreg), "ldw\t$reg, -$ptrreg", []>,
Requires<[HasSRAM]>;
}
// Load indirect with displacement operations.
let canFoldAsLoad = 1, isReMaterializable = 1 in {
- def LDDRdPtrQ
- : FSTDLDD<0,
- (outs GPR8
- : $reg),
- (ins memri
- : $memri),
- "ldd\t$reg, $memri", [(set i8
- : $reg, (load addr
- : $memri))]>,
- Requires<[HasSRAM, HasNonTinyEncoding]>;
+ def LDDRdPtrQ : FSTDLDD<0, (outs GPR8:$reg), (ins memri:$memri),
+ "ldd\t$reg, $memri",
+ [(set i8:$reg, (load addr:$memri))]>,
+ Requires<[HasSRAM, HasNonTinyEncoding]>;
// LDDW Rd+1:Rd, P+q
//
@@ -1081,15 +980,11 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in {
// ld Rd, P+
// ld Rd+1, P+
// subiw P, q+2
- let Constraints = "@earlyclobber $dst" in def LDDWRdPtrQ
- : Pseudo<(outs DREGS
- : $dst),
- (ins memri
- : $memri),
- "lddw\t$dst, $memri", [(set i16
- : $dst, (load addr
- : $memri))]>,
- Requires<[HasSRAM]>;
+ let Constraints = "@earlyclobber $dst" in
+ def LDDWRdPtrQ : Pseudo<(outs DREGS:$dst), (ins memri:$memri),
+ "lddw\t$dst, $memri",
+ [(set i16:$dst, (load addr:$memri))]>,
+ Requires<[HasSRAM]>;
// An identical pseudo instruction to LDDWRdPtrQ, expect restricted to the Y
// register and without the @earlyclobber flag.
@@ -1107,35 +1002,23 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in {
// The pseudo expansion pass trivially expands this into LDDWRdPtrQ.
//
// This instruction may be removed once PR13375 is fixed.
- let mayLoad = 1,
- hasSideEffects = 0 in def LDDWRdYQ : Pseudo<(outs DREGS
- : $dst),
- (ins memri
- : $memri),
- "lddw\t$dst, $memri", []>,
- Requires<[HasSRAM]>;
+ let mayLoad = 1, hasSideEffects = 0 in
+ def LDDWRdYQ : Pseudo<(outs DREGS:$dst), (ins memri:$memri),
+ "lddw\t$dst, $memri", []>,
+ Requires<[HasSRAM]>;
}
class AtomicLoad<PatFrag Op, RegisterClass DRC, RegisterClass PTRRC>
- : Pseudo<(outs DRC
- : $rd),
- (ins PTRRC
- : $rr),
- "atomic_op", [(set DRC
- : $rd, (Op i16
- : $rr))]>;
+ : Pseudo<(outs DRC:$rd), (ins PTRRC:$rr), "atomic_op",
+ [(set DRC:$rd, (Op i16:$rr))]>;
class AtomicStore<PatFrag Op, RegisterClass DRC, RegisterClass PTRRC>
- : Pseudo<(outs),
- (ins PTRRC
- : $rd, DRC
- : $rr),
- "atomic_op", [(Op DRC:$rr, i16:$rd)]>;
+ : Pseudo<(outs), (ins PTRRC:$rd, DRC:$rr), "atomic_op",
+ [(Op DRC:$rr, i16:$rd)]>;
class AtomicLoadOp<PatFrag Op, RegisterClass DRC, RegisterClass PTRRC>
- : Pseudo<(outs DRC:$rd),
- (ins PTRRC:$rr, DRC:$operand),
- "atomic_op", [(set DRC:$rd, (Op i16:$rr, DRC:$operand))]>;
+ : Pseudo<(outs DRC:$rd), (ins PTRRC:$rr, DRC:$operand), "atomic_op",
+ [(set DRC:$rd, (Op i16:$rr, DRC:$operand))]>;
// Atomic instructions
// ===================
@@ -1156,28 +1039,24 @@ class AtomicLoadOp8<PatFrag Op> : AtomicLoadOp<Op, GPR8, PTRREGS>;
class AtomicLoadOp16<PatFrag Op> : AtomicLoadOp<Op, DREGS, PTRDISPREGS>;
let usesCustomInserter=1 in {
- def AtomicLoadAdd8 : AtomicLoadOp8<atomic_load_add_8>;
- def AtomicLoadAdd16 : AtomicLoadOp16<atomic_load_add_16>;
- def AtomicLoadSub8 : AtomicLoadOp8<atomic_load_sub_8>;
- def AtomicLoadSub16 : AtomicLoadOp16<atomic_load_sub_16>;
- def AtomicLoadAnd8 : AtomicLoadOp8<atomic_load_and_8>;
- def AtomicLoadAnd16 : AtomicLoadOp16<atomic_load_and_16>;
- def AtomicLoadOr8 : AtomicLoadOp8<atomic_load_or_8>;
- def AtomicLoadOr16 : AtomicLoadOp16<atomic_load_or_16>;
- def AtomicLoadXor8 : AtomicLoadOp8<atomic_load_xor_8>;
- def AtomicLoadXor16 : AtomicLoadOp16<atomic_load_xor_16>;
+ def AtomicLoadAdd8 : AtomicLoadOp8<atomic_load_add_i8>;
+ def AtomicLoadAdd16 : AtomicLoadOp16<atomic_load_add_i16>;
+ def AtomicLoadSub8 : AtomicLoadOp8<atomic_load_sub_i8>;
+ def AtomicLoadSub16 : AtomicLoadOp16<atomic_load_sub_i16>;
+ def AtomicLoadAnd8 : AtomicLoadOp8<atomic_load_and_i8>;
+ def AtomicLoadAnd16 : AtomicLoadOp16<atomic_load_and_i16>;
+ def AtomicLoadOr8 : AtomicLoadOp8<atomic_load_or_i8>;
+ def AtomicLoadOr16 : AtomicLoadOp16<atomic_load_or_i16>;
+ def AtomicLoadXor8 : AtomicLoadOp8<atomic_load_xor_i8>;
+ def AtomicLoadXor16 : AtomicLoadOp16<atomic_load_xor_i16>;
}
+
def AtomicFence
: Pseudo<(outs), (ins), "atomic_fence", [(atomic_fence timm, timm)]>;
// Indirect store from register to data space.
-def STSKRr : F32DM<0b1, (outs),
- (ins imm16
- : $k, GPR8
- : $rd),
- "sts\t$k, $rd", [(store i8
- : $rd, imm
- : $k)]>,
+def STSKRr : F32DM<0b1, (outs), (ins imm16:$k, GPR8:$rd), "sts\t$k, $rd",
+ [(store i8:$rd, imm:$k)]>,
Requires<[HasSRAM, HasNonTinyEncoding]>;
// Store from register to data space, which is only available on AVRTiny.
@@ -1190,25 +1069,15 @@ def STSKRrTiny : FLDSSTSTINY<0b1, (outs), (ins imm7tiny:$k, LD8:$rd),
// Expands to:
// sts Rr+1, (K+1:K) + 1
// sts Rr, (K+1:K)
-def STSWKRr : Pseudo<(outs),
- (ins i16imm
- : $dst, DREGS
- : $src),
- "stsw\t$dst, $src", [(store i16
- : $src, imm
- : $dst)]>,
+def STSWKRr : Pseudo<(outs), (ins i16imm:$dst, DREGS:$src),
+ "stsw\t$dst, $src", [(store i16:$src, imm:$dst)]>,
Requires<[HasSRAM, HasNonTinyEncoding]>;
// Indirect stores.
// ST P, Rr
// Stores the value of Rr into the location addressed by pointer P.
-def STPtrRr : FSTLD<1, 0b00, (outs),
- (ins LDSTPtrReg
- : $ptrreg, GPR8
- : $reg),
- "st\t$ptrreg, $reg", [(store GPR8
- : $reg, i16
- : $ptrreg)]>,
+def STPtrRr : FSTLD<1, 0b00, (outs), (ins LDSTPtrReg:$ptrreg, GPR8:$reg),
+ "st\t$ptrreg, $reg", [(store GPR8:$reg, i16:$ptrreg)]>,
Requires<[HasSRAM]>;
// STW P, Rr+1:Rr
@@ -1221,13 +1090,8 @@ def STPtrRr : FSTLD<1, 0b00, (outs),
// st P+, Rr
// st P+, Rr+1
// subiw P, q+2
-def STWPtrRr : Pseudo<(outs),
- (ins PTRDISPREGS
- : $ptrreg, DREGS
- : $reg),
- "stw\t$ptrreg, $reg", [(store i16
- : $reg, i16
- : $ptrreg)]>,
+def STWPtrRr : Pseudo<(outs), (ins PTRDISPREGS:$ptrreg, DREGS:$reg),
+ "stw\t$ptrreg, $reg", [(store i16:$reg, i16:$ptrreg)]>,
Requires<[HasSRAM]>;
// Indirect stores (with postincrement or predecrement).
@@ -1236,18 +1100,11 @@ let Constraints = "$ptrreg = $base_wb,@earlyclobber $base_wb" in {
// ST P+, Rr
// Stores the value of Rr into the location addressed by pointer P.
// Post increments P.
- def STPtrPiRr : FSTLD<1, 0b01,
- (outs LDSTPtrReg
- : $base_wb),
- (ins LDSTPtrReg
- : $ptrreg, GPR8
- : $reg, i8imm
- : $offs),
- "st\t$ptrreg+, $reg", [(set i16
- : $base_wb, (post_store GPR8
- : $reg, i16
- : $ptrreg, imm
- : $offs))]>,
+ def STPtrPiRr : FSTLD<1, 0b01, (outs LDSTPtrReg:$base_wb),
+ (ins LDSTPtrReg:$ptrreg, GPR8:$reg, i8imm:$offs),
+ "st\t$ptrreg+, $reg",
+ [(set i16:$base_wb, (post_store GPR8:$reg, i16:$ptrreg,
+ imm:$offs))]>,
Requires<[HasSRAM]>;
// STW P+, Rr+1:Rr
@@ -1257,34 +1114,22 @@ let Constraints = "$ptrreg = $base_wb,@earlyclobber $base_wb" in {
// Expands to:
// st P+, Rr
// st P+, Rr+1
- def STWPtrPiRr : Pseudo<(outs PTRREGS
- : $base_wb),
- (ins PTRREGS
- : $ptrreg, DREGS
- : $trh, i8imm
- : $offs),
- "stw\t$ptrreg+, $trh", [(set PTRREGS
- : $base_wb, (post_store DREGS
- : $trh, PTRREGS
- : $ptrreg, imm
- : $offs))]>,
+ def STWPtrPiRr : Pseudo<(outs PTRREGS:$base_wb),
+ (ins PTRREGS:$ptrreg, DREGS:$trh, i8imm:$offs),
+ "stw\t$ptrreg+, $trh",
+ [(set PTRREGS:$base_wb,
+ (post_store DREGS:$trh, PTRREGS:$ptrreg,
+ imm:$offs))]>,
Requires<[HasSRAM]>;
// ST -P, Rr
// Stores the value of Rr into the location addressed by pointer P.
// Pre decrements P.
- def STPtrPdRr : FSTLD<1, 0b10,
- (outs LDSTPtrReg
- : $base_wb),
- (ins LDSTPtrReg
- : $ptrreg, GPR8
- : $reg, i8imm
- : $offs),
- "st\t-$ptrreg, $reg", [(set i16
- : $base_wb, (pre_store GPR8
- : $reg, i16
- : $ptrreg, imm
- : $offs))]>,
+ def STPtrPdRr : FSTLD<1, 0b10, (outs LDSTPtrReg:$base_wb),
+ (ins LDSTPtrReg:$ptrreg, GPR8:$reg, i8imm:$offs),
+ "st\t-$ptrreg, $reg",
+ [(set i16: $base_wb,
+ (pre_store GPR8:$reg, i16:$ptrreg, imm:$offs))]>,
Requires<[HasSRAM]>;
// STW -P, Rr+1:Rr
@@ -1294,17 +1139,11 @@ let Constraints = "$ptrreg = $base_wb,@earlyclobber $base_wb" in {
// Expands to:
// st -P, Rr+1
// st -P, Rr
- def STWPtrPdRr : Pseudo<(outs PTRREGS
- : $base_wb),
- (ins PTRREGS
- : $ptrreg, DREGS
- : $reg, i8imm
- : $offs),
- "stw\t-$ptrreg, $reg", [(set PTRREGS
- : $base_wb, (pre_store i16
- : $reg, i16
- : $ptrreg, imm
- : $offs))]>,
+ def STWPtrPdRr : Pseudo<(outs PTRREGS:$base_wb),
+ (ins PTRREGS:$ptrreg, DREGS:$reg, i8imm:$offs),
+ "stw\t-$ptrreg, $reg",
+ [(set PTRREGS:$base_wb,
+ (pre_store i16:$reg, i16:$ptrreg, imm:$offs))]>,
Requires<[HasSRAM]>;
}
@@ -1312,13 +1151,8 @@ let Constraints = "$ptrreg = $base_wb,@earlyclobber $base_wb" in {
// STD P+q, Rr
// Stores the value of Rr into the location addressed by pointer P with a
// displacement of q. Does not modify P.
-def STDPtrQRr : FSTDLDD<1, (outs),
- (ins memri
- : $memri, GPR8
- : $reg),
- "std\t$memri, $reg", [(store i8
- : $reg, addr
- : $memri)]>,
+def STDPtrQRr : FSTDLDD<1, (outs), (ins memri:$memri, GPR8:$reg),
+ "std\t$memri, $reg", [(store i8:$reg, addr:$memri)]>,
Requires<[HasSRAM, HasNonTinyEncoding]>;
// STDW P+q, Rr+1:Rr
@@ -1333,13 +1167,8 @@ def STDPtrQRr : FSTDLDD<1, (outs),
// st P+, Rr
// st P+, Rr+1
// subiw P, q+2
-def STDWPtrQRr : Pseudo<(outs),
- (ins memri
- : $memri, DREGS
- : $src),
- "stdw\t$memri, $src", [(store i16
- : $src, addr
- : $memri)]>,
+def STDWPtrQRr : Pseudo<(outs), (ins memri:$memri, DREGS:$src),
+ "stdw\t$memri, $src", [(store i16:$src, addr:$memri)]>,
Requires<[HasSRAM]>;
// Load program memory operations.
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td
index 66c57952a7f1..55989f5eb6a3 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -807,7 +807,7 @@ class XADD<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
let Constraints = "$dst = $val" in {
let Predicates = [BPFNoALU32] in {
- def XADDW : XADD<BPF_W, "u32", atomic_load_add_32>;
+ def XADDW : XADD<BPF_W, "u32", atomic_load_add_i32>;
}
}
@@ -897,23 +897,23 @@ class XFALU32<BPFWidthModifer SizeOp, BPFArithOp Opc, string OpcodeStr,
let Constraints = "$dst = $val" in {
let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in {
- def XFADDW32 : XFALU32<BPF_W, BPF_ADD, "u32", "add", atomic_load_add_32>;
- def XFANDW32 : XFALU32<BPF_W, BPF_AND, "u32", "and", atomic_load_and_32>;
- def XFORW32 : XFALU32<BPF_W, BPF_OR, "u32", "or", atomic_load_or_32>;
- def XFXORW32 : XFALU32<BPF_W, BPF_XOR, "u32", "xor", atomic_load_xor_32>;
+ def XFADDW32 : XFALU32<BPF_W, BPF_ADD, "u32", "add", atomic_load_add_i32>;
+ def XFANDW32 : XFALU32<BPF_W, BPF_AND, "u32", "and", atomic_load_and_i32>;
+ def XFORW32 : XFALU32<BPF_W, BPF_OR, "u32", "or", atomic_load_or_i32>;
+ def XFXORW32 : XFALU32<BPF_W, BPF_XOR, "u32", "xor", atomic_load_xor_i32>;
}
- def XFADDD : XFALU64<BPF_DW, BPF_ADD, "u64", "add", atomic_load_add_64>;
- def XFANDD : XFALU64<BPF_DW, BPF_AND, "u64", "and", atomic_load_and_64>;
- def XFORD : XFALU64<BPF_DW, BPF_OR, "u64", "or", atomic_load_or_64>;
- def XFXORD : XFALU64<BPF_DW, BPF_XOR, "u64", "xor", atomic_load_xor_64>;
+ def XFADDD : XFALU64<BPF_DW, BPF_ADD, "u64", "add", atomic_load_add_i64>;
+ def XFANDD : XFALU64<BPF_DW, BPF_AND, "u64", "and", atomic_load_and_i64>;
+ def XFORD : XFALU64<BPF_DW, BPF_OR, "u64", "or", atomic_load_or_i64>;
+ def XFXORD : XFALU64<BPF_DW, BPF_XOR, "u64", "xor", atomic_load_xor_i64>;
}
// atomic_load_sub can be represented as a neg followed
// by an atomic_load_add.
-def : Pat<(atomic_load_sub_32 ADDRri:$addr, GPR32:$val),
+def : Pat<(atomic_load_sub_i32 ADDRri:$addr, GPR32:$val),
(XFADDW32 ADDRri:$addr, (NEG_32 GPR32:$val))>;
-def : Pat<(atomic_load_sub_64 ADDRri:$addr, GPR:$val),
+def : Pat<(atomic_load_sub_i64 ADDRri:$addr, GPR:$val),
(XFADDD ADDRri:$addr, (NEG_64 GPR:$val))>;
// Atomic Exchange
@@ -953,10 +953,10 @@ class XCHG32<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
let Constraints = "$dst = $val" in {
let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in {
- def XCHGW32 : XCHG32<BPF_W, "32", atomic_swap_32>;
+ def XCHGW32 : XCHG32<BPF_W, "32", atomic_swap_i32>;
}
- def XCHGD : XCHG<BPF_DW, "64", atomic_swap_64>;
+ def XCHGD : XCHG<BPF_DW, "64", atomic_swap_i64>;
}
// Compare-And-Exchange
@@ -996,11 +996,11 @@ class CMPXCHG32<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
let Predicates = [BPFHasALU32], Defs = [W0], Uses = [W0],
DecoderNamespace = "BPFALU32" in {
- def CMPXCHGW32 : CMPXCHG32<BPF_W, "32", atomic_cmp_swap_32>;
+ def CMPXCHGW32 : CMPXCHG32<BPF_W, "32", atomic_cmp_swap_i32>;
}
let Defs = [R0], Uses = [R0] in {
- def CMPXCHGD : CMPXCHG<BPF_DW, "64", atomic_cmp_swap_64>;
+ def CMPXCHGD : CMPXCHG<BPF_DW, "64", atomic_cmp_swap_i64>;
}
// bswap16, bswap32, bswap64
diff --git a/llvm/lib/Target/BPF/BPFRegisterInfo.cpp b/llvm/lib/Target/BPF/BPFRegisterInfo.cpp
index 8761e4aa258c..84af6806abb3 100644
--- a/llvm/lib/Target/BPF/BPFRegisterInfo.cpp
+++ b/llvm/lib/Target/BPF/BPFRegisterInfo.cpp
@@ -47,9 +47,17 @@ BitVector BPFRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
-static void WarnSize(int Offset, MachineFunction &MF, DebugLoc& DL)
-{
+static void WarnSize(int Offset, MachineFunction &MF, DebugLoc& DL,
+ MachineBasicBlock& MBB) {
if (Offset <= -BPFStackSizeOption) {
+ if (!DL)
+ /* try harder to get some debug loc */
+ for (auto &I : MBB)
+ if (I.getDebugLoc()) {
+ DL = I.getDebugLoc();
+ break;
+ }
+
const Function &F = MF.getFunction();
DiagnosticInfoUnsupported DiagStackSize(
F,
@@ -73,14 +81,6 @@ bool BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MachineFunction &MF = *MBB.getParent();
DebugLoc DL = MI.getDebugLoc();
- if (!DL)
- /* try harder to get some debug loc */
- for (auto &I : MBB)
- if (I.getDebugLoc()) {
- DL = I.getDebugLoc();
- break;
- }
-
while (!MI.getOperand(i).isFI()) {
++i;
assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
@@ -93,7 +93,7 @@ bool BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
if (MI.getOpcode() == BPF::MOV_rr) {
int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex);
- WarnSize(Offset, MF, DL);
+ WarnSize(Offset, MF, DL, MBB);
MI.getOperand(i).ChangeToRegister(FrameReg, false);
Register reg = MI.getOperand(i - 1).getReg();
BuildMI(MBB, ++II, DL, TII.get(BPF::ADD_ri), reg)
@@ -108,7 +108,7 @@ bool BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
if (!isInt<32>(Offset))
llvm_unreachable("bug in frame offset");
- WarnSize(Offset, MF, DL);
+ WarnSize(Offset, MF, DL, MBB);
if (MI.getOpcode() == BPF::FI_ri) {
// architecture does not really support FI_ri, replace it with
diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
index 7b73c9f4a1e4..7d91fa8bb824 100644
--- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
@@ -113,8 +113,7 @@ static Expected<bool> parseBPFPreserveStaticOffsetOptions(StringRef Params) {
"BPFPreserveStaticOffsetPass");
}
-void BPFTargetMachine::registerPassBuilderCallbacks(
- PassBuilder &PB, bool PopulateClassToPassNames) {
+void BPFTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
#define GET_PASS_REGISTRY "BPFPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"
diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.h b/llvm/lib/Target/BPF/BPFTargetMachine.h
index 0a28394463b2..4e6adc722e76 100644
--- a/llvm/lib/Target/BPF/BPFTargetMachine.h
+++ b/llvm/lib/Target/BPF/BPFTargetMachine.h
@@ -42,8 +42,7 @@ public:
return TLOF.get();
}
- void registerPassBuilderCallbacks(PassBuilder &PB,
- bool PopulateClassToPassNames) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB) override;
};
}
diff --git a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp
index 4acdd571f6c9..97bdd4c45a8c 100644
--- a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp
+++ b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp
@@ -218,7 +218,7 @@ public:
bool runOnMachineFunction(MachineFunction &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
index 67e04c212a69..f49ccfe01f72 100644
--- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
+++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
@@ -37,6 +37,8 @@ class DXContainerGlobals : public llvm::ModulePass {
GlobalVariable *buildSignature(Module &M, Signature &Sig, StringRef Name,
StringRef SectionName);
void addSignature(Module &M, SmallVector<GlobalValue *> &Globals);
+ void addPipelineStateValidationInfo(Module &M,
+ SmallVector<GlobalValue *> &Globals);
public:
static char ID; // Pass identification, replacement for typeid
@@ -63,6 +65,7 @@ bool DXContainerGlobals::runOnModule(Module &M) {
Globals.push_back(getFeatureFlags(M));
Globals.push_back(computeShaderHash(M));
addSignature(M, Globals);
+ addPipelineStateValidationInfo(M, Globals);
appendToCompilerUsed(M, Globals);
return true;
}
@@ -133,6 +136,34 @@ void DXContainerGlobals::addSignature(Module &M,
Globals.emplace_back(buildSignature(M, OutputSig, "dx.osg1", "OSG1"));
}
+void DXContainerGlobals::addPipelineStateValidationInfo(
+ Module &M, SmallVector<GlobalValue *> &Globals) {
+ SmallString<256> Data;
+ raw_svector_ostream OS(Data);
+ PSVRuntimeInfo PSV;
+ Triple TT(M.getTargetTriple());
+ PSV.BaseData.MinimumWaveLaneCount = 0;
+ PSV.BaseData.MaximumWaveLaneCount = std::numeric_limits<uint32_t>::max();
+ PSV.BaseData.ShaderStage =
+ static_cast<uint8_t>(TT.getEnvironment() - Triple::Pixel);
+
+ // Hardcoded values here to unblock loading the shader into D3D.
+ //
+ // TODO: Lots more stuff to do here!
+ //
+ // See issue https://github.com/llvm/llvm-project/issues/96674.
+ PSV.BaseData.NumThreadsX = 1;
+ PSV.BaseData.NumThreadsY = 1;
+ PSV.BaseData.NumThreadsZ = 1;
+ PSV.EntryName = "main";
+
+ PSV.finalize(TT.getEnvironment());
+ PSV.write(OS);
+ Constant *Constant =
+ ConstantDataArray::getString(M.getContext(), Data, /*AddNull*/ false);
+ Globals.emplace_back(buildContainerGlobal(M, Constant, "dx.psv0", "PSV0"));
+}
+
char DXContainerGlobals::ID = 0;
INITIALIZE_PASS_BEGIN(DXContainerGlobals, "dxil-globals",
"DXContainer Global Emitter", false, true)
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 24a0c8524230..adaaa2a6e0d4 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -269,6 +269,25 @@ def Sin : DXILOpMapping<13, unary, int_sin,
def Tan : DXILOpMapping<14, unary, int_tan,
"Returns tangent(theta) for theta in radians.",
[llvm_halforfloat_ty, LLVMMatchType<0>]>;
+def ACos : DXILOpMapping<15, unary, int_acos,
+ "Returns the arccosine of each component of input.",
+ [llvm_halforfloat_ty, LLVMMatchType<0>]>;
+def ASin : DXILOpMapping<16, unary, int_asin,
+ "Returns the arcsine of each component of input.",
+ [llvm_halforfloat_ty, LLVMMatchType<0>]>;
+def ATan : DXILOpMapping<17, unary, int_atan,
+ "Returns the arctangent of each component of input.",
+ [llvm_halforfloat_ty, LLVMMatchType<0>]>;
+def HCos : DXILOpMapping<18, unary, int_cosh,
+ "Returns the hyperbolic cosine of the specified value.",
+ [llvm_halforfloat_ty, LLVMMatchType<0>]>;
+def HSin : DXILOpMapping<19, unary, int_sinh,
+ "Returns the hyperbolic sine of the specified value.",
+ [llvm_halforfloat_ty, LLVMMatchType<0>]>;
+def HTan : DXILOpMapping<20, unary, int_tanh,
+ "Returns the hyperbolic tan of the specified value.",
+ [llvm_halforfloat_ty, LLVMMatchType<0>]>;
+
def Exp2 : DXILOpMapping<21, unary, int_exp2,
"Returns the base 2 exponential, or 2**x, of the specified value."
"exp2(x) = 2**x.",
diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
index c853393e4282..e6dbb25b710e 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
@@ -102,8 +102,7 @@ DirectXTargetMachine::DirectXTargetMachine(const Target &T, const Triple &TT,
DirectXTargetMachine::~DirectXTargetMachine() {}
-void DirectXTargetMachine::registerPassBuilderCallbacks(
- PassBuilder &PB, bool PopulateClassToPassNames) {
+void DirectXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
#define GET_PASS_REGISTRY "DirectXPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"
}
diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.h b/llvm/lib/Target/DirectX/DirectXTargetMachine.h
index 428beaf61cd0..d04c375b2736 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetMachine.h
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.h
@@ -47,8 +47,7 @@ public:
}
TargetTransformInfo getTargetTransformInfo(const Function &F) const override;
- void registerPassBuilderCallbacks(PassBuilder &PB,
- bool PopulateClassToPassNames) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB) override;
};
} // namespace llvm
diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index 092cccbcca9c..6e5e2a61bd77 100644
--- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -748,7 +748,8 @@ bool HexagonAsmParser::ParseDirectiveSubsection(SMLoc L) {
Subsection = HexagonMCExpr::create(
MCConstantExpr::create(8192 + Res, getContext()), getContext());
- getStreamer().subSection(Subsection);
+ getStreamer().switchSection(getStreamer().getCurrentSectionOnly(),
+ Subsection);
return false;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 4c18e076c439..99745941d579 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -219,8 +219,8 @@ namespace {
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -285,7 +285,7 @@ char HexagonBitSimplify::ID = 0;
INITIALIZE_PASS_BEGIN(HexagonBitSimplify, "hexagon-bit-simplify",
"Hexagon bit simplification", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(HexagonBitSimplify, "hexagon-bit-simplify",
"Hexagon bit simplification", false, false)
@@ -2800,7 +2800,7 @@ bool HexagonBitSimplify::runOnMachineFunction(MachineFunction &MF) {
auto &HRI = *HST.getRegisterInfo();
auto &HII = *HST.getInstrInfo();
- MDT = &getAnalysis<MachineDominatorTree>();
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
MachineRegisterInfo &MRI = MF.getRegInfo();
bool Changed;
diff --git a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
index f2a02fe9540b..f0933765bbcb 100644
--- a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
@@ -218,8 +218,8 @@ namespace {
HexagonConstExtenders() : MachineFunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -569,7 +569,7 @@ namespace {
INITIALIZE_PASS_BEGIN(HexagonConstExtenders, "hexagon-cext-opt",
"Hexagon constant-extender optimization", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(HexagonConstExtenders, "hexagon-cext-opt",
"Hexagon constant-extender optimization", false, false)
@@ -1973,7 +1973,7 @@ bool HCE::runOnMachineFunction(MachineFunction &MF) {
HST = &MF.getSubtarget<HexagonSubtarget>();
HII = HST->getInstrInfo();
HRI = HST->getRegisterInfo();
- MDT = &getAnalysis<MachineDominatorTree>();
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
MRI = &MF.getRegInfo();
AssignmentMap IMap;
diff --git a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp
index 97917270601b..a5c47e67de89 100644
--- a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp
@@ -50,8 +50,8 @@ public:
AU.addRequired<LiveIntervals>();
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveIntervals>();
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
diff --git a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
index cb820e215899..03f6882e6889 100644
--- a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
@@ -162,8 +162,8 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineBranchProbabilityInfo>();
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.addRequired<MachineLoopInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -1054,7 +1054,7 @@ bool HexagonEarlyIfConversion::runOnMachineFunction(MachineFunction &MF) {
TRI = ST.getRegisterInfo();
MFN = &MF;
MRI = &MF.getRegInfo();
- MDT = &getAnalysis<MachineDominatorTree>();
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
MLI = &getAnalysis<MachineLoopInfo>();
MBPI = EnableHexagonBP ? &getAnalysis<MachineBranchProbabilityInfo>() :
nullptr;
diff --git a/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
index 204f3b6b20c7..8a23b7743e83 100644
--- a/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
@@ -155,8 +155,8 @@ namespace {
AU.addRequired<LiveIntervals>();
AU.addPreserved<LiveIntervals>();
AU.addPreserved<SlotIndexes>();
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -254,7 +254,7 @@ namespace llvm {
INITIALIZE_PASS_BEGIN(HexagonExpandCondsets, "expand-condsets",
"Hexagon Expand Condsets", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(HexagonExpandCondsets, "expand-condsets",
@@ -1277,7 +1277,7 @@ bool HexagonExpandCondsets::runOnMachineFunction(MachineFunction &MF) {
HII = static_cast<const HexagonInstrInfo*>(MF.getSubtarget().getInstrInfo());
TRI = MF.getSubtarget().getRegisterInfo();
- MDT = &getAnalysis<MachineDominatorTree>();
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
LIS = &getAnalysis<LiveIntervals>();
MRI = &MF.getRegInfo();
diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 232651132d6e..f4f84beea734 100644
--- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -413,9 +413,9 @@ void HexagonFrameLowering::findShrunkPrologEpilog(MachineFunction &MF,
auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
MachineDominatorTree MDT;
- MDT.runOnMachineFunction(MF);
+ MDT.calculate(MF);
MachinePostDominatorTree MPT;
- MPT.runOnMachineFunction(MF);
+ MPT.recalculate(MF);
using UnsignedMap = DenseMap<unsigned, unsigned>;
using RPOTType = ReversePostOrderTraversal<const MachineFunction *>;
diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
index 1e373f6061bb..a4304b053166 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -515,8 +515,8 @@ namespace {
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -1497,7 +1497,7 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) {
HRI = ST.getRegisterInfo();
MFN = &MF;
MRI = &MF.getRegInfo();
- MDT = &getAnalysis<MachineDominatorTree>();
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
// Clean up before any further processing, so that dead code does not
// get used in a newly generated "insert" instruction. Have a custom
@@ -1607,6 +1607,6 @@ FunctionPass *llvm::createHexagonGenInsert() {
INITIALIZE_PASS_BEGIN(HexagonGenInsert, "hexinsert",
"Hexagon generate \"insert\" instructions", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(HexagonGenInsert, "hexinsert",
"Hexagon generate \"insert\" instructions", false, false)
diff --git a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp
index afd49631943f..651ccc2db9ba 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp
@@ -56,8 +56,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
MachineFunctionPass::getAnalysisUsage(AU);
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
}
bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -82,7 +82,8 @@ bool HexagonGenMemAbsolute::runOnMachineFunction(MachineFunction &Fn) {
MRI = &Fn.getRegInfo();
TRI = Fn.getRegInfo().getTargetRegisterInfo();
- MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+ MachineDominatorTree &MDT =
+ getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
// Loop over all of the basic blocks
for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
diff --git a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
index 92e743273611..5bb2d7d80ad5 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
@@ -93,8 +93,8 @@ namespace {
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -130,7 +130,7 @@ char HexagonGenPredicate::ID = 0;
INITIALIZE_PASS_BEGIN(HexagonGenPredicate, "hexagon-gen-pred",
"Hexagon generate predicate operations", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(HexagonGenPredicate, "hexagon-gen-pred",
"Hexagon generate predicate operations", false, false)
diff --git a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index 31e37dcce415..19a024078b10 100644
--- a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -118,7 +118,7 @@ namespace {
StringRef getPassName() const override { return "Hexagon Hardware Loops"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
AU.addRequired<MachineLoopInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -368,7 +368,7 @@ namespace {
INITIALIZE_PASS_BEGIN(HexagonHardwareLoops, "hwloops",
"Hexagon Hardware Loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(HexagonHardwareLoops, "hwloops",
"Hexagon Hardware Loops", false, false)
@@ -386,7 +386,7 @@ bool HexagonHardwareLoops::runOnMachineFunction(MachineFunction &MF) {
MLI = &getAnalysis<MachineLoopInfo>();
MRI = &MF.getRegInfo();
- MDT = &getAnalysis<MachineDominatorTree>();
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
const HexagonSubtarget &HST = MF.getSubtarget<HexagonSubtarget>();
TII = HST.getInstrInfo();
TRI = HST.getRegisterInfo();
diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index 7777ae23e8ae..5a383b23a833 100644
--- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -770,8 +770,7 @@ bool PolynomialMultiplyRecognize::matchLeftShift(SelectInst *SelI,
// select +++ ? T : 0
Value *U = *SelI->user_begin();
- if (!match(U, m_Xor(m_Specific(SelI), m_Value(R))) &&
- !match(U, m_Xor(m_Value(R), m_Specific(SelI))))
+ if (!match(U, m_c_Xor(m_Specific(SelI), m_Value(R))))
return false;
// Matched: xor (select +++ ? 0 : T), R
// xor (select +++ ? T : 0), R
@@ -814,15 +813,13 @@ bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI,
CmpInst::Predicate P;
bool TrueIfZero;
- if (match(CondV, m_ICmp(P, m_Value(C), m_Zero())) ||
- match(CondV, m_ICmp(P, m_Zero(), m_Value(C)))) {
+ if (match(CondV, m_c_ICmp(P, m_Value(C), m_Zero()))) {
if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
return false;
// Matched: select C == 0 ? ... : ...
// select C != 0 ? ... : ...
TrueIfZero = (P == CmpInst::ICMP_EQ);
- } else if (match(CondV, m_ICmp(P, m_Value(C), m_One())) ||
- match(CondV, m_ICmp(P, m_One(), m_Value(C)))) {
+ } else if (match(CondV, m_c_ICmp(P, m_Value(C), m_One()))) {
if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
return false;
// Matched: select C == 1 ? ... : ...
@@ -832,8 +829,7 @@ bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI,
return false;
Value *X = nullptr;
- if (!match(C, m_And(m_Value(X), m_One())) &&
- !match(C, m_And(m_One(), m_Value(X))))
+ if (!match(C, m_c_And(m_Value(X), m_One())))
return false;
// Matched: select (X & 1) == +++ ? ... : ...
// select (X & 1) != +++ ? ... : ...
@@ -845,8 +841,7 @@ bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI,
if (!match(TrueV, m_LShr(m_Value(R), m_One())))
return false;
// Matched: select +++ ? (R >> 1) : ...
- if (!match(FalseV, m_Xor(m_Specific(TrueV), m_Value(Q))) &&
- !match(FalseV, m_Xor(m_Value(Q), m_Specific(TrueV))))
+ if (!match(FalseV, m_c_Xor(m_Specific(TrueV), m_Value(Q))))
return false;
// Matched: select +++ ? (R >> 1) : (R >> 1) ^ Q
// with commuting ^.
@@ -856,8 +851,7 @@ bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI,
if (!match(FalseV, m_LShr(m_Value(R), m_One())))
return false;
// Matched: select +++ ? ... : (R >> 1)
- if (!match(TrueV, m_Xor(m_Specific(FalseV), m_Value(Q))) &&
- !match(TrueV, m_Xor(m_Value(Q), m_Specific(FalseV))))
+ if (!match(TrueV, m_c_Xor(m_Specific(FalseV), m_Value(Q))))
return false;
// Matched: select +++ ? (R >> 1) ^ Q : (R >> 1)
// with commuting ^.
diff --git a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
index 0e82bf6e5331..e7f5c257b21c 100644
--- a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
@@ -70,7 +70,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
MachineFunctionPass::getAnalysisUsage(AU);
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
AU.addRequired<MachineDominanceFrontier>();
AU.setPreservesAll();
}
@@ -122,7 +122,7 @@ char HexagonOptAddrMode::ID = 0;
INITIALIZE_PASS_BEGIN(HexagonOptAddrMode, "amode-opt",
"Optimize addressing mode", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
INITIALIZE_PASS_END(HexagonOptAddrMode, "amode-opt", "Optimize addressing mode",
false, false)
@@ -872,7 +872,7 @@ bool HexagonOptAddrMode::runOnMachineFunction(MachineFunction &MF) {
HII = HST.getInstrInfo();
HRI = HST.getRegisterInfo();
const auto &MDF = getAnalysis<MachineDominanceFrontier>();
- MDT = &getAnalysis<MachineDominatorTree>();
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
DataFlowGraph G(MF, *HII, *HRI, *MDT, MDF);
// Need to keep dead phis because we can propagate uses of registers into
diff --git a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp
index 4131f2a31755..3c17f6800114 100644
--- a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp
@@ -63,7 +63,7 @@ namespace {
HexagonRDFOpt() : MachineFunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
AU.addRequired<MachineDominanceFrontier>();
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
@@ -109,7 +109,7 @@ char HexagonRDFOpt::ID = 0;
INITIALIZE_PASS_BEGIN(HexagonRDFOpt, "hexagon-rdf-opt",
"Hexagon RDF optimizations", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
INITIALIZE_PASS_END(HexagonRDFOpt, "hexagon-rdf-opt",
"Hexagon RDF optimizations", false, false)
@@ -302,7 +302,7 @@ bool HexagonRDFOpt::runOnMachineFunction(MachineFunction &MF) {
RDFCount++;
}
- MDT = &getAnalysis<MachineDominatorTree>();
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
const auto &MDF = getAnalysis<MachineDominanceFrontier>();
const auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
const auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
index da8ab5c4b21b..5e713842ff67 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -55,10 +55,6 @@ static cl::opt<bool>
DisableHexagonMISched("disable-hexagon-misched", cl::Hidden,
cl::desc("Disable Hexagon MI Scheduling"));
-static cl::opt<bool> EnableSubregLiveness(
- "hexagon-subreg-liveness", cl::Hidden, cl::init(true),
- cl::desc("Enable subregister liveness tracking for Hexagon"));
-
static cl::opt<bool> OverrideLongCalls(
"hexagon-long-calls", cl::Hidden,
cl::desc("If present, forces/disables the use of long calls"));
@@ -726,9 +722,7 @@ unsigned HexagonSubtarget::getL1PrefetchDistance() const {
return 32;
}
-bool HexagonSubtarget::enableSubRegLiveness() const {
- return EnableSubregLiveness;
-}
+bool HexagonSubtarget::enableSubRegLiveness() const { return true; }
Intrinsic::ID HexagonSubtarget::getIntrinsicId(unsigned Opc) const {
struct Scalar {
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index e4886506de19..b362285d4f16 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -312,8 +312,7 @@ HexagonTargetMachine::getSubtargetImpl(const Function &F) const {
return I.get();
}
-void HexagonTargetMachine::registerPassBuilderCallbacks(
- PassBuilder &PB, bool PopulateClassToPassNames) {
+void HexagonTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
#define GET_PASS_REGISTRY "HexagonPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
index 34ff45b6acf3..6e9a78b76650 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -35,8 +35,7 @@ public:
~HexagonTargetMachine() override;
const HexagonSubtarget *getSubtargetImpl(const Function &F) const override;
- void registerPassBuilderCallbacks(PassBuilder &PB,
- bool PopulateClassToPassNames) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB) override;
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
TargetTransformInfo getTargetTransformInfo(const Function &F) const override;
diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index 56472d633694..2d5352b08cae 100644
--- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -97,9 +97,9 @@ namespace {
AU.setPreservesCFG();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<MachineBranchProbabilityInfo>();
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
AU.addRequired<MachineLoopInfo>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.addPreserved<MachineLoopInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -124,7 +124,7 @@ char HexagonPacketizer::ID = 0;
INITIALIZE_PASS_BEGIN(HexagonPacketizer, "hexagon-packetizer",
"Hexagon Packetizer", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index f9a0ba3608e6..54efe4bc25ef 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -712,18 +712,20 @@ public:
void finishLayout(MCAssembler const &Asm,
MCAsmLayout &Layout) const override {
+ SmallVector<MCFragment *> Frags;
for (auto *I : Layout.getSectionOrder()) {
- auto &Fragments = I->getFragmentList();
- for (auto &J : Fragments) {
- switch (J.getKind()) {
+ Frags.clear();
+ for (MCFragment &F : *I)
+ Frags.push_back(&F);
+ for (size_t J = 0, E = Frags.size(); J != E; ++J) {
+ switch (Frags[J]->getKind()) {
default:
break;
case MCFragment::FT_Align: {
- auto Size = Asm.computeFragmentSize(Layout, J);
- for (auto K = J.getIterator();
- K != Fragments.begin() && Size >= HEXAGON_PACKET_SIZE;) {
+ auto Size = Asm.computeFragmentSize(Layout, *Frags[J]);
+ for (auto K = J; K != 0 && Size >= HEXAGON_PACKET_SIZE;) {
--K;
- switch (K->getKind()) {
+ switch (Frags[K]->getKind()) {
default:
break;
case MCFragment::FT_Align: {
@@ -733,7 +735,7 @@ public:
}
case MCFragment::FT_Relaxable: {
MCContext &Context = Asm.getContext();
- auto &RF = cast<MCRelaxableFragment>(*K);
+ auto &RF = cast<MCRelaxableFragment>(*Frags[K]);
auto &Inst = const_cast<MCInst &>(RF.getInst());
while (Size > 0 &&
HexagonMCInstrInfo::bundleSize(Inst) < MaxPacketSize) {
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 9d7e4636abac..f0a18b42f481 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -785,6 +785,7 @@ SDValue LoongArchTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0);
+ SDValue Load;
switch (M) {
default:
@@ -796,33 +797,49 @@ SDValue LoongArchTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
// This is not actually used, but is necessary for successfully matching
// the PseudoLA_*_LARGE nodes.
SDValue Tmp = DAG.getConstant(0, DL, Ty);
- if (IsLocal)
+ if (IsLocal) {
// This generates the pattern (PseudoLA_PCREL_LARGE tmp sym), that
// eventually becomes the desired 5-insn code sequence.
- return SDValue(DAG.getMachineNode(LoongArch::PseudoLA_PCREL_LARGE, DL, Ty,
+ Load = SDValue(DAG.getMachineNode(LoongArch::PseudoLA_PCREL_LARGE, DL, Ty,
Tmp, Addr),
0);
-
- // This generates the pattern (PseudoLA_GOT_LARGE tmp sym), that eventually
- // becomes the desired 5-insn code sequence.
- return SDValue(
- DAG.getMachineNode(LoongArch::PseudoLA_GOT_LARGE, DL, Ty, Tmp, Addr),
- 0);
+ } else {
+ // This generates the pattern (PseudoLA_GOT_LARGE tmp sym), that
+ // eventually becomes the desired 5-insn code sequence.
+ Load = SDValue(
+ DAG.getMachineNode(LoongArch::PseudoLA_GOT_LARGE, DL, Ty, Tmp, Addr),
+ 0);
+ }
+ break;
}
case CodeModel::Small:
case CodeModel::Medium:
- if (IsLocal)
+ if (IsLocal) {
// This generates the pattern (PseudoLA_PCREL sym), which expands to
// (addi.w/d (pcalau12i %pc_hi20(sym)) %pc_lo12(sym)).
- return SDValue(
+ Load = SDValue(
DAG.getMachineNode(LoongArch::PseudoLA_PCREL, DL, Ty, Addr), 0);
+ } else {
+ // This generates the pattern (PseudoLA_GOT sym), which expands to (ld.w/d
+ // (pcalau12i %got_pc_hi20(sym)) %got_pc_lo12(sym)).
+ Load =
+ SDValue(DAG.getMachineNode(LoongArch::PseudoLA_GOT, DL, Ty, Addr), 0);
+ }
+ }
- // This generates the pattern (PseudoLA_GOT sym), which expands to (ld.w/d
- // (pcalau12i %got_pc_hi20(sym)) %got_pc_lo12(sym)).
- return SDValue(DAG.getMachineNode(LoongArch::PseudoLA_GOT, DL, Ty, Addr),
- 0);
+ if (!IsLocal) {
+ // Mark the load instruction as invariant to enable hoisting in MachineLICM.
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineMemOperand *MemOp = MF.getMachineMemOperand(
+ MachinePointerInfo::getGOT(MF),
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ LLT(Ty.getSimpleVT()), Align(Ty.getFixedSizeInBits() / 8));
+ DAG.setNodeMemRefs(cast<MachineSDNode>(Load.getNode()), {MemOp});
}
+
+ return Load;
}
SDValue LoongArchTargetLowering::lowerBlockAddress(SDValue Op,
@@ -860,7 +877,7 @@ SDValue LoongArchTargetLowering::lowerGlobalAddress(SDValue Op,
SDValue LoongArchTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N,
SelectionDAG &DAG,
- unsigned Opc,
+ unsigned Opc, bool UseGOT,
bool Large) const {
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
@@ -873,6 +890,16 @@ SDValue LoongArchTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N,
SDValue Offset = Large
? SDValue(DAG.getMachineNode(Opc, DL, Ty, Tmp, Addr), 0)
: SDValue(DAG.getMachineNode(Opc, DL, Ty, Addr), 0);
+ if (UseGOT) {
+ // Mark the load instruction as invariant to enable hoisting in MachineLICM.
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineMemOperand *MemOp = MF.getMachineMemOperand(
+ MachinePointerInfo::getGOT(MF),
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ LLT(Ty.getSimpleVT()), Align(Ty.getFixedSizeInBits() / 8));
+ DAG.setNodeMemRefs(cast<MachineSDNode>(Offset.getNode()), {MemOp});
+ }
// Add the thread pointer.
return DAG.getNode(ISD::ADD, DL, Ty, Offset,
@@ -945,6 +972,10 @@ LoongArchTargetLowering::lowerGlobalTLSAddress(SDValue Op,
GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
assert(N->getOffset() == 0 && "unexpected offset in global node");
+ if (DAG.getTarget().useEmulatedTLS())
+ report_fatal_error("the emulated TLS is prohibited",
+ /*GenCrashDiag=*/false);
+
bool IsDesc = DAG.getTarget().useTLSDESC();
switch (getTargetMachine().getTLSModel(N->getGlobal())) {
@@ -972,13 +1003,14 @@ LoongArchTargetLowering::lowerGlobalTLSAddress(SDValue Op,
return getStaticTLSAddr(N, DAG,
Large ? LoongArch::PseudoLA_TLS_IE_LARGE
: LoongArch::PseudoLA_TLS_IE,
- Large);
+ /*UseGOT=*/true, Large);
case TLSModel::LocalExec:
// This model is used when static linking as the TLS offsets are resolved
// during program linking.
//
// This node doesn't need an extra argument for the large code model.
- return getStaticTLSAddr(N, DAG, LoongArch::PseudoLA_TLS_LE);
+ return getStaticTLSAddr(N, DAG, LoongArch::PseudoLA_TLS_LE,
+ /*UseGOT=*/false);
}
return getTLSDescAddr(N, DAG,
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 9328831a17a3..f4c57f80fdbe 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -267,7 +267,7 @@ private:
SDValue getAddr(NodeTy *N, SelectionDAG &DAG, CodeModel::Model M,
bool IsLocal = true) const;
SDValue getStaticTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG,
- unsigned Opc, bool Large = false) const;
+ unsigned Opc, bool UseGOT, bool Large = false) const;
SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG,
unsigned Opc, bool Large = false) const;
SDValue getTLSDescAddr(GlobalAddressSDNode *N, SelectionDAG &DAG,
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
index 6b75634f5b2e..a85b054a85d7 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
@@ -229,6 +229,21 @@ unsigned LoongArchInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
return MI.getDesc().getSize();
}
+bool LoongArchInstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
+ const unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ default:
+ break;
+ case LoongArch::ADDI_D:
+ case LoongArch::ORI:
+ case LoongArch::XORI:
+ return (MI.getOperand(1).isReg() &&
+ MI.getOperand(1).getReg() == LoongArch::R0) ||
+ (MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0);
+ }
+ return MI.isAsCheapAsAMove();
+}
+
MachineBasicBlock *
LoongArchInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
assert(MI.getDesc().isBranch() && "Unexpected opcode!");
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
index 3b80f55bc84f..eb19051e380c 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
@@ -52,6 +52,8 @@ public:
unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
+ bool isAsCheapAsAMove(const MachineInstr &MI) const override;
+
MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override;
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index f72f46e39e2a..67c3dfd3b259 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -751,7 +751,8 @@ def XOR : ALU_3R<0x00158000>;
def ANDN : ALU_3R<0x00168000>;
def ORN : ALU_3R<0x00160000>;
def ANDI : ALU_2RI12<0x03400000, uimm12>;
-let isReMaterializable = 1 in {
+// See LoongArchInstrInfo::isAsCheapAsAMove for more details.
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def ORI : ALU_2RI12<0x03800000, uimm12_ori>;
def XORI : ALU_2RI12<0x03c00000, uimm12>;
}
@@ -858,7 +859,8 @@ def ADD_D : ALU_3R<0x00108000>;
def SUB_D : ALU_3R<0x00118000>;
// ADDI_D isn't always rematerializable, but isReMaterializable will be used as
// a hint which is verified in isReallyTriviallyReMaterializable.
-let isReMaterializable = 1 in {
+// See LoongArchInstrInfo::isAsCheapAsAMove for more details.
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def ADDI_D : ALU_2RI12<0x02c00000, simm12_addlike>;
}
def ADDU16I_D : ALU_2RI16<0x10000000, simm16>;
@@ -1580,13 +1582,26 @@ def PseudoLA_ABS_LARGE : Pseudo<(outs GPR:$dst),
"la.abs", "$dst, $src">;
def PseudoLA_PCREL : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
"la.pcrel", "$dst, $src">;
-let Defs = [R20], Size = 20 in
+def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
+ "la.tls.ld", "$dst, $src">;
+def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
+ "la.tls.gd", "$dst, $src">;
+let Defs = [R20], Size = 20 in {
def PseudoLA_PCREL_LARGE : Pseudo<(outs GPR:$dst),
(ins GPR:$tmp, bare_symbol:$src), [],
"la.pcrel", "$dst, $tmp, $src">,
Requires<[IsLA64]>;
def PseudoLA_TLS_LE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
"la.tls.le", "$dst, $src">;
+def PseudoLA_TLS_LD_LARGE : Pseudo<(outs GPR:$dst),
+ (ins GPR:$tmp, bare_symbol:$src), [],
+ "la.tls.ld", "$dst, $tmp, $src">,
+ Requires<[IsLA64]>;
+def PseudoLA_TLS_GD_LARGE : Pseudo<(outs GPR:$dst),
+ (ins GPR:$tmp, bare_symbol:$src), [],
+ "la.tls.gd", "$dst, $tmp, $src">,
+ Requires<[IsLA64]>;
+} // Defs = [R20], Size = 20
}
let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0,
isAsmParserOnly = 1 in {
@@ -1594,10 +1609,6 @@ def PseudoLA_GOT : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
"la.got", "$dst, $src">;
def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
"la.tls.ie", "$dst, $src">;
-def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
- "la.tls.ld", "$dst, $src">;
-def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
- "la.tls.gd", "$dst, $src">;
let Defs = [R20], Size = 20 in {
def PseudoLA_GOT_LARGE : Pseudo<(outs GPR:$dst),
(ins GPR:$tmp, bare_symbol:$src), [],
@@ -1607,14 +1618,6 @@ def PseudoLA_TLS_IE_LARGE : Pseudo<(outs GPR:$dst),
(ins GPR:$tmp, bare_symbol:$src), [],
"la.tls.ie", "$dst, $tmp, $src">,
Requires<[IsLA64]>;
-def PseudoLA_TLS_LD_LARGE : Pseudo<(outs GPR:$dst),
- (ins GPR:$tmp, bare_symbol:$src), [],
- "la.tls.ld", "$dst, $tmp, $src">,
- Requires<[IsLA64]>;
-def PseudoLA_TLS_GD_LARGE : Pseudo<(outs GPR:$dst),
- (ins GPR:$tmp, bare_symbol:$src), [],
- "la.tls.gd", "$dst, $tmp, $src">,
- Requires<[IsLA64]>;
} // Defs = [R20], Size = 20
}
@@ -2041,60 +2044,60 @@ multiclass ternary_atomic_op_failure_ord {
}]>;
}
-defm atomic_cmp_swap_32 : ternary_atomic_op_failure_ord;
-defm atomic_cmp_swap_64 : ternary_atomic_op_failure_ord;
+defm atomic_cmp_swap_i32 : ternary_atomic_op_failure_ord;
+defm atomic_cmp_swap_i64 : ternary_atomic_op_failure_ord;
let Predicates = [IsLA64] in {
def : AtomicPat<int_loongarch_masked_atomicrmw_xchg_i64,
PseudoMaskedAtomicSwap32>;
-def : Pat<(atomic_swap_32 GPR:$addr, GPR:$incr),
+def : Pat<(atomic_swap_i32 GPR:$addr, GPR:$incr),
(AMSWAP__DB_W GPR:$incr, GPR:$addr)>;
-def : Pat<(atomic_swap_64 GPR:$addr, GPR:$incr),
+def : Pat<(atomic_swap_i64 GPR:$addr, GPR:$incr),
(AMSWAP__DB_D GPR:$incr, GPR:$addr)>;
-def : Pat<(atomic_load_add_64 GPR:$rj, GPR:$rk),
+def : Pat<(atomic_load_add_i64 GPR:$rj, GPR:$rk),
(AMADD__DB_D GPR:$rk, GPR:$rj)>;
def : AtomicPat<int_loongarch_masked_atomicrmw_add_i64,
PseudoMaskedAtomicLoadAdd32>;
-def : Pat<(atomic_load_sub_32 GPR:$rj, GPR:$rk),
+def : Pat<(atomic_load_sub_i32 GPR:$rj, GPR:$rk),
(AMADD__DB_W (SUB_W R0, GPR:$rk), GPR:$rj)>;
-def : Pat<(atomic_load_sub_64 GPR:$rj, GPR:$rk),
+def : Pat<(atomic_load_sub_i64 GPR:$rj, GPR:$rk),
(AMADD__DB_D (SUB_D R0, GPR:$rk), GPR:$rj)>;
def : AtomicPat<int_loongarch_masked_atomicrmw_sub_i64,
PseudoMaskedAtomicLoadSub32>;
-defm : PseudoBinPat<"atomic_load_nand_64", PseudoAtomicLoadNand64>;
+defm : PseudoBinPat<"atomic_load_nand_i64", PseudoAtomicLoadNand64>;
def : AtomicPat<int_loongarch_masked_atomicrmw_nand_i64,
PseudoMaskedAtomicLoadNand32>;
-def : Pat<(atomic_load_add_32 GPR:$rj, GPR:$rk),
+def : Pat<(atomic_load_add_i32 GPR:$rj, GPR:$rk),
(AMADD__DB_W GPR:$rk, GPR:$rj)>;
-def : Pat<(atomic_load_and_32 GPR:$rj, GPR:$rk),
+def : Pat<(atomic_load_and_i32 GPR:$rj, GPR:$rk),
(AMAND__DB_W GPR:$rk, GPR:$rj)>;
-def : Pat<(atomic_load_and_64 GPR:$rj, GPR:$rk),
+def : Pat<(atomic_load_and_i64 GPR:$rj, GPR:$rk),
(AMAND__DB_D GPR:$rk, GPR:$rj)>;
-def : Pat<(atomic_load_or_32 GPR:$rj, GPR:$rk),
+def : Pat<(atomic_load_or_i32 GPR:$rj, GPR:$rk),
(AMOR__DB_W GPR:$rk, GPR:$rj)>;
-def : Pat<(atomic_load_or_64 GPR:$rj, GPR:$rk),
+def : Pat<(atomic_load_or_i64 GPR:$rj, GPR:$rk),
(AMOR__DB_D GPR:$rk, GPR:$rj)>;
-def : Pat<(atomic_load_xor_32 GPR:$rj, GPR:$rk),
+def : Pat<(atomic_load_xor_i32 GPR:$rj, GPR:$rk),
(AMXOR__DB_W GPR:$rk, GPR:$rj)>;
-def : Pat<(atomic_load_xor_64 GPR:$rj, GPR:$rk),
+def : Pat<(atomic_load_xor_i64 GPR:$rj, GPR:$rk),
(AMXOR__DB_D GPR:$rk, GPR:$rj)>;
-def : Pat<(atomic_load_umin_32 GPR:$rj, GPR:$rk),
+def : Pat<(atomic_load_umin_i32 GPR:$rj, GPR:$rk),
(AMMIN__DB_WU GPR:$rk, GPR:$rj)>;
-def : Pat<(atomic_load_umin_64 GPR:$rj, GPR:$rk),
+def : Pat<(atomic_load_umin_i64 GPR:$rj, GPR:$rk),
(AMMIN__DB_DU GPR:$rk, GPR:$rj)>;
-def : Pat<(atomic_load_umax_32 GPR:$rj, GPR:$rk),
+def : Pat<(atomic_load_umax_i32 GPR:$rj, GPR:$rk),
(AMMAX__DB_WU GPR:$rk, GPR:$rj)>;
-def : Pat<(atomic_load_umax_64 GPR:$rj, GPR:$rk),
+def : Pat<(atomic_load_umax_i64 GPR:$rj, GPR:$rk),
(AMMAX__DB_DU GPR:$rk, GPR:$rj)>;
-def : Pat<(atomic_load_min_32 GPR:$rj, GPR:$rk),
+def : Pat<(atomic_load_min_i32 GPR:$rj, GPR:$rk),
(AMMIN__DB_W GPR:$rk, GPR:$rj)>;
-def : Pat<(atomic_load_min_64 GPR:$rj, GPR:$rk),
+def : Pat<(atomic_load_min_i64 GPR:$rj, GPR:$rk),
(AMMIN__DB_D GPR:$rk, GPR:$rj)>;
-def : Pat<(atomic_load_max_32 GPR:$rj, GPR:$rk),
+def : Pat<(atomic_load_max_i32 GPR:$rj, GPR:$rk),
(AMMAX__DB_W GPR:$rk, GPR:$rj)>;
-def : Pat<(atomic_load_max_64 GPR:$rj, GPR:$rk),
+def : Pat<(atomic_load_max_i64 GPR:$rj, GPR:$rk),
(AMMAX__DB_D GPR:$rk, GPR:$rj)>;
def : AtomicPat<int_loongarch_masked_atomicrmw_umax_i64,
@@ -2118,8 +2121,8 @@ multiclass PseudoCmpXchgPat<string Op, Pseudo CmpXchgInst,
(CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 7)>;
}
-defm : PseudoCmpXchgPat<"atomic_cmp_swap_32", PseudoCmpXchg32>;
-defm : PseudoCmpXchgPat<"atomic_cmp_swap_64", PseudoCmpXchg64, i64>;
+defm : PseudoCmpXchgPat<"atomic_cmp_swap_i32", PseudoCmpXchg32>;
+defm : PseudoCmpXchgPat<"atomic_cmp_swap_i64", PseudoCmpXchg64, i64>;
def : Pat<(int_loongarch_masked_cmpxchg_i64
GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$fail_order),
(PseudoMaskedCmpXchg32
@@ -2131,23 +2134,23 @@ def : PseudoMaskedAMMinMaxPat<int_loongarch_masked_atomicrmw_min_i64,
PseudoMaskedAtomicLoadMin32>;
} // Predicates = [IsLA64]
-defm : PseudoBinPat<"atomic_load_nand_32", PseudoAtomicLoadNand32>;
+defm : PseudoBinPat<"atomic_load_nand_i32", PseudoAtomicLoadNand32>;
let Predicates = [IsLA32] in {
def : AtomicPat<int_loongarch_masked_atomicrmw_xchg_i32,
PseudoMaskedAtomicSwap32>;
-defm : PseudoBinPat<"atomic_swap_32", PseudoAtomicSwap32>;
+defm : PseudoBinPat<"atomic_swap_i32", PseudoAtomicSwap32>;
def : AtomicPat<int_loongarch_masked_atomicrmw_add_i32,
PseudoMaskedAtomicLoadAdd32>;
def : AtomicPat<int_loongarch_masked_atomicrmw_sub_i32,
PseudoMaskedAtomicLoadSub32>;
def : AtomicPat<int_loongarch_masked_atomicrmw_nand_i32,
PseudoMaskedAtomicLoadNand32>;
-defm : PseudoBinPat<"atomic_load_add_32", PseudoAtomicLoadAdd32>;
-defm : PseudoBinPat<"atomic_load_sub_32", PseudoAtomicLoadSub32>;
-defm : PseudoBinPat<"atomic_load_and_32", PseudoAtomicLoadAnd32>;
-defm : PseudoBinPat<"atomic_load_or_32", PseudoAtomicLoadOr32>;
-defm : PseudoBinPat<"atomic_load_xor_32", PseudoAtomicLoadXor32>;
+defm : PseudoBinPat<"atomic_load_add_i32", PseudoAtomicLoadAdd32>;
+defm : PseudoBinPat<"atomic_load_sub_i32", PseudoAtomicLoadSub32>;
+defm : PseudoBinPat<"atomic_load_and_i32", PseudoAtomicLoadAnd32>;
+defm : PseudoBinPat<"atomic_load_or_i32", PseudoAtomicLoadOr32>;
+defm : PseudoBinPat<"atomic_load_xor_i32", PseudoAtomicLoadXor32>;
} // Predicates = [IsLA32]
/// Intrinsics
diff --git a/llvm/lib/Target/M68k/M68kInstrAtomics.td b/llvm/lib/Target/M68k/M68kInstrAtomics.td
index 40c6593e2cfa..84a662533542 100644
--- a/llvm/lib/Target/M68k/M68kInstrAtomics.td
+++ b/llvm/lib/Target/M68k/M68kInstrAtomics.td
@@ -35,7 +35,7 @@ def CAS32 : MxCASOp<0x3, MxType32d>;
foreach size = [8, 16, 32] in {
- def : Pat<(!cast<SDPatternOperator>("atomic_cmp_swap_"#size) MxCP_ARI:$ptr,
+ def : Pat<(!cast<SDPatternOperator>("atomic_cmp_swap_i"#size) MxCP_ARI:$ptr,
!cast<MxRegOp>("MxDRD"#size):$cmp,
!cast<MxRegOp>("MxDRD"#size):$new),
(!cast<MxInst>("CAS"#size) !cast<MxRegOp>("MxDRD"#size):$cmp,
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
index e907e8d8a700..f861268c0015 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -90,8 +90,7 @@ void MipsELFStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
Labels.push_back(Symbol);
}
-void MipsELFStreamer::switchSection(MCSection *Section,
- const MCExpr *Subsection) {
+void MipsELFStreamer::switchSection(MCSection *Section, uint32_t Subsection) {
MCELFStreamer::switchSection(Section, Subsection);
Labels.clear();
}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
index 051806d2cfe8..1e8042e88c9e 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
@@ -50,8 +50,7 @@ public:
/// Overriding this function allows us to dismiss all labels that are
/// candidates for marking as microMIPS when .section directive is processed.
- void switchSection(MCSection *Section,
- const MCExpr *Subsection = nullptr) override;
+ void switchSection(MCSection *Section, uint32_t Subsection = 0) override;
/// Overriding these functions allows us to dismiss all labels that are
/// candidates for marking as microMIPS when .word/.long/.4byte etc
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
index f1aa90d24023..6b013de27477 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
@@ -20,7 +20,6 @@
using namespace llvm;
void MipsRegInfoRecord::EmitMipsOptionRecord() {
- MCAssembler &MCA = Streamer->getAssembler();
MipsTargetStreamer *MTS =
static_cast<MipsTargetStreamer *>(Streamer->getTargetStreamer());
@@ -36,7 +35,6 @@ void MipsRegInfoRecord::EmitMipsOptionRecord() {
MCSectionELF *Sec =
Context.getELFSection(".MIPS.options", ELF::SHT_MIPS_OPTIONS,
ELF::SHF_ALLOC | ELF::SHF_MIPS_NOSTRIP, 1);
- MCA.registerSection(*Sec);
Sec->setAlignment(Align(8));
Streamer->switchSection(Sec);
@@ -54,7 +52,6 @@ void MipsRegInfoRecord::EmitMipsOptionRecord() {
} else {
MCSectionELF *Sec = Context.getELFSection(".reginfo", ELF::SHT_MIPS_REGINFO,
ELF::SHF_ALLOC, 24);
- MCA.registerSection(*Sec);
Sec->setAlignment(MTS->getABI().IsN32() ? Align(8) : Align(4));
Streamer->switchSection(Sec);
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index bdf299ae8ac1..2f39d091d86d 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -890,14 +890,15 @@ void MipsTargetELFStreamer::emitLabel(MCSymbol *S) {
void MipsTargetELFStreamer::finish() {
MCAssembler &MCA = getStreamer().getAssembler();
const MCObjectFileInfo &OFI = *MCA.getContext().getObjectFileInfo();
+ MCELFStreamer &S = getStreamer();
// .bss, .text and .data are always at least 16-byte aligned.
MCSection &TextSection = *OFI.getTextSection();
- MCA.registerSection(TextSection);
+ S.changeSection(&TextSection);
MCSection &DataSection = *OFI.getDataSection();
- MCA.registerSection(DataSection);
+ S.changeSection(&DataSection);
MCSection &BSSSection = *OFI.getBSSSection();
- MCA.registerSection(BSSSection);
+ S.changeSection(&BSSSection);
TextSection.ensureMinAlignment(Align(16));
DataSection.ensureMinAlignment(Align(16));
@@ -908,16 +909,15 @@ void MipsTargetELFStreamer::finish() {
// verifying the output of IAS against the output of other assemblers but
// it's not necessary to produce a correct object and increases section
// size.
- MCStreamer &OS = getStreamer();
- for (MCSection &S : MCA) {
- MCSectionELF &Section = static_cast<MCSectionELF &>(S);
+ for (MCSection &Sec : MCA) {
+ MCSectionELF &Section = static_cast<MCSectionELF &>(Sec);
Align Alignment = Section.getAlign();
- OS.switchSection(&Section);
+ S.switchSection(&Section);
if (Section.useCodeAlign())
- OS.emitCodeAlignment(Alignment, &STI, Alignment.value());
+ S.emitCodeAlignment(Alignment, &STI, Alignment.value());
else
- OS.emitValueToAlignment(Alignment, 0, 1, Alignment.value());
+ S.emitValueToAlignment(Alignment, 0, 1, Alignment.value());
}
}
@@ -1015,19 +1015,15 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) {
MCContext &Context = MCA.getContext();
MCStreamer &OS = getStreamer();
+ OS.pushSection();
MCSectionELF *Sec = Context.getELFSection(".pdr", ELF::SHT_PROGBITS, 0);
+ OS.switchSection(Sec);
+ Sec->setAlignment(Align(4));
MCSymbol *Sym = Context.getOrCreateSymbol(Name);
const MCSymbolRefExpr *ExprRef =
MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Context);
- MCA.registerSection(*Sec);
- Sec->setAlignment(Align(4));
-
- OS.pushSection();
-
- OS.switchSection(Sec);
-
OS.emitValueImpl(ExprRef, 4);
OS.emitIntValue(GPRInfoSet ? GPRBitMask : 0, 4); // reg_mask
@@ -1306,9 +1302,8 @@ void MipsTargetELFStreamer::emitMipsAbiFlags() {
MCStreamer &OS = getStreamer();
MCSectionELF *Sec = Context.getELFSection(
".MIPS.abiflags", ELF::SHT_MIPS_ABIFLAGS, ELF::SHF_ALLOC, 24);
- MCA.registerSection(*Sec);
- Sec->setAlignment(Align(8));
OS.switchSection(Sec);
+ Sec->setAlignment(Align(8));
OS << ABIFlagsSection;
}
diff --git a/llvm/lib/Target/Mips/Mips64InstrInfo.td b/llvm/lib/Target/Mips/Mips64InstrInfo.td
index c0e7eef8dd9d..f6ac3091a3ba 100644
--- a/llvm/lib/Target/Mips/Mips64InstrInfo.td
+++ b/llvm/lib/Target/Mips/Mips64InstrInfo.td
@@ -75,18 +75,18 @@ def assertzext_lt_i32 : PatFrag<(ops node:$src), (assertzext node:$src), [{
// Instructions specific format
//===----------------------------------------------------------------------===//
let usesCustomInserter = 1 in {
- def ATOMIC_LOAD_ADD_I64 : Atomic2Ops<atomic_load_add_64, GPR64>;
- def ATOMIC_LOAD_SUB_I64 : Atomic2Ops<atomic_load_sub_64, GPR64>;
- def ATOMIC_LOAD_AND_I64 : Atomic2Ops<atomic_load_and_64, GPR64>;
- def ATOMIC_LOAD_OR_I64 : Atomic2Ops<atomic_load_or_64, GPR64>;
- def ATOMIC_LOAD_XOR_I64 : Atomic2Ops<atomic_load_xor_64, GPR64>;
- def ATOMIC_LOAD_NAND_I64 : Atomic2Ops<atomic_load_nand_64, GPR64>;
- def ATOMIC_SWAP_I64 : Atomic2Ops<atomic_swap_64, GPR64>;
- def ATOMIC_CMP_SWAP_I64 : AtomicCmpSwap<atomic_cmp_swap_64, GPR64>;
- def ATOMIC_LOAD_MIN_I64 : Atomic2Ops<atomic_load_min_64, GPR64>;
- def ATOMIC_LOAD_MAX_I64 : Atomic2Ops<atomic_load_max_64, GPR64>;
- def ATOMIC_LOAD_UMIN_I64 : Atomic2Ops<atomic_load_umin_64, GPR64>;
- def ATOMIC_LOAD_UMAX_I64 : Atomic2Ops<atomic_load_umax_64, GPR64>;
+ def ATOMIC_LOAD_ADD_I64 : Atomic2Ops<atomic_load_add_i64, GPR64>;
+ def ATOMIC_LOAD_SUB_I64 : Atomic2Ops<atomic_load_sub_i64, GPR64>;
+ def ATOMIC_LOAD_AND_I64 : Atomic2Ops<atomic_load_and_i64, GPR64>;
+ def ATOMIC_LOAD_OR_I64 : Atomic2Ops<atomic_load_or_i64, GPR64>;
+ def ATOMIC_LOAD_XOR_I64 : Atomic2Ops<atomic_load_xor_i64, GPR64>;
+ def ATOMIC_LOAD_NAND_I64 : Atomic2Ops<atomic_load_nand_i64, GPR64>;
+ def ATOMIC_SWAP_I64 : Atomic2Ops<atomic_swap_i64, GPR64>;
+ def ATOMIC_CMP_SWAP_I64 : AtomicCmpSwap<atomic_cmp_swap_i64, GPR64>;
+ def ATOMIC_LOAD_MIN_I64 : Atomic2Ops<atomic_load_min_i64, GPR64>;
+ def ATOMIC_LOAD_MAX_I64 : Atomic2Ops<atomic_load_max_i64, GPR64>;
+ def ATOMIC_LOAD_UMIN_I64 : Atomic2Ops<atomic_load_umin_i64, GPR64>;
+ def ATOMIC_LOAD_UMAX_I64 : Atomic2Ops<atomic_load_umax_i64, GPR64>;
}
def ATOMIC_LOAD_ADD_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index dda33f9a1808..62dfa5f71106 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -1014,7 +1014,7 @@ void MipsAsmPrinter::EmitFPCallStub(
MCSectionELF *M = OutContext.getELFSection(
".mips16.call.fp." + std::string(Symbol), ELF::SHT_PROGBITS,
ELF::SHF_ALLOC | ELF::SHF_EXECINSTR);
- OutStreamer->switchSection(M, nullptr);
+ OutStreamer->switchSection(M);
//
// .align 2
//
diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.td b/llvm/lib/Target/Mips/MipsInstrInfo.td
index 23e04c442bf6..85e3e78d2a4d 100644
--- a/llvm/lib/Target/Mips/MipsInstrInfo.td
+++ b/llvm/lib/Target/Mips/MipsInstrInfo.td
@@ -1904,45 +1904,45 @@ def ADJCALLSTACKUP : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
}
let usesCustomInserter = 1 in {
- def ATOMIC_LOAD_ADD_I8 : Atomic2Ops<atomic_load_add_8, GPR32>;
- def ATOMIC_LOAD_ADD_I16 : Atomic2Ops<atomic_load_add_16, GPR32>;
- def ATOMIC_LOAD_ADD_I32 : Atomic2Ops<atomic_load_add_32, GPR32>;
- def ATOMIC_LOAD_SUB_I8 : Atomic2Ops<atomic_load_sub_8, GPR32>;
- def ATOMIC_LOAD_SUB_I16 : Atomic2Ops<atomic_load_sub_16, GPR32>;
- def ATOMIC_LOAD_SUB_I32 : Atomic2Ops<atomic_load_sub_32, GPR32>;
- def ATOMIC_LOAD_AND_I8 : Atomic2Ops<atomic_load_and_8, GPR32>;
- def ATOMIC_LOAD_AND_I16 : Atomic2Ops<atomic_load_and_16, GPR32>;
- def ATOMIC_LOAD_AND_I32 : Atomic2Ops<atomic_load_and_32, GPR32>;
- def ATOMIC_LOAD_OR_I8 : Atomic2Ops<atomic_load_or_8, GPR32>;
- def ATOMIC_LOAD_OR_I16 : Atomic2Ops<atomic_load_or_16, GPR32>;
- def ATOMIC_LOAD_OR_I32 : Atomic2Ops<atomic_load_or_32, GPR32>;
- def ATOMIC_LOAD_XOR_I8 : Atomic2Ops<atomic_load_xor_8, GPR32>;
- def ATOMIC_LOAD_XOR_I16 : Atomic2Ops<atomic_load_xor_16, GPR32>;
- def ATOMIC_LOAD_XOR_I32 : Atomic2Ops<atomic_load_xor_32, GPR32>;
- def ATOMIC_LOAD_NAND_I8 : Atomic2Ops<atomic_load_nand_8, GPR32>;
- def ATOMIC_LOAD_NAND_I16 : Atomic2Ops<atomic_load_nand_16, GPR32>;
- def ATOMIC_LOAD_NAND_I32 : Atomic2Ops<atomic_load_nand_32, GPR32>;
-
- def ATOMIC_SWAP_I8 : Atomic2Ops<atomic_swap_8, GPR32>;
- def ATOMIC_SWAP_I16 : Atomic2Ops<atomic_swap_16, GPR32>;
- def ATOMIC_SWAP_I32 : Atomic2Ops<atomic_swap_32, GPR32>;
-
- def ATOMIC_CMP_SWAP_I8 : AtomicCmpSwap<atomic_cmp_swap_8, GPR32>;
- def ATOMIC_CMP_SWAP_I16 : AtomicCmpSwap<atomic_cmp_swap_16, GPR32>;
- def ATOMIC_CMP_SWAP_I32 : AtomicCmpSwap<atomic_cmp_swap_32, GPR32>;
-
- def ATOMIC_LOAD_MIN_I8 : Atomic2Ops<atomic_load_min_8, GPR32>;
- def ATOMIC_LOAD_MIN_I16 : Atomic2Ops<atomic_load_min_16, GPR32>;
- def ATOMIC_LOAD_MIN_I32 : Atomic2Ops<atomic_load_min_32, GPR32>;
- def ATOMIC_LOAD_MAX_I8 : Atomic2Ops<atomic_load_max_8, GPR32>;
- def ATOMIC_LOAD_MAX_I16 : Atomic2Ops<atomic_load_max_16, GPR32>;
- def ATOMIC_LOAD_MAX_I32 : Atomic2Ops<atomic_load_max_32, GPR32>;
- def ATOMIC_LOAD_UMIN_I8 : Atomic2Ops<atomic_load_umin_8, GPR32>;
- def ATOMIC_LOAD_UMIN_I16 : Atomic2Ops<atomic_load_umin_16, GPR32>;
- def ATOMIC_LOAD_UMIN_I32 : Atomic2Ops<atomic_load_umin_32, GPR32>;
- def ATOMIC_LOAD_UMAX_I8 : Atomic2Ops<atomic_load_umax_8, GPR32>;
- def ATOMIC_LOAD_UMAX_I16 : Atomic2Ops<atomic_load_umax_16, GPR32>;
- def ATOMIC_LOAD_UMAX_I32 : Atomic2Ops<atomic_load_umax_32, GPR32>;
+ def ATOMIC_LOAD_ADD_I8 : Atomic2Ops<atomic_load_add_i8, GPR32>;
+ def ATOMIC_LOAD_ADD_I16 : Atomic2Ops<atomic_load_add_i16, GPR32>;
+ def ATOMIC_LOAD_ADD_I32 : Atomic2Ops<atomic_load_add_i32, GPR32>;
+ def ATOMIC_LOAD_SUB_I8 : Atomic2Ops<atomic_load_sub_i8, GPR32>;
+ def ATOMIC_LOAD_SUB_I16 : Atomic2Ops<atomic_load_sub_i16, GPR32>;
+ def ATOMIC_LOAD_SUB_I32 : Atomic2Ops<atomic_load_sub_i32, GPR32>;
+ def ATOMIC_LOAD_AND_I8 : Atomic2Ops<atomic_load_and_i8, GPR32>;
+ def ATOMIC_LOAD_AND_I16 : Atomic2Ops<atomic_load_and_i16, GPR32>;
+ def ATOMIC_LOAD_AND_I32 : Atomic2Ops<atomic_load_and_i32, GPR32>;
+ def ATOMIC_LOAD_OR_I8 : Atomic2Ops<atomic_load_or_i8, GPR32>;
+ def ATOMIC_LOAD_OR_I16 : Atomic2Ops<atomic_load_or_i16, GPR32>;
+ def ATOMIC_LOAD_OR_I32 : Atomic2Ops<atomic_load_or_i32, GPR32>;
+ def ATOMIC_LOAD_XOR_I8 : Atomic2Ops<atomic_load_xor_i8, GPR32>;
+ def ATOMIC_LOAD_XOR_I16 : Atomic2Ops<atomic_load_xor_i16, GPR32>;
+ def ATOMIC_LOAD_XOR_I32 : Atomic2Ops<atomic_load_xor_i32, GPR32>;
+ def ATOMIC_LOAD_NAND_I8 : Atomic2Ops<atomic_load_nand_i8, GPR32>;
+ def ATOMIC_LOAD_NAND_I16 : Atomic2Ops<atomic_load_nand_i16, GPR32>;
+ def ATOMIC_LOAD_NAND_I32 : Atomic2Ops<atomic_load_nand_i32, GPR32>;
+
+ def ATOMIC_SWAP_I8 : Atomic2Ops<atomic_swap_i8, GPR32>;
+ def ATOMIC_SWAP_I16 : Atomic2Ops<atomic_swap_i16, GPR32>;
+ def ATOMIC_SWAP_I32 : Atomic2Ops<atomic_swap_i32, GPR32>;
+
+ def ATOMIC_CMP_SWAP_I8 : AtomicCmpSwap<atomic_cmp_swap_i8, GPR32>;
+ def ATOMIC_CMP_SWAP_I16 : AtomicCmpSwap<atomic_cmp_swap_i16, GPR32>;
+ def ATOMIC_CMP_SWAP_I32 : AtomicCmpSwap<atomic_cmp_swap_i32, GPR32>;
+
+ def ATOMIC_LOAD_MIN_I8 : Atomic2Ops<atomic_load_min_i8, GPR32>;
+ def ATOMIC_LOAD_MIN_I16 : Atomic2Ops<atomic_load_min_i16, GPR32>;
+ def ATOMIC_LOAD_MIN_I32 : Atomic2Ops<atomic_load_min_i32, GPR32>;
+ def ATOMIC_LOAD_MAX_I8 : Atomic2Ops<atomic_load_max_i8, GPR32>;
+ def ATOMIC_LOAD_MAX_I16 : Atomic2Ops<atomic_load_max_i16, GPR32>;
+ def ATOMIC_LOAD_MAX_I32 : Atomic2Ops<atomic_load_max_i32, GPR32>;
+ def ATOMIC_LOAD_UMIN_I8 : Atomic2Ops<atomic_load_umin_i8, GPR32>;
+ def ATOMIC_LOAD_UMIN_I16 : Atomic2Ops<atomic_load_umin_i16, GPR32>;
+ def ATOMIC_LOAD_UMIN_I32 : Atomic2Ops<atomic_load_umin_i32, GPR32>;
+ def ATOMIC_LOAD_UMAX_I8 : Atomic2Ops<atomic_load_umax_i8, GPR32>;
+ def ATOMIC_LOAD_UMAX_I16 : Atomic2Ops<atomic_load_umax_i16, GPR32>;
+ def ATOMIC_LOAD_UMAX_I32 : Atomic2Ops<atomic_load_umax_i32, GPR32>;
}
def ATOMIC_LOAD_ADD_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
diff --git a/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp b/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp
index 902c7ceb869a..7ceb97642bba 100644
--- a/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp
+++ b/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp
@@ -83,7 +83,7 @@ public:
bool runOnMachineFunction(MachineFunction &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -197,7 +197,8 @@ bool OptimizePICCall::runOnMachineFunction(MachineFunction &F) {
return false;
// Do a pre-order traversal of the dominator tree.
- MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
+ MachineDominatorTree *MDT =
+ &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
bool Changed = false;
SmallVector<MBBInfo, 8> WorkList(1, MBBInfo(MDT->getRootNode()));
diff --git a/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp b/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp
index 0578655f0443..bd8a065011c9 100644
--- a/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp
@@ -110,8 +110,8 @@ void MipsPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<GISelKnownBitsAnalysis>();
AU.addPreserved<GISelKnownBitsAnalysis>();
if (!IsOptNone) {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
}
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -139,7 +139,8 @@ bool MipsPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
MachineDominatorTree *MDT =
- IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+ IsOptNone ? nullptr
+ : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
LI, EnableOpt, F.hasOptSize(), F.hasMinSize());
MipsPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, /*CSEInfo*/ nullptr,
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
index 1aff608cfffb..fc207b1a8871 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
@@ -46,8 +46,7 @@ static bool isDwarfSection(const MCObjectFileInfo *FI,
const MCSection *Section) {
// FIXME: the checks for the DWARF sections are very fragile and should be
// fixed up in a followup patch.
- if (!Section || Section->getKind().isText() ||
- Section->getKind().isWriteable())
+ if (!Section || Section->isText())
return false;
return Section == FI->getDwarfAbbrevSection() ||
Section == FI->getDwarfInfoSection() ||
@@ -84,8 +83,7 @@ static bool isDwarfSection(const MCObjectFileInfo *FI,
}
void NVPTXTargetStreamer::changeSection(const MCSection *CurSection,
- MCSection *Section,
- const MCExpr *SubSection,
+ MCSection *Section, uint32_t SubSection,
raw_ostream &OS) {
assert(!SubSection && "SubSection is not null!");
const MCObjectFileInfo *FI = getStreamer().getContext().getObjectFileInfo();
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h
index b0d8978ee685..ca0d84ee2079 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h
@@ -43,7 +43,7 @@ public:
/// functions.
void emitDwarfFileDirective(StringRef Directive) override;
void changeSection(const MCSection *CurSection, MCSection *Section,
- const MCExpr *SubSection, raw_ostream &OS) override;
+ uint32_t SubSection, raw_ostream &OS) override;
/// Emit the bytes in \p Data into the output.
///
/// This is used to emit bytes in \p Data as sequence of .byte directives.
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 82770f866085..ca077d41d36b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1845,6 +1845,10 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
auto AddIntToBuffer = [AggBuffer, Bytes](const APInt &Val) {
size_t NumBytes = (Val.getBitWidth() + 7) / 8;
SmallVector<unsigned char, 16> Buf(NumBytes);
+ // `extractBitsAsZExtValue` does not allow the extraction of bits beyond the
+ // input's bit width, and i1 arrays may not have a length that is a multuple
+ // of 8. We handle the last byte separately, so we never request out of
+ // bounds bits.
for (unsigned I = 0; I < NumBytes - 1; ++I) {
Buf[I] = Val.extractBitsAsZExtValue(8, I * 8);
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index f4ef7c9914f1..8cb83d8322b8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -3232,9 +3232,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
if (NumElts != 1)
return std::nullopt;
Align PartAlign =
- (Offsets[parti] == 0 && PAL.getParamAlignment(i))
- ? PAL.getParamAlignment(i).value()
- : DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext()));
+ DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext()));
return commonAlignment(PartAlign, Offsets[parti]);
}();
SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
@@ -5038,7 +5036,9 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
/// ensures that alignment is 16 or greater.
Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
const Function *F, Type *ArgTy, const DataLayout &DL) const {
- const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value();
+ // Capping the alignment to 128 bytes as that is the maximum alignment
+ // supported by PTX.
+ const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy));
// If a function has linkage different from internal or private, we
// must use default ABI alignment as external users rely on it. Same
@@ -5048,10 +5048,10 @@ Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
/*IgnoreCallbackUses=*/false,
/*IgnoreAssumeLikeCalls=*/true,
/*IgnoreLLVMUsed=*/true))
- return Align(ABITypeAlign);
+ return ABITypeAlign;
assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
- return Align(std::max(uint64_t(16), ABITypeAlign));
+ return std::max(Align(16), ABITypeAlign);
}
/// Helper for computing alignment of a device function byval parameter.
@@ -5215,103 +5215,131 @@ bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
}
+static bool isConstZero(const SDValue &Operand) {
+ const auto *Const = dyn_cast<ConstantSDNode>(Operand);
+ return Const && Const->getZExtValue() == 0;
+}
+
/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
/// operands.
-static SDValue PerformADDCombineWithOperands(
- SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI,
- const NVPTXSubtarget &Subtarget, CodeGenOptLevel OptLevel) {
- SelectionDAG &DAG = DCI.DAG;
- // Skip non-integer, non-scalar case
- EVT VT=N0.getValueType();
- if (VT.isVector())
+static SDValue
+PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT VT = N0.getValueType();
+
+ // Since integer multiply-add costs the same as integer multiply
+ // but is more costly than integer add, do the fusion only when
+ // the mul is only used in the add.
+ // TODO: this may not be true for later architectures, consider relaxing this
+ if (!N0.getNode()->hasOneUse())
return SDValue();
// fold (add (mul a, b), c) -> (mad a, b, c)
//
- if (N0.getOpcode() == ISD::MUL) {
- assert (VT.isInteger());
- // For integer:
- // Since integer multiply-add costs the same as integer multiply
- // but is more costly than integer add, do the fusion only when
- // the mul is only used in the add.
- if (OptLevel == CodeGenOptLevel::None || VT != MVT::i32 ||
- !N0.getNode()->hasOneUse())
+ if (N0.getOpcode() == ISD::MUL)
+ return DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, N0.getOperand(0),
+ N0.getOperand(1), N1);
+
+ // fold (add (select cond, 0, (mul a, b)), c)
+ // -> (select cond, c, (mad a, b, c))
+ //
+ if (N0.getOpcode() == ISD::SELECT) {
+ unsigned ZeroOpNum;
+ if (isConstZero(N0->getOperand(1)))
+ ZeroOpNum = 1;
+ else if (isConstZero(N0->getOperand(2)))
+ ZeroOpNum = 2;
+ else
+ return SDValue();
+
+ SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
+ if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
return SDValue();
- // Do the folding
- return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
- N0.getOperand(0), N0.getOperand(1), N1);
+ SDValue MAD = DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
+ M->getOperand(0), M->getOperand(1), N1);
+ return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
+ ((ZeroOpNum == 1) ? N1 : MAD),
+ ((ZeroOpNum == 1) ? MAD : N1));
}
- else if (N0.getOpcode() == ISD::FMUL) {
- if (VT == MVT::f32 || VT == MVT::f64) {
- const auto *TLI = static_cast<const NVPTXTargetLowering *>(
- &DAG.getTargetLoweringInfo());
- if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
- return SDValue();
- // For floating point:
- // Do the fusion only when the mul has less than 5 uses and all
- // are add.
- // The heuristic is that if a use is not an add, then that use
- // cannot be fused into fma, therefore mul is still needed anyway.
- // If there are more than 4 uses, even if they are all add, fusing
- // them will increase register pressue.
- //
- int numUses = 0;
- int nonAddCount = 0;
- for (const SDNode *User : N0.getNode()->uses()) {
- numUses++;
- if (User->getOpcode() != ISD::FADD)
- ++nonAddCount;
- }
+ return SDValue();
+}
+
+static SDValue
+PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
+ TargetLowering::DAGCombinerInfo &DCI,
+ CodeGenOptLevel OptLevel) {
+ EVT VT = N0.getValueType();
+ if (N0.getOpcode() == ISD::FMUL) {
+ const auto *TLI = static_cast<const NVPTXTargetLowering *>(
+ &DCI.DAG.getTargetLoweringInfo());
+ if (!TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel))
+ return SDValue();
+
+ // For floating point:
+ // Do the fusion only when the mul has less than 5 uses and all
+ // are add.
+ // The heuristic is that if a use is not an add, then that use
+ // cannot be fused into fma, therefore mul is still needed anyway.
+ // If there are more than 4 uses, even if they are all add, fusing
+ // them will increase register pressue.
+ //
+ int numUses = 0;
+ int nonAddCount = 0;
+ for (const SDNode *User : N0.getNode()->uses()) {
+ numUses++;
+ if (User->getOpcode() != ISD::FADD)
+ ++nonAddCount;
if (numUses >= 5)
return SDValue();
- if (nonAddCount) {
- int orderNo = N->getIROrder();
- int orderNo2 = N0.getNode()->getIROrder();
- // simple heuristics here for considering potential register
- // pressure, the logics here is that the differnce are used
- // to measure the distance between def and use, the longer distance
- // more likely cause register pressure.
- if (orderNo - orderNo2 < 500)
- return SDValue();
-
- // Now, check if at least one of the FMUL's operands is live beyond the node N,
- // which guarantees that the FMA will not increase register pressure at node N.
- bool opIsLive = false;
- const SDNode *left = N0.getOperand(0).getNode();
- const SDNode *right = N0.getOperand(1).getNode();
-
- if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
- opIsLive = true;
-
- if (!opIsLive)
- for (const SDNode *User : left->uses()) {
- int orderNo3 = User->getIROrder();
- if (orderNo3 > orderNo) {
- opIsLive = true;
- break;
- }
- }
+ }
+ if (nonAddCount) {
+ int orderNo = N->getIROrder();
+ int orderNo2 = N0.getNode()->getIROrder();
+ // simple heuristics here for considering potential register
+ // pressure, the logics here is that the differnce are used
+ // to measure the distance between def and use, the longer distance
+ // more likely cause register pressure.
+ if (orderNo - orderNo2 < 500)
+ return SDValue();
- if (!opIsLive)
- for (const SDNode *User : right->uses()) {
- int orderNo3 = User->getIROrder();
- if (orderNo3 > orderNo) {
- opIsLive = true;
- break;
- }
+ // Now, check if at least one of the FMUL's operands is live beyond the
+ // node N, which guarantees that the FMA will not increase register
+ // pressure at node N.
+ bool opIsLive = false;
+ const SDNode *left = N0.getOperand(0).getNode();
+ const SDNode *right = N0.getOperand(1).getNode();
+
+ if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
+ opIsLive = true;
+
+ if (!opIsLive)
+ for (const SDNode *User : left->uses()) {
+ int orderNo3 = User->getIROrder();
+ if (orderNo3 > orderNo) {
+ opIsLive = true;
+ break;
}
+ }
- if (!opIsLive)
- return SDValue();
- }
+ if (!opIsLive)
+ for (const SDNode *User : right->uses()) {
+ int orderNo3 = User->getIROrder();
+ if (orderNo3 > orderNo) {
+ opIsLive = true;
+ break;
+ }
+ }
- return DAG.getNode(ISD::FMA, SDLoc(N), VT,
- N0.getOperand(0), N0.getOperand(1), N1);
+ if (!opIsLive)
+ return SDValue();
}
+
+ return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
+ N0.getOperand(1), N1);
}
return SDValue();
@@ -5332,18 +5360,44 @@ static SDValue PerformStoreRetvalCombine(SDNode *N) {
///
static SDValue PerformADDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
- const NVPTXSubtarget &Subtarget,
+ CodeGenOptLevel OptLevel) {
+ if (OptLevel == CodeGenOptLevel::None)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Skip non-integer, non-scalar case
+ EVT VT = N0.getValueType();
+ if (VT.isVector() || VT != MVT::i32)
+ return SDValue();
+
+ // First try with the default operand order.
+ if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
+ return Result;
+
+ // If that didn't work, try again with the operands commuted.
+ return PerformADDCombineWithOperands(N, N1, N0, DCI);
+}
+
+/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
+///
+static SDValue PerformFADDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
CodeGenOptLevel OptLevel) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
+ return SDValue();
+
// First try with the default operand order.
- if (SDValue Result =
- PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
+ if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
return Result;
// If that didn't work, try again with the operands commuted.
- return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
+ return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
}
static SDValue PerformANDCombine(SDNode *N,
@@ -5876,8 +5930,9 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
switch (N->getOpcode()) {
default: break;
case ISD::ADD:
+ return PerformADDCombine(N, DCI, OptLevel);
case ISD::FADD:
- return PerformADDCombine(N, DCI, STI, OptLevel);
+ return PerformFADDCombine(N, DCI, OptLevel);
case ISD::MUL:
return PerformMULCombine(N, DCI, OptLevel);
case ISD::SHL:
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 440af085cb8e..c81dfa68e4bd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -265,7 +265,7 @@ multiclass MATCH_ANY_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntO
// activemask.b32
def ACTIVEMASK : NVPTXInst<(outs Int32Regs:$dest), (ins),
- "activemask.b32 \t$dest;",
+ "activemask.b32 \t$dest;",
[(set Int32Regs:$dest, (int_nvvm_activemask))]>,
Requires<[hasPTX<62>, hasSM<30>]>;
@@ -1618,18 +1618,18 @@ multiclass F_ATOMIC_3<ValueType regT, NVPTXRegClass regclass, string SpaceStr, s
// atom_add
-def atomic_load_add_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
- (atomic_load_add_32 node:$a, node:$b)>;
-def atomic_load_add_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (atomic_load_add_32 node:$a, node:$b)>;
-def atomic_load_add_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (atomic_load_add_32 node:$a, node:$b)>;
-def atomic_load_add_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
- (atomic_load_add_64 node:$a, node:$b)>;
-def atomic_load_add_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (atomic_load_add_64 node:$a, node:$b)>;
-def atomic_load_add_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (atomic_load_add_64 node:$a, node:$b)>;
+def atomic_load_add_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_add_i32 node:$a, node:$b)>;
+def atomic_load_add_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_add_i32 node:$a, node:$b)>;
+def atomic_load_add_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_add_i32 node:$a, node:$b)>;
+def atomic_load_add_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_add_i64 node:$a, node:$b)>;
+def atomic_load_add_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_add_i64 node:$a, node:$b)>;
+def atomic_load_add_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_add_i64 node:$a, node:$b)>;
def atomic_load_add_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
(atomic_load_fadd node:$a, node:$b)>;
def atomic_load_add_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
@@ -1638,22 +1638,22 @@ def atomic_load_add_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_fadd node:$a, node:$b)>;
defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32", ".add",
- atomic_load_add_32_g, i32imm, imm>;
+ atomic_load_add_i32_g, i32imm, imm>;
defm INT_PTX_ATOM_ADD_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".u32", ".add",
- atomic_load_add_32_s, i32imm, imm>;
+ atomic_load_add_i32_s, i32imm, imm>;
defm INT_PTX_ATOM_ADD_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".u32", ".add",
- atomic_load_add_32_gen, i32imm, imm>;
+ atomic_load_add_i32_gen, i32imm, imm>;
defm INT_PTX_ATOM_ADD_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32",
- ".add", atomic_load_add_32_gen, i32imm, imm>;
+ ".add", atomic_load_add_i32_gen, i32imm, imm>;
defm INT_PTX_ATOM_ADD_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".u64", ".add",
- atomic_load_add_64_g, i64imm, imm>;
+ atomic_load_add_i64_g, i64imm, imm>;
defm INT_PTX_ATOM_ADD_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".u64", ".add",
- atomic_load_add_64_s, i64imm, imm>;
+ atomic_load_add_i64_s, i64imm, imm>;
defm INT_PTX_ATOM_ADD_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".u64", ".add",
- atomic_load_add_64_gen, i64imm, imm>;
+ atomic_load_add_i64_gen, i64imm, imm>;
defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".u64",
- ".add", atomic_load_add_64_gen, i64imm, imm>;
+ ".add", atomic_load_add_i64_gen, i64imm, imm>;
defm INT_PTX_ATOM_ADD_G_F16 : F_ATOMIC_2<f16, Int16Regs, ".global", ".f16", ".add.noftz",
atomic_load_add_g, f16imm, fpimm, [hasSM<70>, hasPTX<63>]>;
@@ -1685,187 +1685,187 @@ defm INT_PTX_ATOM_ADD_GEN_F64 : F_ATOMIC_2<f64, Float64Regs, "", ".f64", ".add",
// atom_sub
-def atomic_load_sub_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
- (atomic_load_sub_32 node:$a, node:$b)>;
-def atomic_load_sub_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (atomic_load_sub_32 node:$a, node:$b)>;
-def atomic_load_sub_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (atomic_load_sub_32 node:$a, node:$b)>;
-def atomic_load_sub_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
- (atomic_load_sub_64 node:$a, node:$b)>;
-def atomic_load_sub_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (atomic_load_sub_64 node:$a, node:$b)>;
-def atomic_load_sub_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (atomic_load_sub_64 node:$a, node:$b)>;
+def atomic_load_sub_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_sub_i32 node:$a, node:$b)>;
+def atomic_load_sub_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_sub_i32 node:$a, node:$b)>;
+def atomic_load_sub_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_sub_i32 node:$a, node:$b)>;
+def atomic_load_sub_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_sub_i64 node:$a, node:$b)>;
+def atomic_load_sub_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_sub_i64 node:$a, node:$b)>;
+def atomic_load_sub_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_sub_i64 node:$a, node:$b)>;
defm INT_PTX_ATOM_SUB_G_32 : F_ATOMIC_2_NEG<i32, Int32Regs, ".global", "32", ".add",
- atomic_load_sub_32_g>;
+ atomic_load_sub_i32_g>;
defm INT_PTX_ATOM_SUB_G_64 : F_ATOMIC_2_NEG<i64, Int64Regs, ".global", "64", ".add",
- atomic_load_sub_64_g>;
+ atomic_load_sub_i64_g>;
defm INT_PTX_ATOM_SUB_GEN_32 : F_ATOMIC_2_NEG<i32, Int32Regs, "", "32", ".add",
- atomic_load_sub_32_gen>;
+ atomic_load_sub_i32_gen>;
defm INT_PTX_ATOM_SUB_GEN_32_USE_G : F_ATOMIC_2_NEG<i32, Int32Regs, ".global", "32",
- ".add", atomic_load_sub_32_gen>;
+ ".add", atomic_load_sub_i32_gen>;
defm INT_PTX_ATOM_SUB_S_32 : F_ATOMIC_2_NEG<i32, Int32Regs, ".shared", "32", ".add",
- atomic_load_sub_32_s>;
+ atomic_load_sub_i32_s>;
defm INT_PTX_ATOM_SUB_S_64 : F_ATOMIC_2_NEG<i64, Int64Regs, ".shared", "64", ".add",
- atomic_load_sub_64_s>;
+ atomic_load_sub_i64_s>;
defm INT_PTX_ATOM_SUB_GEN_64 : F_ATOMIC_2_NEG<i64, Int64Regs, "", "64", ".add",
- atomic_load_sub_64_gen>;
+ atomic_load_sub_i64_gen>;
defm INT_PTX_ATOM_SUB_GEN_64_USE_G : F_ATOMIC_2_NEG<i64, Int64Regs, ".global", "64",
- ".add", atomic_load_sub_64_gen>;
+ ".add", atomic_load_sub_i64_gen>;
// atom_swap
-def atomic_swap_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
- (atomic_swap_32 node:$a, node:$b)>;
-def atomic_swap_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (atomic_swap_32 node:$a, node:$b)>;
-def atomic_swap_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (atomic_swap_32 node:$a, node:$b)>;
-def atomic_swap_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
- (atomic_swap_64 node:$a, node:$b)>;
-def atomic_swap_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (atomic_swap_64 node:$a, node:$b)>;
-def atomic_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (atomic_swap_64 node:$a, node:$b)>;
+def atomic_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_swap_i32 node:$a, node:$b)>;
+def atomic_swap_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_swap_i32 node:$a, node:$b)>;
+def atomic_swap_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_swap_i32 node:$a, node:$b)>;
+def atomic_swap_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_swap_i64 node:$a, node:$b)>;
+def atomic_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_swap_i64 node:$a, node:$b)>;
+def atomic_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_swap_i64 node:$a, node:$b)>;
defm INT_PTX_ATOM_SWAP_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", ".exch",
- atomic_swap_32_g, i32imm, imm>;
+ atomic_swap_i32_g, i32imm, imm>;
defm INT_PTX_ATOM_SWAP_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".b32", ".exch",
- atomic_swap_32_s, i32imm, imm>;
+ atomic_swap_i32_s, i32imm, imm>;
defm INT_PTX_ATOM_SWAP_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".b32", ".exch",
- atomic_swap_32_gen, i32imm, imm>;
+ atomic_swap_i32_gen, i32imm, imm>;
defm INT_PTX_ATOM_SWAP_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32",
- ".exch", atomic_swap_32_gen, i32imm, imm>;
+ ".exch", atomic_swap_i32_gen, i32imm, imm>;
defm INT_PTX_ATOM_SWAP_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", ".exch",
- atomic_swap_64_g, i64imm, imm>;
+ atomic_swap_i64_g, i64imm, imm>;
defm INT_PTX_ATOM_SWAP_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".b64", ".exch",
- atomic_swap_64_s, i64imm, imm>;
+ atomic_swap_i64_s, i64imm, imm>;
defm INT_PTX_ATOM_SWAP_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".b64", ".exch",
- atomic_swap_64_gen, i64imm, imm>;
+ atomic_swap_i64_gen, i64imm, imm>;
defm INT_PTX_ATOM_SWAP_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64",
- ".exch", atomic_swap_64_gen, i64imm, imm>;
+ ".exch", atomic_swap_i64_gen, i64imm, imm>;
// atom_max
-def atomic_load_max_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b)
- , (atomic_load_max_32 node:$a, node:$b)>;
-def atomic_load_max_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (atomic_load_max_32 node:$a, node:$b)>;
-def atomic_load_max_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (atomic_load_max_32 node:$a, node:$b)>;
-def atomic_load_max_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b)
- , (atomic_load_max_64 node:$a, node:$b)>;
-def atomic_load_max_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (atomic_load_max_64 node:$a, node:$b)>;
-def atomic_load_max_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (atomic_load_max_64 node:$a, node:$b)>;
-def atomic_load_umax_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
- (atomic_load_umax_32 node:$a, node:$b)>;
-def atomic_load_umax_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (atomic_load_umax_32 node:$a, node:$b)>;
-def atomic_load_umax_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (atomic_load_umax_32 node:$a, node:$b)>;
-def atomic_load_umax_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
- (atomic_load_umax_64 node:$a, node:$b)>;
-def atomic_load_umax_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (atomic_load_umax_64 node:$a, node:$b)>;
-def atomic_load_umax_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (atomic_load_umax_64 node:$a, node:$b)>;
+def atomic_load_max_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b)
+ , (atomic_load_max_i32 node:$a, node:$b)>;
+def atomic_load_max_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_max_i32 node:$a, node:$b)>;
+def atomic_load_max_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_max_i32 node:$a, node:$b)>;
+def atomic_load_max_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b)
+ , (atomic_load_max_i64 node:$a, node:$b)>;
+def atomic_load_max_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_max_i64 node:$a, node:$b)>;
+def atomic_load_max_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_max_i64 node:$a, node:$b)>;
+def atomic_load_umax_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_umax_i32 node:$a, node:$b)>;
+def atomic_load_umax_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_umax_i32 node:$a, node:$b)>;
+def atomic_load_umax_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_umax_i32 node:$a, node:$b)>;
+def atomic_load_umax_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_umax_i64 node:$a, node:$b)>;
+def atomic_load_umax_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_umax_i64 node:$a, node:$b)>;
+def atomic_load_umax_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_umax_i64 node:$a, node:$b)>;
defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".s32",
- ".max", atomic_load_max_32_g, i32imm, imm>;
+ ".max", atomic_load_max_i32_g, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MAX_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".s32",
- ".max", atomic_load_max_32_s, i32imm, imm>;
+ ".max", atomic_load_max_i32_s, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".s32", ".max",
- atomic_load_max_32_gen, i32imm, imm>;
+ atomic_load_max_i32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global",
- ".s32", ".max", atomic_load_max_32_gen, i32imm, imm>;
+ ".s32", ".max", atomic_load_max_i32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MAX_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".s64",
- ".max", atomic_load_max_64_g, i64imm, imm, [hasSM<32>]>;
+ ".max", atomic_load_max_i64_g, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_LOAD_MAX_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".s64",
- ".max", atomic_load_max_64_s, i64imm, imm, [hasSM<32>]>;
+ ".max", atomic_load_max_i64_s, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_LOAD_MAX_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".s64", ".max",
- atomic_load_max_64_gen, i64imm, imm, [hasSM<32>]>;
+ atomic_load_max_i64_gen, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_LOAD_MAX_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global",
- ".s64", ".max", atomic_load_max_64_gen, i64imm, imm, [hasSM<32>]>;
+ ".s64", ".max", atomic_load_max_i64_gen, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32",
- ".max", atomic_load_umax_32_g, i32imm, imm>;
+ ".max", atomic_load_umax_i32_g, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".u32",
- ".max", atomic_load_umax_32_s, i32imm, imm>;
+ ".max", atomic_load_umax_i32_s, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMAX_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".u32", ".max",
- atomic_load_umax_32_gen, i32imm, imm>;
+ atomic_load_umax_i32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global",
- ".u32", ".max", atomic_load_umax_32_gen, i32imm, imm>;
+ ".u32", ".max", atomic_load_umax_i32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMAX_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".u64",
- ".max", atomic_load_umax_64_g, i64imm, imm, [hasSM<32>]>;
+ ".max", atomic_load_umax_i64_g, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_LOAD_UMAX_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".u64",
- ".max", atomic_load_umax_64_s, i64imm, imm, [hasSM<32>]>;
+ ".max", atomic_load_umax_i64_s, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_LOAD_UMAX_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".u64", ".max",
- atomic_load_umax_64_gen, i64imm, imm, [hasSM<32>]>;
+ atomic_load_umax_i64_gen, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_LOAD_UMAX_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global",
- ".u64", ".max", atomic_load_umax_64_gen, i64imm, imm, [hasSM<32>]>;
+ ".u64", ".max", atomic_load_umax_i64_gen, i64imm, imm, [hasSM<32>]>;
// atom_min
-def atomic_load_min_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
- (atomic_load_min_32 node:$a, node:$b)>;
-def atomic_load_min_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (atomic_load_min_32 node:$a, node:$b)>;
-def atomic_load_min_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (atomic_load_min_32 node:$a, node:$b)>;
-def atomic_load_min_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
- (atomic_load_min_64 node:$a, node:$b)>;
-def atomic_load_min_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (atomic_load_min_64 node:$a, node:$b)>;
-def atomic_load_min_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (atomic_load_min_64 node:$a, node:$b)>;
-def atomic_load_umin_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
- (atomic_load_umin_32 node:$a, node:$b)>;
-def atomic_load_umin_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (atomic_load_umin_32 node:$a, node:$b)>;
-def atomic_load_umin_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (atomic_load_umin_32 node:$a, node:$b)>;
-def atomic_load_umin_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
- (atomic_load_umin_64 node:$a, node:$b)>;
-def atomic_load_umin_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (atomic_load_umin_64 node:$a, node:$b)>;
-def atomic_load_umin_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (atomic_load_umin_64 node:$a, node:$b)>;
+def atomic_load_min_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_min_i32 node:$a, node:$b)>;
+def atomic_load_min_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_min_i32 node:$a, node:$b)>;
+def atomic_load_min_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_min_i32 node:$a, node:$b)>;
+def atomic_load_min_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_min_i64 node:$a, node:$b)>;
+def atomic_load_min_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_min_i64 node:$a, node:$b)>;
+def atomic_load_min_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_min_i64 node:$a, node:$b)>;
+def atomic_load_umin_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_umin_i32 node:$a, node:$b)>;
+def atomic_load_umin_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_umin_i32 node:$a, node:$b)>;
+def atomic_load_umin_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_umin_i32 node:$a, node:$b)>;
+def atomic_load_umin_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_umin_i64 node:$a, node:$b)>;
+def atomic_load_umin_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_umin_i64 node:$a, node:$b)>;
+def atomic_load_umin_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_umin_i64 node:$a, node:$b)>;
defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".s32",
- ".min", atomic_load_min_32_g, i32imm, imm>;
+ ".min", atomic_load_min_i32_g, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MIN_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".s32",
- ".min", atomic_load_min_32_s, i32imm, imm>;
+ ".min", atomic_load_min_i32_s, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".s32", ".min",
- atomic_load_min_32_gen, i32imm, imm>;
+ atomic_load_min_i32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global",
- ".s32", ".min", atomic_load_min_32_gen, i32imm, imm>;
+ ".s32", ".min", atomic_load_min_i32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_MIN_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".s64",
- ".min", atomic_load_min_64_g, i64imm, imm, [hasSM<32>]>;
+ ".min", atomic_load_min_i64_g, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_LOAD_MIN_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".s64",
- ".min", atomic_load_min_64_s, i64imm, imm, [hasSM<32>]>;
+ ".min", atomic_load_min_i64_s, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_LOAD_MIN_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".s64", ".min",
- atomic_load_min_64_gen, i64imm, imm, [hasSM<32>]>;
+ atomic_load_min_i64_gen, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_LOAD_MIN_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global",
- ".s64", ".min", atomic_load_min_64_gen, i64imm, imm, [hasSM<32>]>;
+ ".s64", ".min", atomic_load_min_i64_gen, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32",
- ".min", atomic_load_umin_32_g, i32imm, imm>;
+ ".min", atomic_load_umin_i32_g, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".u32",
- ".min", atomic_load_umin_32_s, i32imm, imm>;
+ ".min", atomic_load_umin_i32_s, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMIN_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".u32", ".min",
- atomic_load_umin_32_gen, i32imm, imm>;
+ atomic_load_umin_i32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global",
- ".u32", ".min", atomic_load_umin_32_gen, i32imm, imm>;
+ ".u32", ".min", atomic_load_umin_i32_gen, i32imm, imm>;
defm INT_PTX_ATOM_LOAD_UMIN_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".u64",
- ".min", atomic_load_umin_64_g, i64imm, imm, [hasSM<32>]>;
+ ".min", atomic_load_umin_i64_g, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_LOAD_UMIN_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".u64",
- ".min", atomic_load_umin_64_s, i64imm, imm, [hasSM<32>]>;
+ ".min", atomic_load_umin_i64_s, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_LOAD_UMIN_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".u64", ".min",
- atomic_load_umin_64_gen, i64imm, imm, [hasSM<32>]>;
+ atomic_load_umin_i64_gen, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_LOAD_UMIN_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global",
- ".u64", ".min", atomic_load_umin_64_gen, i64imm, imm, [hasSM<32>]>;
+ ".u64", ".min", atomic_load_umin_i64_gen, i64imm, imm, [hasSM<32>]>;
// atom_inc atom_dec
@@ -1901,131 +1901,131 @@ defm INT_PTX_ATOM_DEC_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32
// atom_and
-def atomic_load_and_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
- (atomic_load_and_32 node:$a, node:$b)>;
-def atomic_load_and_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (atomic_load_and_32 node:$a, node:$b)>;
-def atomic_load_and_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (atomic_load_and_32 node:$a, node:$b)>;
-def atomic_load_and_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
- (atomic_load_and_64 node:$a, node:$b)>;
-def atomic_load_and_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (atomic_load_and_64 node:$a, node:$b)>;
-def atomic_load_and_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (atomic_load_and_64 node:$a, node:$b)>;
+def atomic_load_and_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_and_i32 node:$a, node:$b)>;
+def atomic_load_and_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_and_i32 node:$a, node:$b)>;
+def atomic_load_and_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_and_i32 node:$a, node:$b)>;
+def atomic_load_and_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_and_i64 node:$a, node:$b)>;
+def atomic_load_and_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_and_i64 node:$a, node:$b)>;
+def atomic_load_and_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_and_i64 node:$a, node:$b)>;
defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", ".and",
- atomic_load_and_32_g, i32imm, imm>;
+ atomic_load_and_i32_g, i32imm, imm>;
defm INT_PTX_ATOM_AND_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".b32", ".and",
- atomic_load_and_32_s, i32imm, imm>;
+ atomic_load_and_i32_s, i32imm, imm>;
defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".b32", ".and",
- atomic_load_and_32_gen, i32imm, imm>;
+ atomic_load_and_i32_gen, i32imm, imm>;
defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32",
- ".and", atomic_load_and_32_gen, i32imm, imm>;
+ ".and", atomic_load_and_i32_gen, i32imm, imm>;
defm INT_PTX_ATOM_AND_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", ".and",
- atomic_load_and_64_g, i64imm, imm, [hasSM<32>]>;
+ atomic_load_and_i64_g, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_AND_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".b64", ".and",
- atomic_load_and_64_s, i64imm, imm, [hasSM<32>]>;
+ atomic_load_and_i64_s, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_AND_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".b64", ".and",
- atomic_load_and_64_gen, i64imm, imm, [hasSM<32>]>;
+ atomic_load_and_i64_gen, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_AND_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64",
- ".and", atomic_load_and_64_gen, i64imm, imm, [hasSM<32>]>;
+ ".and", atomic_load_and_i64_gen, i64imm, imm, [hasSM<32>]>;
// atom_or
-def atomic_load_or_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
- (atomic_load_or_32 node:$a, node:$b)>;
-def atomic_load_or_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (atomic_load_or_32 node:$a, node:$b)>;
-def atomic_load_or_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (atomic_load_or_32 node:$a, node:$b)>;
-def atomic_load_or_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
- (atomic_load_or_64 node:$a, node:$b)>;
-def atomic_load_or_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (atomic_load_or_64 node:$a, node:$b)>;
-def atomic_load_or_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (atomic_load_or_64 node:$a, node:$b)>;
+def atomic_load_or_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_or_i32 node:$a, node:$b)>;
+def atomic_load_or_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_or_i32 node:$a, node:$b)>;
+def atomic_load_or_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_or_i32 node:$a, node:$b)>;
+def atomic_load_or_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_or_i64 node:$a, node:$b)>;
+def atomic_load_or_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_or_i64 node:$a, node:$b)>;
+def atomic_load_or_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_or_i64 node:$a, node:$b)>;
defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", ".or",
- atomic_load_or_32_g, i32imm, imm>;
+ atomic_load_or_i32_g, i32imm, imm>;
defm INT_PTX_ATOM_OR_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".b32", ".or",
- atomic_load_or_32_gen, i32imm, imm>;
+ atomic_load_or_i32_gen, i32imm, imm>;
defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32",
- ".or", atomic_load_or_32_gen, i32imm, imm>;
+ ".or", atomic_load_or_i32_gen, i32imm, imm>;
defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".b32", ".or",
- atomic_load_or_32_s, i32imm, imm>;
+ atomic_load_or_i32_s, i32imm, imm>;
defm INT_PTX_ATOM_OR_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", ".or",
- atomic_load_or_64_g, i64imm, imm, [hasSM<32>]>;
+ atomic_load_or_i64_g, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_OR_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".b64", ".or",
- atomic_load_or_64_gen, i64imm, imm, [hasSM<32>]>;
+ atomic_load_or_i64_gen, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_OR_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64",
- ".or", atomic_load_or_64_gen, i64imm, imm, [hasSM<32>]>;
+ ".or", atomic_load_or_i64_gen, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_OR_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".b64", ".or",
- atomic_load_or_64_s, i64imm, imm, [hasSM<32>]>;
+ atomic_load_or_i64_s, i64imm, imm, [hasSM<32>]>;
// atom_xor
-def atomic_load_xor_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
- (atomic_load_xor_32 node:$a, node:$b)>;
-def atomic_load_xor_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (atomic_load_xor_32 node:$a, node:$b)>;
-def atomic_load_xor_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (atomic_load_xor_32 node:$a, node:$b)>;
-def atomic_load_xor_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
- (atomic_load_xor_64 node:$a, node:$b)>;
-def atomic_load_xor_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
- (atomic_load_xor_64 node:$a, node:$b)>;
-def atomic_load_xor_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
- (atomic_load_xor_64 node:$a, node:$b)>;
+def atomic_load_xor_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_xor_i32 node:$a, node:$b)>;
+def atomic_load_xor_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_xor_i32 node:$a, node:$b)>;
+def atomic_load_xor_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_xor_i32 node:$a, node:$b)>;
+def atomic_load_xor_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_xor_i64 node:$a, node:$b)>;
+def atomic_load_xor_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_xor_i64 node:$a, node:$b)>;
+def atomic_load_xor_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_xor_i64 node:$a, node:$b)>;
defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", ".xor",
- atomic_load_xor_32_g, i32imm, imm>;
+ atomic_load_xor_i32_g, i32imm, imm>;
defm INT_PTX_ATOM_XOR_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".b32", ".xor",
- atomic_load_xor_32_s, i32imm, imm>;
+ atomic_load_xor_i32_s, i32imm, imm>;
defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".b32", ".xor",
- atomic_load_xor_32_gen, i32imm, imm>;
+ atomic_load_xor_i32_gen, i32imm, imm>;
defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32",
- ".xor", atomic_load_xor_32_gen, i32imm, imm>;
+ ".xor", atomic_load_xor_i32_gen, i32imm, imm>;
defm INT_PTX_ATOM_XOR_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", ".xor",
- atomic_load_xor_64_g, i64imm, imm, [hasSM<32>]>;
+ atomic_load_xor_i64_g, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_XOR_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".b64", ".xor",
- atomic_load_xor_64_s, i64imm, imm, [hasSM<32>]>;
+ atomic_load_xor_i64_s, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".b64", ".xor",
- atomic_load_xor_64_gen, i64imm, imm, [hasSM<32>]>;
+ atomic_load_xor_i64_gen, i64imm, imm, [hasSM<32>]>;
defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64",
- ".xor", atomic_load_xor_64_gen, i64imm, imm, [hasSM<32>]>;
+ ".xor", atomic_load_xor_i64_gen, i64imm, imm, [hasSM<32>]>;
// atom_cas
-def atomic_cmp_swap_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
+ (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
+ (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
+ (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
+ (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
+ (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
+ (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32", ".cas",
- atomic_cmp_swap_32_g, i32imm>;
+ atomic_cmp_swap_i32_g, i32imm>;
defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<i32, Int32Regs, ".shared", ".b32", ".cas",
- atomic_cmp_swap_32_s, i32imm>;
+ atomic_cmp_swap_i32_s, i32imm>;
defm INT_PTX_ATOM_CAS_GEN_32 : F_ATOMIC_3<i32, Int32Regs, "", ".b32", ".cas",
- atomic_cmp_swap_32_gen, i32imm>;
+ atomic_cmp_swap_i32_gen, i32imm>;
defm INT_PTX_ATOM_CAS_GEN_32_USE_G : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32",
- ".cas", atomic_cmp_swap_32_gen, i32imm>;
+ ".cas", atomic_cmp_swap_i32_gen, i32imm>;
defm INT_PTX_ATOM_CAS_G_64 : F_ATOMIC_3<i64, Int64Regs, ".global", ".b64", ".cas",
- atomic_cmp_swap_64_g, i64imm>;
+ atomic_cmp_swap_i64_g, i64imm>;
defm INT_PTX_ATOM_CAS_S_64 : F_ATOMIC_3<i64, Int64Regs, ".shared", ".b64", ".cas",
- atomic_cmp_swap_64_s, i64imm>;
+ atomic_cmp_swap_i64_s, i64imm>;
defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3<i64, Int64Regs, "", ".b64", ".cas",
- atomic_cmp_swap_64_gen, i64imm>;
+ atomic_cmp_swap_i64_gen, i64imm>;
defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<i64, Int64Regs, ".global", ".b64",
- ".cas", atomic_cmp_swap_64_gen, i64imm>;
+ ".cas", atomic_cmp_swap_i64_gen, i64imm>;
// Support for scoped atomic operations. Matches
// int_nvvm_atomic_{op}_{space}_{type}_{scope}
@@ -2475,6 +2475,7 @@ defm cvta_local : NG_TO_G<"local", int_nvvm_ptr_local_to_gen, useShortPtrLocal>
defm cvta_shared : NG_TO_G<"shared", int_nvvm_ptr_shared_to_gen, useShortPtrShared>;
defm cvta_global : NG_TO_G<"global", int_nvvm_ptr_global_to_gen, False>;
defm cvta_const : NG_TO_G<"const", int_nvvm_ptr_constant_to_gen, useShortPtrConst>;
+defm cvta_param : NG_TO_G<"param", int_nvvm_ptr_param_to_gen, False>;
defm cvta_to_local : G_TO_NG<"local", int_nvvm_ptr_gen_to_local, useShortPtrLocal>;
defm cvta_to_shared : G_TO_NG<"shared", int_nvvm_ptr_gen_to_shared, useShortPtrShared>;
@@ -6724,6 +6725,7 @@ class WMMA_MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB,
# FragC.regstring # ";";
}
+let isConvergent = true in {
defset list<WMMA_INSTR> WMMAs = {
foreach layout_a = ["row", "col"] in {
foreach layout_b = ["row", "col"] in {
@@ -6745,6 +6747,7 @@ defset list<WMMA_INSTR> WMMAs = {
} // layout_b
} // layout_a
} // defset
+}
// MMA
class MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB,
@@ -6774,6 +6777,7 @@ class MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB,
# FragC.regstring # ";";
}
+let isConvergent = true in {
defset list<WMMA_INSTR> MMAs = {
foreach layout_a = ["row", "col"] in {
foreach layout_b = ["row", "col"] in {
@@ -6793,6 +6797,7 @@ defset list<WMMA_INSTR> MMAs = {
} // layout_b
} // layout_a
} // defset
+}
//
// ldmatrix.sync.aligned.m8n8[|.trans][|.shared].b16
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index cde02c25c483..e63c7a61c6f2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -95,7 +95,9 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
@@ -336,8 +338,9 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
while (!ValuesToCheck.empty()) {
Value *V = ValuesToCheck.pop_back_val();
if (!IsALoadChainInstr(V)) {
- LLVM_DEBUG(dbgs() << "Need a copy of " << *Arg << " because of " << *V
- << "\n");
+ LLVM_DEBUG(dbgs() << "Need a "
+ << (isParamGridConstant(*Arg) ? "cast " : "copy ")
+ << "of " << *Arg << " because of " << *V << "\n");
(void)Arg;
return false;
}
@@ -366,27 +369,59 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
return;
}
- // Otherwise we have to create a temporary copy.
const DataLayout &DL = Func->getParent()->getDataLayout();
unsigned AS = DL.getAllocaAddrSpace();
- AllocaInst *AllocA = new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
- // Set the alignment to alignment of the byval parameter. This is because,
- // later load/stores assume that alignment, and we are going to replace
- // the use of the byval parameter with this alloca instruction.
- AllocA->setAlignment(Func->getParamAlign(Arg->getArgNo())
- .value_or(DL.getPrefTypeAlign(StructType)));
- Arg->replaceAllUsesWith(AllocA);
-
- Value *ArgInParam = new AddrSpaceCastInst(
- Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
- FirstInst);
- // Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX
- // addrspacecast preserves alignment. Since params are constant, this load is
- // definitely not volatile.
- LoadInst *LI =
- new LoadInst(StructType, ArgInParam, Arg->getName(),
- /*isVolatile=*/false, AllocA->getAlign(), FirstInst);
- new StoreInst(LI, AllocA, FirstInst);
+ if (isParamGridConstant(*Arg)) {
+ // Writes to a grid constant are undefined behaviour. We do not need a
+ // temporary copy. When a pointer might have escaped, conservatively replace
+ // all of its uses (which might include a device function call) with a cast
+ // to the generic address space.
+ // TODO: only cast byval grid constant parameters at use points that need
+ // generic address (e.g., merging parameter pointers with other address
+ // space, or escaping to call-sites, inline-asm, memory), and use the
+ // parameter address space for normal loads.
+ IRBuilder<> IRB(&Func->getEntryBlock().front());
+
+ // Cast argument to param address space
+ auto *CastToParam =
+ cast<AddrSpaceCastInst>(IRB.CreateAddrSpaceCast(
+ Arg, IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getName() + ".param"));
+
+ // Cast param address to generic address space. We do not use an
+ // addrspacecast to generic here, because, LLVM considers `Arg` to be in the
+ // generic address space, and a `generic -> param` cast followed by a `param
+ // -> generic` cast will be folded away. The `param -> generic` intrinsic
+ // will be correctly lowered to `cvta.param`.
+ Value *CvtToGenCall = IRB.CreateIntrinsic(
+ IRB.getPtrTy(ADDRESS_SPACE_GENERIC), Intrinsic::nvvm_ptr_param_to_gen,
+ CastToParam, nullptr, CastToParam->getName() + ".gen");
+
+ Arg->replaceAllUsesWith(CvtToGenCall);
+
+ // Do not replace Arg in the cast to param space
+ CastToParam->setOperand(0, Arg);
+ } else {
+ // Otherwise we have to create a temporary copy.
+ AllocaInst *AllocA =
+ new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
+ // Set the alignment to alignment of the byval parameter. This is because,
+ // later load/stores assume that alignment, and we are going to replace
+ // the use of the byval parameter with this alloca instruction.
+ AllocA->setAlignment(Func->getParamAlign(Arg->getArgNo())
+ .value_or(DL.getPrefTypeAlign(StructType)));
+ Arg->replaceAllUsesWith(AllocA);
+
+ Value *ArgInParam = new AddrSpaceCastInst(
+ Arg, PointerType::get(Arg->getContext(), ADDRESS_SPACE_PARAM),
+ Arg->getName(), FirstInst);
+ // Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX
+ // addrspacecast preserves alignment. Since params are constant, this load
+ // is definitely not volatile.
+ LoadInst *LI =
+ new LoadInst(StructType, ArgInParam, Arg->getName(),
+ /*isVolatile=*/false, AllocA->getAlign(), FirstInst);
+ new StoreInst(LI, AllocA, FirstInst);
+ }
}
void NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index b60a1d747af7..152f200b9d0f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -224,8 +224,7 @@ void NVPTXTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
AAM.registerFunctionAnalysis<NVPTXAA>();
}
-void NVPTXTargetMachine::registerPassBuilderCallbacks(
- PassBuilder &PB, bool PopulateClassToPassNames) {
+void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
#define GET_PASS_REGISTRY "NVPTXPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
index 870ea20c26f3..2b88da67a50f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -66,8 +66,7 @@ public:
void registerDefaultAliasAnalyses(AAManager &AAM) override;
- void registerPassBuilderCallbacks(PassBuilder &PB,
- bool PopulateClassToPassNames) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB) override;
TargetTransformInfo getTargetTransformInfo(const Function &F) const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 3a536db1c972..e4b2ec868519 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -52,29 +52,46 @@ void clearAnnotationCache(const Module *Mod) {
AC.Cache.erase(Mod);
}
-static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
+static void readIntVecFromMDNode(const MDNode *MetadataNode,
+ std::vector<unsigned> &Vec) {
+ for (unsigned i = 0, e = MetadataNode->getNumOperands(); i != e; ++i) {
+ ConstantInt *Val =
+ mdconst::extract<ConstantInt>(MetadataNode->getOperand(i));
+ Vec.push_back(Val->getZExtValue());
+ }
+}
+
+static void cacheAnnotationFromMD(const MDNode *MetadataNode,
+ key_val_pair_t &retval) {
auto &AC = getAnnotationCache();
std::lock_guard<sys::Mutex> Guard(AC.Lock);
- assert(md && "Invalid mdnode for annotation");
- assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands");
+ assert(MetadataNode && "Invalid mdnode for annotation");
+ assert((MetadataNode->getNumOperands() % 2) == 1 &&
+ "Invalid number of operands");
// start index = 1, to skip the global variable key
// increment = 2, to skip the value for each property-value pairs
- for (unsigned i = 1, e = md->getNumOperands(); i != e; i += 2) {
+ for (unsigned i = 1, e = MetadataNode->getNumOperands(); i != e; i += 2) {
// property
- const MDString *prop = dyn_cast<MDString>(md->getOperand(i));
+ const MDString *prop = dyn_cast<MDString>(MetadataNode->getOperand(i));
assert(prop && "Annotation property not a string");
+ std::string Key = prop->getString().str();
// value
- ConstantInt *Val = mdconst::dyn_extract<ConstantInt>(md->getOperand(i + 1));
- assert(Val && "Value operand not a constant int");
-
- std::string keyname = prop->getString().str();
- if (retval.find(keyname) != retval.end())
- retval[keyname].push_back(Val->getZExtValue());
- else {
- std::vector<unsigned> tmp;
- tmp.push_back(Val->getZExtValue());
- retval[keyname] = tmp;
+ if (ConstantInt *Val = mdconst::dyn_extract<ConstantInt>(
+ MetadataNode->getOperand(i + 1))) {
+ retval[Key].push_back(Val->getZExtValue());
+ } else if (MDNode *VecMd =
+ dyn_cast<MDNode>(MetadataNode->getOperand(i + 1))) {
+ // note: only "grid_constant" annotations support vector MDNodes.
+ // assert: there can only exist one unique key value pair of
+ // the form (string key, MDNode node). Operands of such a node
+ // shall always be unsigned ints.
+ if (retval.find(Key) == retval.end()) {
+ readIntVecFromMDNode(VecMd, retval[Key]);
+ continue;
+ }
+ } else {
+ llvm_unreachable("Value operand not a constant int or an mdnode");
}
}
}
@@ -153,9 +170,9 @@ bool findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
bool isTexture(const Value &val) {
if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
- unsigned annot;
- if (findOneNVVMAnnotation(gv, "texture", annot)) {
- assert((annot == 1) && "Unexpected annotation on a texture symbol");
+ unsigned Annot;
+ if (findOneNVVMAnnotation(gv, "texture", Annot)) {
+ assert((Annot == 1) && "Unexpected annotation on a texture symbol");
return true;
}
}
@@ -164,70 +181,67 @@ bool isTexture(const Value &val) {
bool isSurface(const Value &val) {
if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
- unsigned annot;
- if (findOneNVVMAnnotation(gv, "surface", annot)) {
- assert((annot == 1) && "Unexpected annotation on a surface symbol");
+ unsigned Annot;
+ if (findOneNVVMAnnotation(gv, "surface", Annot)) {
+ assert((Annot == 1) && "Unexpected annotation on a surface symbol");
return true;
}
}
return false;
}
-bool isSampler(const Value &val) {
- const char *AnnotationName = "sampler";
-
- if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
- unsigned annot;
- if (findOneNVVMAnnotation(gv, AnnotationName, annot)) {
- assert((annot == 1) && "Unexpected annotation on a sampler symbol");
- return true;
- }
- }
- if (const Argument *arg = dyn_cast<Argument>(&val)) {
- const Function *func = arg->getParent();
- std::vector<unsigned> annot;
- if (findAllNVVMAnnotation(func, AnnotationName, annot)) {
- if (is_contained(annot, arg->getArgNo()))
+static bool argHasNVVMAnnotation(const Value &Val,
+ const std::string &Annotation,
+ const bool StartArgIndexAtOne = false) {
+ if (const Argument *Arg = dyn_cast<Argument>(&Val)) {
+ const Function *Func = Arg->getParent();
+ std::vector<unsigned> Annot;
+ if (findAllNVVMAnnotation(Func, Annotation, Annot)) {
+ const unsigned BaseOffset = StartArgIndexAtOne ? 1 : 0;
+ if (is_contained(Annot, BaseOffset + Arg->getArgNo())) {
return true;
+ }
}
}
return false;
}
-bool isImageReadOnly(const Value &val) {
- if (const Argument *arg = dyn_cast<Argument>(&val)) {
- const Function *func = arg->getParent();
- std::vector<unsigned> annot;
- if (findAllNVVMAnnotation(func, "rdoimage", annot)) {
- if (is_contained(annot, arg->getArgNo()))
- return true;
+bool isParamGridConstant(const Value &V) {
+ if (const Argument *Arg = dyn_cast<Argument>(&V)) {
+ // "grid_constant" counts argument indices starting from 1
+ if (Arg->hasByValAttr() &&
+ argHasNVVMAnnotation(*Arg, "grid_constant", /*StartArgIndexAtOne*/true)) {
+ assert(isKernelFunction(*Arg->getParent()) &&
+ "only kernel arguments can be grid_constant");
+ return true;
}
}
return false;
}
-bool isImageWriteOnly(const Value &val) {
- if (const Argument *arg = dyn_cast<Argument>(&val)) {
- const Function *func = arg->getParent();
- std::vector<unsigned> annot;
- if (findAllNVVMAnnotation(func, "wroimage", annot)) {
- if (is_contained(annot, arg->getArgNo()))
- return true;
+bool isSampler(const Value &val) {
+ const char *AnnotationName = "sampler";
+
+ if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+ unsigned Annot;
+ if (findOneNVVMAnnotation(gv, AnnotationName, Annot)) {
+ assert((Annot == 1) && "Unexpected annotation on a sampler symbol");
+ return true;
}
}
- return false;
+ return argHasNVVMAnnotation(val, AnnotationName);
+}
+
+bool isImageReadOnly(const Value &val) {
+ return argHasNVVMAnnotation(val, "rdoimage");
+}
+
+bool isImageWriteOnly(const Value &val) {
+ return argHasNVVMAnnotation(val, "wroimage");
}
bool isImageReadWrite(const Value &val) {
- if (const Argument *arg = dyn_cast<Argument>(&val)) {
- const Function *func = arg->getParent();
- std::vector<unsigned> annot;
- if (findAllNVVMAnnotation(func, "rdwrimage", annot)) {
- if (is_contained(annot, arg->getArgNo()))
- return true;
- }
- }
- return false;
+ return argHasNVVMAnnotation(val, "rdwrimage");
}
bool isImage(const Value &val) {
@@ -236,9 +250,9 @@ bool isImage(const Value &val) {
bool isManaged(const Value &val) {
if(const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
- unsigned annot;
- if (findOneNVVMAnnotation(gv, "managed", annot)) {
- assert((annot == 1) && "Unexpected annotation on a managed symbol");
+ unsigned Annot;
+ if (findOneNVVMAnnotation(gv, "managed", Annot)) {
+ assert((Annot == 1) && "Unexpected annotation on a managed symbol");
return true;
}
}
@@ -323,8 +337,7 @@ bool getMaxNReg(const Function &F, unsigned &x) {
bool isKernelFunction(const Function &F) {
unsigned x = 0;
- bool retval = findOneNVVMAnnotation(&F, "kernel", x);
- if (!retval) {
+ if (!findOneNVVMAnnotation(&F, "kernel", x)) {
// There is no NVVM metadata, check the calling convention
return F.getCallingConv() == CallingConv::PTX_Kernel;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index e020bc0f02e9..c15ff6cae1f2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -62,6 +62,7 @@ bool getMaxClusterRank(const Function &, unsigned &);
bool getMinCTASm(const Function &, unsigned &);
bool getMaxNReg(const Function &, unsigned &);
bool isKernelFunction(const Function &);
+bool isParamGridConstant(const Value &);
MaybeAlign getAlign(const Function &, unsigned);
MaybeAlign getAlign(const CallInst &, unsigned);
diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 29c95e4226bf..4024953bb51d 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -209,7 +209,7 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) {
// Removing via isInstructionTriviallyDead may add duplicates to the ToRemove
// array. Filter out the duplicates before starting to erase from parent.
std::sort(ToRemove.begin(), ToRemove.end());
- auto NewLastIter = std::unique(ToRemove.begin(), ToRemove.end());
+ auto NewLastIter = llvm::unique(ToRemove);
ToRemove.erase(NewLastIter, ToRemove.end());
for (Instruction *I : ToRemove)
diff --git a/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp b/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp
index 799890928577..53b2d2aa8624 100644
--- a/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp
+++ b/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp
@@ -165,8 +165,8 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
- AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addRequired<MachinePostDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -195,8 +195,8 @@ FunctionPass *llvm::createPPCBranchCoalescingPass() {
INITIALIZE_PASS_BEGIN(PPCBranchCoalescing, DEBUG_TYPE,
"Branch Coalescing", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(PPCBranchCoalescing, DEBUG_TYPE, "Branch Coalescing",
false, false)
@@ -214,8 +214,8 @@ void PPCBranchCoalescing::CoalescingCandidateInfo::clear() {
}
void PPCBranchCoalescing::initialize(MachineFunction &MF) {
- MDT = &getAnalysis<MachineDominatorTree>();
- MPDT = &getAnalysis<MachinePostDominatorTree>();
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+ MPDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
TII = MF.getSubtarget().getInstrInfo();
MRI = &MF.getRegInfo();
}
diff --git a/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp b/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp
index 1f9947f6f327..c4190bb9a1c4 100644
--- a/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp
+++ b/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp
@@ -55,7 +55,7 @@ namespace {
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -70,7 +70,7 @@ namespace {
INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
"PowerPC CTR Loops Verify", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
"PowerPC CTR Loops Verify", false, false)
@@ -160,7 +160,7 @@ queue_preds:
}
bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) {
- MDT = &getAnalysis<MachineDominatorTree>();
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
// Verify that all bdnz/bdz instructions are dominated by a loop mtctr before
// any other instructions that might clobber the ctr register.
diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index 201b2d162372..277d708013c7 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -384,7 +384,10 @@ bool PPCFrameLowering::needsFP(const MachineFunction &MF) const {
}
void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const {
- bool is31 = needsFP(MF);
+ // When there is dynamic alloca in this function, we can not use the frame
+ // pointer X31/R31 for the frameaddress lowering. In this case, only X1/R1
+ // always points to the backchain.
+ bool is31 = needsFP(MF) && !MF.getFrameInfo().hasVarSizedObjects();
unsigned FPReg = is31 ? PPC::R31 : PPC::R1;
unsigned FP8Reg = is31 ? PPC::X31 : PPC::X1;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 12e33ddb8eb5..9e56b8522fa6 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -5608,7 +5608,7 @@ static void prepareIndirectCall(SelectionDAG &DAG, SDValue &Callee,
const SDLoc &dl) {
SDValue MTCTROps[] = {Chain, Callee, Glue};
EVT ReturnTypes[] = {MVT::Other, MVT::Glue};
- Chain = DAG.getNode(PPCISD::MTCTR, dl, ArrayRef(ReturnTypes, 2),
+ Chain = DAG.getNode(PPCISD::MTCTR, dl, ReturnTypes,
ArrayRef(MTCTROps, Glue.getNode() ? 3 : 2));
// The glue is the second value produced.
Glue = Chain.getValue(1);
@@ -10937,10 +10937,10 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::ppc_mma_disassemble_acc: {
if (Subtarget.isISAFuture()) {
EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
- SDValue WideVec = SDValue(DAG.getMachineNode(PPC::DMXXEXTFDMR512, dl,
- ArrayRef(ReturnTypes, 2),
- Op.getOperand(1)),
- 0);
+ SDValue WideVec =
+ SDValue(DAG.getMachineNode(PPC::DMXXEXTFDMR512, dl, ReturnTypes,
+ Op.getOperand(1)),
+ 0);
SmallVector<SDValue, 4> RetOps;
SDValue Value = SDValue(WideVec.getNode(), 0);
SDValue Value2 = SDValue(WideVec.getNode(), 1);
@@ -11609,7 +11609,7 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
if (Subtarget.isISAFuture()) {
EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
MachineSDNode *ExtNode = DAG.getMachineNode(
- PPC::DMXXEXTFDMR512, dl, ArrayRef(ReturnTypes, 2), Op.getOperand(1));
+ PPC::DMXXEXTFDMR512, dl, ReturnTypes, Op.getOperand(1));
Value = SDValue(ExtNode, 0);
Value2 = SDValue(ExtNode, 1);
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index eda5eb975e70..8f5afbae01de 100644
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -292,42 +292,42 @@ def : Pat<(PPCcall_nop_rm (i64 mcsym:$dst)),
let Defs = [CR0] in {
def ATOMIC_LOAD_ADD_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_ADD_I64",
- [(set i64:$dst, (atomic_load_add_64 ForceXForm:$ptr, i64:$incr))]>;
+ [(set i64:$dst, (atomic_load_add_i64 ForceXForm:$ptr, i64:$incr))]>;
def ATOMIC_LOAD_SUB_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_SUB_I64",
- [(set i64:$dst, (atomic_load_sub_64 ForceXForm:$ptr, i64:$incr))]>;
+ [(set i64:$dst, (atomic_load_sub_i64 ForceXForm:$ptr, i64:$incr))]>;
def ATOMIC_LOAD_OR_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_OR_I64",
- [(set i64:$dst, (atomic_load_or_64 ForceXForm:$ptr, i64:$incr))]>;
+ [(set i64:$dst, (atomic_load_or_i64 ForceXForm:$ptr, i64:$incr))]>;
def ATOMIC_LOAD_XOR_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_XOR_I64",
- [(set i64:$dst, (atomic_load_xor_64 ForceXForm:$ptr, i64:$incr))]>;
+ [(set i64:$dst, (atomic_load_xor_i64 ForceXForm:$ptr, i64:$incr))]>;
def ATOMIC_LOAD_AND_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_AND_i64",
- [(set i64:$dst, (atomic_load_and_64 ForceXForm:$ptr, i64:$incr))]>;
+ [(set i64:$dst, (atomic_load_and_i64 ForceXForm:$ptr, i64:$incr))]>;
def ATOMIC_LOAD_NAND_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_NAND_I64",
- [(set i64:$dst, (atomic_load_nand_64 ForceXForm:$ptr, i64:$incr))]>;
+ [(set i64:$dst, (atomic_load_nand_i64 ForceXForm:$ptr, i64:$incr))]>;
def ATOMIC_LOAD_MIN_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_MIN_I64",
- [(set i64:$dst, (atomic_load_min_64 ForceXForm:$ptr, i64:$incr))]>;
+ [(set i64:$dst, (atomic_load_min_i64 ForceXForm:$ptr, i64:$incr))]>;
def ATOMIC_LOAD_MAX_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_MAX_I64",
- [(set i64:$dst, (atomic_load_max_64 ForceXForm:$ptr, i64:$incr))]>;
+ [(set i64:$dst, (atomic_load_max_i64 ForceXForm:$ptr, i64:$incr))]>;
def ATOMIC_LOAD_UMIN_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_UMIN_I64",
- [(set i64:$dst, (atomic_load_umin_64 ForceXForm:$ptr, i64:$incr))]>;
+ [(set i64:$dst, (atomic_load_umin_i64 ForceXForm:$ptr, i64:$incr))]>;
def ATOMIC_LOAD_UMAX_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_UMAX_I64",
- [(set i64:$dst, (atomic_load_umax_64 ForceXForm:$ptr, i64:$incr))]>;
+ [(set i64:$dst, (atomic_load_umax_i64 ForceXForm:$ptr, i64:$incr))]>;
def ATOMIC_CMP_SWAP_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$old, g8rc:$new), "#ATOMIC_CMP_SWAP_I64",
- [(set i64:$dst, (atomic_cmp_swap_64 ForceXForm:$ptr, i64:$old, i64:$new))]>;
+ [(set i64:$dst, (atomic_cmp_swap_i64 ForceXForm:$ptr, i64:$old, i64:$new))]>;
def ATOMIC_SWAP_I64 : PPCCustomInserterPseudo<
(outs g8rc:$dst), (ins memrr:$ptr, g8rc:$new), "#ATOMIC_SWAP_I64",
- [(set i64:$dst, (atomic_swap_64 ForceXForm:$ptr, i64:$new))]>;
+ [(set i64:$dst, (atomic_swap_i64 ForceXForm:$ptr, i64:$new))]>;
}
// Instructions to support atomic operations
@@ -1036,7 +1036,7 @@ defm DIVDE : XOForm_1rcr<31, 425, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB),
let Predicates = [IsISA3_0] in {
def MADDHD : VAForm_1a<48, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC),
"maddhd $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64;
-def MADDHDU : VAForm_1a<49,
+def MADDHDU : VAForm_1a<49,
(outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC),
"maddhdu $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64;
def MADDLD : VAForm_1a<51, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB, gprc:$RC),
@@ -1044,7 +1044,7 @@ def MADDLD : VAForm_1a<51, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB, gprc:$RC),
[(set i32:$RT, (add_without_simm16 (mul_without_simm16 i32:$RA, i32:$RB), i32:$RC))]>,
isPPC64;
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
- def MADDLD8 : VAForm_1a<51,
+ def MADDLD8 : VAForm_1a<51,
(outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC),
"maddld $RT, $RA, $RB, $RC", IIC_IntMulHD,
[(set i64:$RT, (add_without_simm16 (mul_without_simm16 i64:$RA, i64:$RB), i64:$RC))]>,
@@ -1349,8 +1349,8 @@ def LWZX8 : XForm_1_memOp<31, 23, (outs g8rc:$RST), (ins (memrr $RA, $RB):$addr
"lwzx $RST, $addr", IIC_LdStLoad,
[(set i64:$RST, (zextloadi32 XForm:$addr))]>,
ZExt32To64;
-
-
+
+
// Update forms.
let mayLoad = 1, hasSideEffects = 0 in {
def LBZU8 : DForm_1<35, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
@@ -1635,7 +1635,7 @@ def PADDIdtprel : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm
let PPC970_Unit = 2 in {
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
-// Truncating stores.
+// Truncating stores.
def STB8 : DForm_1<38, (outs), (ins g8rc:$RST, (memri $D, $RA):$addr),
"stb $RST, $addr", IIC_LdStStore,
[(truncstorei8 i64:$RST, DForm:$addr)]>;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index 09f829943528..1686249c0f89 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -723,7 +723,7 @@ def PCRelativeMemops : Predicate<"Subtarget->hasPCRelativeMemops()">;
def IsNotISA3_1 : Predicate<"!Subtarget->isISA3_1()">;
// AIX assembler may not be modern enough to support some extended mne.
-def ModernAs: Predicate<"!Subtarget->isAIXABI() || Subtarget->HasModernAIXAs">,
+def ModernAs: Predicate<"!Subtarget->isAIXABI() || Subtarget->HasModernAIXAs">,
AssemblerPredicate<(any_of (not AIXOS), FeatureModernAIXAs)>;
def IsAIX : Predicate<"Subtarget->isAIXABI()">;
def NotAIX : Predicate<"!Subtarget->isAIXABI()">;
@@ -1747,114 +1747,114 @@ def : Pat<(int_ppc_dcbtst_with_hint xoaddr:$dst, i32:$TH),
let Defs = [CR0] in {
def ATOMIC_LOAD_ADD_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I8",
- [(set i32:$dst, (atomic_load_add_8 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_add_i8 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_SUB_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I8",
- [(set i32:$dst, (atomic_load_sub_8 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_sub_i8 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_AND_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I8",
- [(set i32:$dst, (atomic_load_and_8 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_and_i8 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_OR_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I8",
- [(set i32:$dst, (atomic_load_or_8 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_or_i8 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_XOR_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "ATOMIC_LOAD_XOR_I8",
- [(set i32:$dst, (atomic_load_xor_8 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_xor_i8 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_NAND_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I8",
- [(set i32:$dst, (atomic_load_nand_8 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_nand_i8 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_MIN_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I8",
- [(set i32:$dst, (atomic_load_min_8 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_min_i8 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_MAX_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I8",
- [(set i32:$dst, (atomic_load_max_8 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_max_i8 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_UMIN_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I8",
- [(set i32:$dst, (atomic_load_umin_8 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_umin_i8 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_UMAX_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I8",
- [(set i32:$dst, (atomic_load_umax_8 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_umax_i8 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_ADD_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I16",
- [(set i32:$dst, (atomic_load_add_16 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_add_i16 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_SUB_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I16",
- [(set i32:$dst, (atomic_load_sub_16 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_sub_i16 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_AND_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I16",
- [(set i32:$dst, (atomic_load_and_16 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_and_i16 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_OR_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I16",
- [(set i32:$dst, (atomic_load_or_16 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_or_i16 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_XOR_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I16",
- [(set i32:$dst, (atomic_load_xor_16 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_xor_i16 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_NAND_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I16",
- [(set i32:$dst, (atomic_load_nand_16 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_nand_i16 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_MIN_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I16",
- [(set i32:$dst, (atomic_load_min_16 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_min_i16 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_MAX_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I16",
- [(set i32:$dst, (atomic_load_max_16 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_max_i16 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_UMIN_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I16",
- [(set i32:$dst, (atomic_load_umin_16 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_umin_i16 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_UMAX_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I16",
- [(set i32:$dst, (atomic_load_umax_16 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_umax_i16 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_ADD_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I32",
- [(set i32:$dst, (atomic_load_add_32 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_add_i32 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_SUB_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I32",
- [(set i32:$dst, (atomic_load_sub_32 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_sub_i32 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_AND_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I32",
- [(set i32:$dst, (atomic_load_and_32 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_and_i32 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_OR_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I32",
- [(set i32:$dst, (atomic_load_or_32 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_or_i32 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_XOR_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I32",
- [(set i32:$dst, (atomic_load_xor_32 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_xor_i32 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_NAND_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I32",
- [(set i32:$dst, (atomic_load_nand_32 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_nand_i32 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_MIN_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I32",
- [(set i32:$dst, (atomic_load_min_32 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_min_i32 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_MAX_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I32",
- [(set i32:$dst, (atomic_load_max_32 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_max_i32 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_UMIN_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I32",
- [(set i32:$dst, (atomic_load_umin_32 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_umin_i32 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_LOAD_UMAX_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I32",
- [(set i32:$dst, (atomic_load_umax_32 ForceXForm:$ptr, i32:$incr))]>;
+ [(set i32:$dst, (atomic_load_umax_i32 ForceXForm:$ptr, i32:$incr))]>;
def ATOMIC_CMP_SWAP_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I8",
- [(set i32:$dst, (atomic_cmp_swap_8 ForceXForm:$ptr, i32:$old, i32:$new))]>;
+ [(set i32:$dst, (atomic_cmp_swap_i8 ForceXForm:$ptr, i32:$old, i32:$new))]>;
def ATOMIC_CMP_SWAP_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I16 $dst $ptr $old $new",
- [(set i32:$dst, (atomic_cmp_swap_16 ForceXForm:$ptr, i32:$old, i32:$new))]>;
+ [(set i32:$dst, (atomic_cmp_swap_i16 ForceXForm:$ptr, i32:$old, i32:$new))]>;
def ATOMIC_CMP_SWAP_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I32 $dst $ptr $old $new",
- [(set i32:$dst, (atomic_cmp_swap_32 ForceXForm:$ptr, i32:$old, i32:$new))]>;
+ [(set i32:$dst, (atomic_cmp_swap_i32 ForceXForm:$ptr, i32:$old, i32:$new))]>;
def ATOMIC_SWAP_I8 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_i8",
- [(set i32:$dst, (atomic_swap_8 ForceXForm:$ptr, i32:$new))]>;
+ [(set i32:$dst, (atomic_swap_i8 ForceXForm:$ptr, i32:$new))]>;
def ATOMIC_SWAP_I16 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I16",
- [(set i32:$dst, (atomic_swap_16 ForceXForm:$ptr, i32:$new))]>;
+ [(set i32:$dst, (atomic_swap_i16 ForceXForm:$ptr, i32:$new))]>;
def ATOMIC_SWAP_I32 : PPCCustomInserterPseudo<
(outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I32",
- [(set i32:$dst, (atomic_swap_32 ForceXForm:$ptr, i32:$new))]>;
+ [(set i32:$dst, (atomic_swap_i32 ForceXForm:$ptr, i32:$new))]>;
}
def : Pat<(PPCatomicCmpSwap_8 ForceXForm:$ptr, i32:$old, i32:$new),
diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
index c6db8a7bbeb8..0b515c9f798f 100644
--- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -156,12 +156,12 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LiveVariables>();
- AU.addRequired<MachineDominatorTree>();
- AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addRequired<MachinePostDominatorTreeWrapperPass>();
AU.addRequired<MachineBlockFrequencyInfo>();
AU.addPreserved<LiveVariables>();
- AU.addPreserved<MachineDominatorTree>();
- AU.addPreserved<MachinePostDominatorTree>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
AU.addPreserved<MachineBlockFrequencyInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -200,8 +200,8 @@ void PPCMIPeephole::addRegToUpdateWithLine(Register Reg, int Line) {
void PPCMIPeephole::initialize(MachineFunction &MFParm) {
MF = &MFParm;
MRI = &MF->getRegInfo();
- MDT = &getAnalysis<MachineDominatorTree>();
- MPDT = &getAnalysis<MachinePostDominatorTree>();
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+ MPDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
LV = &getAnalysis<LiveVariables>();
EntryFreq = MBFI->getEntryFreq();
@@ -448,6 +448,9 @@ void PPCMIPeephole::convertUnprimedAccPHIs(
if (MRI->isSSA())
addRegToUpdate(RegMBB.first.getReg());
}
+ // The liveness of old PHI and new PHI have to be updated.
+ addRegToUpdate(PHI->getOperand(0).getReg());
+ addRegToUpdate(AccReg);
ChangedPHIMap[PHI] = NewPHI.getInstr();
LLVM_DEBUG(dbgs() << "Converting PHI: ");
LLVM_DEBUG(PHI->dump());
@@ -2029,8 +2032,8 @@ bool PPCMIPeephole::combineSEXTAndSHL(MachineInstr &MI,
INITIALIZE_PASS_BEGIN(PPCMIPeephole, DEBUG_TYPE,
"PowerPC MI Peephole Optimization", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LiveVariables)
INITIALIZE_PASS_END(PPCMIPeephole, DEBUG_TYPE,
"PowerPC MI Peephole Optimization", false, false)
diff --git a/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp b/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
index 0504db239f67..d1cc2ad5c481 100644
--- a/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
+++ b/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
@@ -427,7 +427,7 @@ public:
CRLogicalOpInfo createCRLogicalOpInfo(MachineInstr &MI);
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineBranchProbabilityInfo>();
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
@@ -730,7 +730,7 @@ void PPCReduceCRLogicals::collectCRLogicals() {
INITIALIZE_PASS_BEGIN(PPCReduceCRLogicals, DEBUG_TYPE,
"PowerPC Reduce CR logical Operation", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(PPCReduceCRLogicals, DEBUG_TYPE,
"PowerPC Reduce CR logical Operation", false, false)
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
index 8a37e40414ee..fdbdc14736c8 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -199,7 +199,7 @@ let SubRegIndices = [sub_vsx0, sub_vsx1] in {
def VSRp#!add(!srl(Index, 1), 16) :
VSRPair<!add(!srl(Index, 1), 16), "vsp"#!add(Index, 32),
[!cast<VR>("V"#Index), !cast<VR>("V"#!add(Index, 1))]>,
- DwarfRegNum<[-1, -1]>;
+ DwarfRegAlias<!cast<VR>("V"#Index)>;
}
}
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index 0628fbb26245..bd9af12b30f5 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -38,11 +38,6 @@ using namespace llvm;
#include "PPCGenSubtargetInfo.inc"
static cl::opt<bool>
- UseSubRegLiveness("ppc-track-subreg-liveness",
- cl::desc("Enable subregister liveness tracking for PPC"),
- cl::init(true), cl::Hidden);
-
-static cl::opt<bool>
EnableMachinePipeliner("ppc-enable-pipeliner",
cl::desc("Enable Machine Pipeliner for PPC"),
cl::init(false), cl::Hidden);
@@ -186,9 +181,7 @@ bool PPCSubtarget::useAA() const {
return true;
}
-bool PPCSubtarget::enableSubRegLiveness() const {
- return UseSubRegLiveness;
-}
+bool PPCSubtarget::enableSubRegLiveness() const { return true; }
bool PPCSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const {
if (isAIXABI()) {
diff --git a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
index 0d8c71f9f2e6..69e046972f3d 100644
--- a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
+++ b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -368,8 +368,8 @@ public:
AU.addPreserved<LiveIntervals>();
AU.addRequired<SlotIndexes>();
AU.addPreserved<SlotIndexes>();
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
@@ -379,7 +379,7 @@ INITIALIZE_PASS_BEGIN(PPCVSXFMAMutate, DEBUG_TYPE,
"PowerPC VSX FMA Mutation", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(PPCVSXFMAMutate, DEBUG_TYPE,
"PowerPC VSX FMA Mutation", false, false)
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 5906a2cdb3bf..8ac1cdf0a7a9 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -2155,6 +2155,16 @@ bool RISCVAsmParser::parseVTypeToken(const AsmToken &Tok, VTypeState &State,
break;
if (!RISCVVType::isValidLMUL(Lmul, Fractional))
break;
+
+ if (Fractional) {
+ unsigned ELEN = STI->hasFeature(RISCV::FeatureStdExtZve64x) ? 64 : 32;
+ unsigned MinLMUL = ELEN / 8;
+ if (Lmul > MinLMUL)
+ Warning(Tok.getLoc(),
+ "use of vtype encodings with LMUL < SEWMIN/ELEN == mf" +
+ Twine(MinLMUL) + " is reserved");
+ }
+
State = VTypeState_TailPolicy;
return false;
}
@@ -2194,6 +2204,7 @@ ParseStatus RISCVAsmParser::parseVTypeI(OperandVector &Operands) {
bool MaskAgnostic = false;
VTypeState State = VTypeState_SEW;
+ SMLoc SEWLoc = S;
if (parseVTypeToken(getTok(), State, Sew, Lmul, Fractional, TailAgnostic,
MaskAgnostic))
@@ -2211,6 +2222,16 @@ ParseStatus RISCVAsmParser::parseVTypeI(OperandVector &Operands) {
if (getLexer().is(AsmToken::EndOfStatement) && State == VTypeState_Done) {
RISCVII::VLMUL VLMUL = RISCVVType::encodeLMUL(Lmul, Fractional);
+ if (Fractional) {
+ unsigned ELEN = STI->hasFeature(RISCV::FeatureStdExtZve64x) ? 64 : 32;
+ unsigned MaxSEW = ELEN / Lmul;
+ // If MaxSEW < 8, we should have printed warning about reserved LMUL.
+ if (MaxSEW >= 8 && Sew > MaxSEW)
+ Warning(SEWLoc,
+ "use of vtype encodings with SEW > " + Twine(MaxSEW) +
+ " and LMUL == mf" + Twine(Lmul) +
+ " may not be compatible with all RVV implementations");
+ }
unsigned VTypeI =
RISCVVType::encodeVTYPE(VLMUL, Sew, TailAgnostic, MaskAgnostic);
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
index beee9405de02..b5f8715598f3 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
@@ -102,9 +102,14 @@ struct RISCVOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
void assignValueToReg(Register ValVReg, Register PhysReg,
const CCValAssign &VA) override {
- // If we're passing an f32 value into an i64, anyextend before copying.
- if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32)
- ValVReg = MIRBuilder.buildAnyExt(LLT::scalar(64), ValVReg).getReg(0);
+ // If we're passing a smaller fp value into a larger integer register,
+ // anyextend before copying.
+ if ((VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) ||
+ ((VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::i64) &&
+ VA.getValVT() == MVT::f16)) {
+ LLT DstTy = LLT::scalar(VA.getLocVT().getSizeInBits());
+ ValVReg = MIRBuilder.buildAnyExt(DstTy, ValVReg).getReg(0);
+ }
Register ExtReg = extendRegister(ValVReg, VA);
MIRBuilder.buildCopy(PhysReg, ExtReg);
@@ -336,10 +341,8 @@ static bool isLegalElementTypeForRVV(Type *EltTy,
// TODO: Remove IsLowerArgs argument by adding support for vectors in lowerCall.
static bool isSupportedArgumentType(Type *T, const RISCVSubtarget &Subtarget,
bool IsLowerArgs = false) {
- // TODO: Integers larger than 2*XLen are passed indirectly which is not
- // supported yet.
if (T->isIntegerTy())
- return T->getIntegerBitWidth() <= Subtarget.getXLen() * 2;
+ return true;
if (T->isHalfTy() || T->isFloatTy() || T->isDoubleTy())
return true;
if (T->isPointerTy())
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
index a091380e8ce8..f511a2010980 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
@@ -577,12 +577,14 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) {
const APFloat &FPimm = MI.getOperand(1).getFPImm()->getValueAPF();
APInt Imm = FPimm.bitcastToAPInt();
unsigned Size = MRI.getType(DstReg).getSizeInBits();
- if (Size == 32 || (Size == 64 && Subtarget->is64Bit())) {
+ if (Size == 16 || Size == 32 || (Size == 64 && Subtarget->is64Bit())) {
Register GPRReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
if (!materializeImm(GPRReg, Imm.getSExtValue(), MIB))
return false;
- unsigned Opcode = Size == 64 ? RISCV::FMV_D_X : RISCV::FMV_W_X;
+ unsigned Opcode = Size == 64 ? RISCV::FMV_D_X
+ : Size == 32 ? RISCV::FMV_W_X
+ : RISCV::FMV_H_X;
auto FMV = MIB.buildInstr(Opcode, {DstReg}, {GPRReg});
if (!FMV.constrainAllUses(TII, TRI, RBI))
return false;
@@ -1146,16 +1148,16 @@ bool RISCVInstructionSelector::selectSelect(MachineInstr &MI,
// Convert an FCMP predicate to one of the supported F or D instructions.
static unsigned getFCmpOpcode(CmpInst::Predicate Pred, unsigned Size) {
- assert((Size == 32 || Size == 64) && "Unsupported size");
+ assert((Size == 16 || Size == 32 || Size == 64) && "Unsupported size");
switch (Pred) {
default:
llvm_unreachable("Unsupported predicate");
case CmpInst::FCMP_OLT:
- return Size == 32 ? RISCV::FLT_S : RISCV::FLT_D;
+ return Size == 16 ? RISCV::FLT_H : Size == 32 ? RISCV::FLT_S : RISCV::FLT_D;
case CmpInst::FCMP_OLE:
- return Size == 32 ? RISCV::FLE_S : RISCV::FLE_D;
+ return Size == 16 ? RISCV::FLE_H : Size == 32 ? RISCV::FLE_S : RISCV::FLE_D;
case CmpInst::FCMP_OEQ:
- return Size == 32 ? RISCV::FEQ_S : RISCV::FEQ_D;
+ return Size == 16 ? RISCV::FEQ_H : Size == 32 ? RISCV::FEQ_S : RISCV::FEQ_D;
}
}
@@ -1207,7 +1209,7 @@ bool RISCVInstructionSelector::selectFPCompare(MachineInstr &MI,
Register RHS = CmpMI.getRHSReg();
unsigned Size = MRI.getType(LHS).getSizeInBits();
- assert((Size == 32 || Size == 64) && "Unexpected size");
+ assert((Size == 16 || Size == 32 || Size == 64) && "Unexpected size");
Register TmpReg = DstReg;
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index 816e62ea24ed..f033ea725003 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -35,7 +35,8 @@ static LegalityPredicate typeIsScalarFPArith(unsigned TypeIdx,
const RISCVSubtarget &ST) {
return [=, &ST](const LegalityQuery &Query) {
return Query.Types[TypeIdx].isScalar() &&
- ((ST.hasStdExtF() && Query.Types[TypeIdx].getSizeInBits() == 32) ||
+ ((ST.hasStdExtZfh() && Query.Types[TypeIdx].getSizeInBits() == 16) ||
+ (ST.hasStdExtF() && Query.Types[TypeIdx].getSizeInBits() == 32) ||
(ST.hasStdExtD() && Query.Types[TypeIdx].getSizeInBits() == 64));
};
}
@@ -305,7 +306,7 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
getActionDefinitionsBuilder({G_GLOBAL_VALUE, G_JUMP_TABLE, G_CONSTANT_POOL})
.legalFor({p0});
- if (ST.hasStdExtM() || ST.hasStdExtZmmul()) {
+ if (ST.hasStdExtZmmul()) {
getActionDefinitionsBuilder(G_MUL)
.legalFor({s32, sXLen})
.widenScalarToNextPow2(0)
@@ -383,15 +384,24 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
getActionDefinitionsBuilder(G_FCOPYSIGN)
.legalIf(all(typeIsScalarFPArith(0, ST), typeIsScalarFPArith(1, ST)));
+ // FIXME: Use Zfhmin.
getActionDefinitionsBuilder(G_FPTRUNC).legalIf(
[=, &ST](const LegalityQuery &Query) -> bool {
return (ST.hasStdExtD() && typeIs(0, s32)(Query) &&
+ typeIs(1, s64)(Query)) ||
+ (ST.hasStdExtZfh() && typeIs(0, s16)(Query) &&
+ typeIs(1, s32)(Query)) ||
+ (ST.hasStdExtZfh() && ST.hasStdExtD() && typeIs(0, s16)(Query) &&
typeIs(1, s64)(Query));
});
getActionDefinitionsBuilder(G_FPEXT).legalIf(
[=, &ST](const LegalityQuery &Query) -> bool {
return (ST.hasStdExtD() && typeIs(0, s64)(Query) &&
- typeIs(1, s32)(Query));
+ typeIs(1, s32)(Query)) ||
+ (ST.hasStdExtZfh() && typeIs(0, s32)(Query) &&
+ typeIs(1, s16)(Query)) ||
+ (ST.hasStdExtZfh() && ST.hasStdExtD() && typeIs(0, s64)(Query) &&
+ typeIs(1, s16)(Query));
});
getActionDefinitionsBuilder(G_FCMP)
@@ -462,13 +472,6 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
getLegacyLegalizerInfo().computeTables();
}
-static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
- if (Ty.isVector())
- return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
- Ty.getNumElements());
- return IntegerType::get(C, Ty.getSizeInBits());
-}
-
bool RISCVLegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const {
Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp
index 9c28944abc76..8fa9dba28538 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp
@@ -112,8 +112,8 @@ void RISCVPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
getSelectionDAGFallbackAnalysisUsage(AU);
AU.addRequired<GISelKnownBitsAnalysis>();
AU.addPreserved<GISelKnownBitsAnalysis>();
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.addRequired<GISelCSEAnalysisWrapperPass>();
AU.addPreserved<GISelCSEAnalysisWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
@@ -143,7 +143,8 @@ bool RISCVPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
const auto *LI = ST.getLegalizerInfo();
GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
- MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
+ MachineDominatorTree *MDT =
+ &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
GISelCSEAnalysisWrapper &Wrapper =
getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp b/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp
index 9a35fffae058..6a695119be25 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp
@@ -109,8 +109,8 @@ void RISCVPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
getSelectionDAGFallbackAnalysisUsage(AU);
AU.addRequired<GISelKnownBitsAnalysis>();
AU.addPreserved<GISelKnownBitsAnalysis>();
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.addRequired<GISelCSEAnalysisWrapperPass>();
AU.addPreserved<GISelCSEAnalysisWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
@@ -142,7 +142,8 @@ bool RISCVPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
bool EnableOpt =
MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
- MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
+ MachineDominatorTree *MDT =
+ &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
/*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(),
F.hasMinSize());
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
index 686c8d89a732..d25e96525399 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
@@ -29,6 +29,7 @@ const RegisterBankInfo::PartialMapping PartMappings[] = {
// clang-format off
{0, 32, GPRBRegBank},
{0, 64, GPRBRegBank},
+ {0, 16, FPRBRegBank},
{0, 32, FPRBRegBank},
{0, 64, FPRBRegBank},
{0, 64, VRBRegBank},
@@ -41,12 +42,13 @@ const RegisterBankInfo::PartialMapping PartMappings[] = {
enum PartialMappingIdx {
PMI_GPRB32 = 0,
PMI_GPRB64 = 1,
- PMI_FPRB32 = 2,
- PMI_FPRB64 = 3,
- PMI_VRB64 = 4,
- PMI_VRB128 = 5,
- PMI_VRB256 = 6,
- PMI_VRB512 = 7,
+ PMI_FPRB16 = 2,
+ PMI_FPRB32 = 3,
+ PMI_FPRB64 = 4,
+ PMI_VRB64 = 5,
+ PMI_VRB128 = 6,
+ PMI_VRB256 = 7,
+ PMI_VRB512 = 8,
};
const RegisterBankInfo::ValueMapping ValueMappings[] = {
@@ -60,6 +62,10 @@ const RegisterBankInfo::ValueMapping ValueMappings[] = {
{&PartMappings[PMI_GPRB64], 1},
{&PartMappings[PMI_GPRB64], 1},
{&PartMappings[PMI_GPRB64], 1},
+ // Maximum 3 FPR operands; 16 bit.
+ {&PartMappings[PMI_FPRB16], 1},
+ {&PartMappings[PMI_FPRB16], 1},
+ {&PartMappings[PMI_FPRB16], 1},
// Maximum 3 FPR operands; 32 bit.
{&PartMappings[PMI_FPRB32], 1},
{&PartMappings[PMI_FPRB32], 1},
@@ -90,12 +96,13 @@ enum ValueMappingIdx {
InvalidIdx = 0,
GPRB32Idx = 1,
GPRB64Idx = 4,
- FPRB32Idx = 7,
- FPRB64Idx = 10,
- VRB64Idx = 13,
- VRB128Idx = 16,
- VRB256Idx = 19,
- VRB512Idx = 22,
+ FPRB16Idx = 7,
+ FPRB32Idx = 10,
+ FPRB64Idx = 13,
+ VRB64Idx = 16,
+ VRB128Idx = 19,
+ VRB256Idx = 22,
+ VRB512Idx = 25,
};
} // namespace RISCV
} // namespace llvm
@@ -151,8 +158,20 @@ RISCVRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
}
static const RegisterBankInfo::ValueMapping *getFPValueMapping(unsigned Size) {
- assert(Size == 32 || Size == 64);
- unsigned Idx = Size == 64 ? RISCV::FPRB64Idx : RISCV::FPRB32Idx;
+ unsigned Idx;
+ switch (Size) {
+ default:
+ llvm_unreachable("Unexpected size");
+ case 16:
+ Idx = RISCV::FPRB16Idx;
+ break;
+ case 32:
+ Idx = RISCV::FPRB32Idx;
+ break;
+ case 64:
+ Idx = RISCV::FPRB64Idx;
+ break;
+ }
return &RISCV::ValueMappings[Idx];
}
@@ -459,7 +478,6 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
LLT Ty = MRI.getType(MI.getOperand(2).getReg());
unsigned Size = Ty.getSizeInBits();
- assert((Size == 32 || Size == 64) && "Unsupported size for G_FCMP");
OpdsMapping[0] = GPRValueMapping;
OpdsMapping[2] = OpdsMapping[3] = getFPValueMapping(Size);
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
index ae7ce476fff2..87c5a756e025 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
@@ -159,8 +159,7 @@ void RISCVELFStreamer::emitMappingSymbol(StringRef Name) {
Symbol->setBinding(ELF::STB_LOCAL);
}
-void RISCVELFStreamer::changeSection(MCSection *Section,
- const MCExpr *Subsection) {
+void RISCVELFStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
// We have to keep track of the mapping symbol state of any sections we
// use. Each one should start off as EMS_None, which is provided as the
// default constructor by DenseMap::lookup.
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
index 212d731889f1..40c6b5ac3dcc 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
@@ -32,7 +32,7 @@ public:
std::unique_ptr<MCCodeEmitter> MCE)
: MCELFStreamer(C, std::move(MAB), std::move(MOW), std::move(MCE)) {}
- void changeSection(MCSection *Section, const MCExpr *Subsection) override;
+ void changeSection(MCSection *Section, uint32_t Subsection) override;
void emitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
void emitBytes(StringRef Data) override;
void emitFill(const MCExpr &NumBytes, uint64_t FillValue, SMLoc Loc) override;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
index b8e0f3a867f4..d83dadd30161 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
@@ -62,7 +62,7 @@ const MCFixup *RISCVMCExpr::getPCRelHiFixup(const MCFragment **DFOut) const {
uint64_t Offset = AUIPCSymbol->getOffset();
if (DF->getContents().size() == Offset) {
- DF = dyn_cast_or_null<MCDataFragment>(DF->getNextNode());
+ DF = dyn_cast_or_null<MCDataFragment>(DF->getNext());
if (!DF)
return nullptr;
Offset = 0;
diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
index 09f496574d64..d96fafbe6080 100644
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -51,6 +51,7 @@ include "RISCVSchedSiFive7.td"
include "RISCVSchedSiFiveP400.td"
include "RISCVSchedSiFiveP600.td"
include "RISCVSchedSyntacoreSCR1.td"
+include "RISCVSchedSyntacoreSCR3.td"
include "RISCVSchedXiangShanNanHu.td"
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 9bf06850483d..a5e34def81c8 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -171,23 +171,21 @@ def NoHasStdExtZicfiss : Predicate<"!Subtarget->hasStdExtZicfiss()">;
// Multiply Extensions
+def FeatureStdExtZmmul
+ : RISCVExtension<"zmmul", 1, 0,
+ "'Zmmul' (Integer Multiplication)">;
+def HasStdExtZmmul : Predicate<"Subtarget->hasStdExtZmmul()">,
+ AssemblerPredicate<(all_of FeatureStdExtZmmul),
+ "'Zmmul' (Integer Multiplication)">;
+
def FeatureStdExtM
: RISCVExtension<"m", 2, 0,
- "'M' (Integer Multiplication and Division)">;
+ "'M' (Integer Multiplication and Division)",
+ [FeatureStdExtZmmul]>;
def HasStdExtM : Predicate<"Subtarget->hasStdExtM()">,
AssemblerPredicate<(all_of FeatureStdExtM),
"'M' (Integer Multiplication and Division)">;
-def FeatureStdExtZmmul
- : RISCVExtension<"zmmul", 1, 0,
- "'Zmmul' (Integer Multiplication)">;
-
-def HasStdExtMOrZmmul
- : Predicate<"Subtarget->hasStdExtM() || Subtarget->hasStdExtZmmul()">,
- AssemblerPredicate<(any_of FeatureStdExtM, FeatureStdExtZmmul),
- "'M' (Integer Multiplication and Division) or "
- "'Zmmul' (Integer Multiplication)">;
-
// Atomic Extensions
def FeatureStdExtA
@@ -477,6 +475,14 @@ def HasStdExtZbs : Predicate<"Subtarget->hasStdExtZbs()">,
// Bitmanip Extensions for Cryptography Extensions
+def FeatureStdExtB
+ : RISCVExtension<"b", 1, 0,
+ "'B' (the collection of the Zba, Zbb, Zbs extensions)",
+ [FeatureStdExtZba, FeatureStdExtZbb, FeatureStdExtZbs]>;
+def HasStdExtB : Predicate<"Subtarget->hasStdExtB()">,
+ AssemblerPredicate<(all_of FeatureStdExtB),
+ "'B' (the collection of the Zba, Zbb, Zbs extensions)">;
+
def FeatureStdExtZbkb
: RISCVExtension<"zbkb", 1, 0,
"'Zbkb' (Bitmanip instructions for Cryptography)">;
@@ -847,10 +853,24 @@ def FeatureStdExtSsaia
"'Ssaia' (Advanced Interrupt Architecture Supervisor "
"Level)">;
+def FeatureStdExtSmcsrind
+ : RISCVExtension<"smcsrind", 1, 0,
+ "'Smcsrind' (Indirect CSR Access Machine Level)">;
+def FeatureStdExtSscsrind
+ : RISCVExtension<"sscsrind", 1, 0,
+ "'Sscsrind' (Indirect CSR Access Supervisor Level)">;
+
def FeatureStdExtSmepmp
: RISCVExtension<"smepmp", 1, 0,
"'Smepmp' (Enhanced Physical Memory Protection)">;
+def FeatureStdExtSmcdeleg
+ : RISCVExtension<"smcdeleg", 1, 0,
+ "'Smcdeleg' (Counter Delegation Machine Level)">;
+def FeatureStdExtSsccfg
+ : RISCVExtension<"ssccfg", 1, 0,
+ "'Ssccfg' (Counter Configuration Supervisor Level)">;
+
def FeatureStdExtSsccptr
: RISCVExtension<"ssccptr", 1, 0,
"'Ssccptr' (Main memory supports page table reads)">;
@@ -883,7 +903,7 @@ def FeatureStdExtSstc
: RISCVExtension<"sstc", 1, 0,
"'Sstc' (Supervisor-mode timer interrupts)">;
-def FeaturesSsqosid
+def FeaturesStdExtSsqosid
: RISCVExperimentalExtension<"ssqosid", 1, 0,
"'Ssqosid' (Quality-of-Service (QoS) Identifiers)">;
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 436bd4a38a31..e676c2f94583 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -878,9 +878,9 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
StackID == TargetStackID::ScalableVector) &&
"Unexpected stack ID for the frame object.");
if (StackID == TargetStackID::Default) {
- Offset =
- StackOffset::getFixed(MFI.getObjectOffset(FI) - getOffsetOfLocalArea() +
- MFI.getOffsetAdjustment());
+ assert(getOffsetOfLocalArea() == 0 && "LocalAreaOffset is not 0!");
+ Offset = StackOffset::getFixed(MFI.getObjectOffset(FI) +
+ MFI.getOffsetAdjustment());
} else if (StackID == TargetStackID::ScalableVector) {
Offset = StackOffset::getScalable(MFI.getObjectOffset(FI));
}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index af3950773e4d..a648ee2c9571 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -300,7 +300,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setLibcallName(RTLIB::MULO_I64, nullptr);
}
- if (!Subtarget.hasStdExtM() && !Subtarget.hasStdExtZmmul()) {
+ if (!Subtarget.hasStdExtZmmul()) {
setOperationAction({ISD::MUL, ISD::MULHS, ISD::MULHU}, XLenVT, Expand);
if (RV64LegalI32 && Subtarget.is64Bit())
setOperationAction(ISD::MUL, MVT::i32, Promote);
@@ -662,6 +662,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setBooleanContents(ZeroOrOneBooleanContent);
+ if (getTargetMachine().getTargetTriple().isOSLinux()) {
+ // Custom lowering of llvm.clear_cache.
+ setOperationAction(ISD::CLEAR_CACHE, MVT::Other, Custom);
+ }
+
if (Subtarget.hasVInstructions()) {
setBooleanVectorContents(ZeroOrOneBooleanContent);
@@ -1102,12 +1107,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::EXTRACT_SUBVECTOR},
VT, Custom);
setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom);
- if (Subtarget.hasStdExtZfbfmin()) {
- if (Subtarget.hasVInstructionsF16())
- setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
- else if (Subtarget.hasVInstructionsF16Minimal())
- setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
- }
+ if (Subtarget.hasStdExtZfbfmin())
+ setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
setOperationAction({ISD::VP_MERGE, ISD::VP_SELECT, ISD::SELECT}, VT,
Custom);
setOperationAction(ISD::SELECT_CC, VT, Expand);
@@ -1340,12 +1341,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::EXTRACT_SUBVECTOR},
VT, Custom);
setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom);
- if (Subtarget.hasStdExtZfbfmin()) {
- if (Subtarget.hasVInstructionsF16())
- setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
- else if (Subtarget.hasVInstructionsF16Minimal())
- setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
- }
+ if (Subtarget.hasStdExtZfbfmin())
+ setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
setOperationAction(
{ISD::VP_MERGE, ISD::VP_SELECT, ISD::VSELECT, ISD::SELECT}, VT,
Custom);
@@ -6738,9 +6735,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
Subtarget.hasStdExtZfhminOrZhinxmin() &&
!Subtarget.hasVInstructionsF16())) ||
(Op.getValueType().getScalarType() == MVT::bf16 &&
- (Subtarget.hasVInstructionsBF16() && Subtarget.hasStdExtZfbfmin() &&
- Subtarget.hasVInstructionsF16Minimal() &&
- !Subtarget.hasVInstructionsF16()))) {
+ (Subtarget.hasVInstructionsBF16() && Subtarget.hasStdExtZfbfmin()))) {
if (Op.getValueType() == MVT::nxv32f16 ||
Op.getValueType() == MVT::nxv32bf16)
return SplitVectorOp(Op, DAG);
@@ -7152,7 +7147,27 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerVPSpliceExperimental(Op, DAG);
case ISD::EXPERIMENTAL_VP_REVERSE:
return lowerVPReverseExperimental(Op, DAG);
+ case ISD::CLEAR_CACHE: {
+ assert(getTargetMachine().getTargetTriple().isOSLinux() &&
+ "llvm.clear_cache only needs custom lower on Linux targets");
+ SDLoc DL(Op);
+ SDValue Flags = DAG.getConstant(0, DL, Subtarget.getXLenVT());
+ return emitFlushICache(DAG, Op.getOperand(0), Op.getOperand(1),
+ Op.getOperand(2), Flags, DL);
}
+ }
+}
+
+SDValue RISCVTargetLowering::emitFlushICache(SelectionDAG &DAG, SDValue InChain,
+ SDValue Start, SDValue End,
+ SDValue Flags, SDLoc DL) const {
+ MakeLibCallOptions CallOptions;
+ std::pair<SDValue, SDValue> CallResult =
+ makeLibCall(DAG, RTLIB::RISCV_FLUSH_ICACHE, MVT::isVoid,
+ {Start, End, Flags}, CallOptions, DL, InChain);
+
+ // This function returns void so only the out chain matters.
+ return CallResult.second;
}
static SDValue getTargetNode(GlobalAddressSDNode *N, const SDLoc &DL, EVT Ty,
@@ -12487,12 +12502,15 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
}
break;
}
- case RISCVISD::BREV8: {
+ case RISCVISD::BREV8:
+ case RISCVISD::ORC_B: {
MVT VT = N->getSimpleValueType(0);
MVT XLenVT = Subtarget.getXLenVT();
assert((VT == MVT::i16 || (VT == MVT::i32 && Subtarget.is64Bit())) &&
"Unexpected custom legalisation");
- assert(Subtarget.hasStdExtZbkb() && "Unexpected extension");
+ assert(((N->getOpcode() == RISCVISD::BREV8 && Subtarget.hasStdExtZbkb()) ||
+ (N->getOpcode() == RISCVISD::ORC_B && Subtarget.hasStdExtZbb())) &&
+ "Unexpected extension");
SDValue NewOp = DAG.getNode(ISD::ANY_EXTEND, DL, XLenVT, N->getOperand(0));
SDValue NewRes = DAG.getNode(N->getOpcode(), DL, XLenVT, NewOp);
// ReplaceNodeResults requires we maintain the same type for the return
@@ -13330,6 +13348,35 @@ static SDValue combineSubOfBoolean(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(ISD::ADD, DL, VT, NewLHS, NewRHS);
}
+// Looks for (sub (shl X, 8), X) where only bits 8, 16, 24, 32, etc. of X are
+// non-zero. Replace with orc.b.
+static SDValue combineSubShiftToOrcB(SDNode *N, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ if (!Subtarget.hasStdExtZbb())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ if (VT != Subtarget.getXLenVT() && VT != MVT::i32 && VT != MVT::i16)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ if (N0.getOpcode() != ISD::SHL || N0.getOperand(0) != N1 || !N0.hasOneUse())
+ return SDValue();
+
+ auto *ShAmtC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (!ShAmtC || ShAmtC->getZExtValue() != 8)
+ return SDValue();
+
+ APInt Mask = APInt::getSplat(VT.getSizeInBits(), APInt(8, 0xfe));
+ if (!DAG.MaskedValueIsZero(N1, Mask))
+ return SDValue();
+
+ return DAG.getNode(RISCVISD::ORC_B, SDLoc(N), VT, N1);
+}
+
static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
if (SDValue V = combineSubOfBoolean(N, DAG))
@@ -13352,6 +13399,8 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineBinOpOfZExt(N, DAG))
return V;
+ if (SDValue V = combineSubShiftToOrcB(N, DAG, Subtarget))
+ return V;
// fold (sub x, (select lhs, rhs, cc, 0, y)) ->
// (select lhs, rhs, cc, x, (sub x, y))
@@ -13691,8 +13740,8 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
if (VT != Subtarget.getXLenVT())
return SDValue();
- if (!Subtarget.hasStdExtZba() && !Subtarget.hasVendorXTHeadBa())
- return SDValue();
+ const bool HasShlAdd =
+ Subtarget.hasStdExtZba() || Subtarget.hasVendorXTHeadBa();
ConstantSDNode *CNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!CNode)
@@ -13705,107 +13754,123 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
// other target properly freezes X in these cases either.
SDValue X = N->getOperand(0);
- for (uint64_t Divisor : {3, 5, 9}) {
- if (MulAmt % Divisor != 0)
- continue;
- uint64_t MulAmt2 = MulAmt / Divisor;
- // 3/5/9 * 2^N -> shl (shXadd X, X), N
- if (isPowerOf2_64(MulAmt2)) {
- SDLoc DL(N);
- SDValue X = N->getOperand(0);
- // Put the shift first if we can fold a zext into the
- // shift forming a slli.uw.
- if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) &&
- X.getConstantOperandVal(1) == UINT64_C(0xffffffff)) {
- SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X,
- DAG.getConstant(Log2_64(MulAmt2), DL, VT));
- return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl,
- DAG.getConstant(Log2_64(Divisor - 1), DL, VT), Shl);
+ if (HasShlAdd) {
+ for (uint64_t Divisor : {3, 5, 9}) {
+ if (MulAmt % Divisor != 0)
+ continue;
+ uint64_t MulAmt2 = MulAmt / Divisor;
+ // 3/5/9 * 2^N -> shl (shXadd X, X), N
+ if (isPowerOf2_64(MulAmt2)) {
+ SDLoc DL(N);
+ SDValue X = N->getOperand(0);
+ // Put the shift first if we can fold a zext into the
+ // shift forming a slli.uw.
+ if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) &&
+ X.getConstantOperandVal(1) == UINT64_C(0xffffffff)) {
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X,
+ DAG.getConstant(Log2_64(MulAmt2), DL, VT));
+ return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl,
+ DAG.getConstant(Log2_64(Divisor - 1), DL, VT),
+ Shl);
+ }
+ // Otherwise, put rhe shl second so that it can fold with following
+ // instructions (e.g. sext or add).
+ SDValue Mul359 =
+ DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+ DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
+ return DAG.getNode(ISD::SHL, DL, VT, Mul359,
+ DAG.getConstant(Log2_64(MulAmt2), DL, VT));
+ }
+
+ // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X)
+ if (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9) {
+ SDLoc DL(N);
+ SDValue Mul359 =
+ DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+ DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
+ return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
+ DAG.getConstant(Log2_64(MulAmt2 - 1), DL, VT),
+ Mul359);
}
- // Otherwise, put rhe shl second so that it can fold with following
- // instructions (e.g. sext or add).
- SDValue Mul359 =
- DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
- return DAG.getNode(ISD::SHL, DL, VT, Mul359,
- DAG.getConstant(Log2_64(MulAmt2), DL, VT));
}
- // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X)
- if (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9) {
- SDLoc DL(N);
- SDValue Mul359 =
- DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
- return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
- DAG.getConstant(Log2_64(MulAmt2 - 1), DL, VT),
- Mul359);
- }
- }
-
- // If this is a power 2 + 2/4/8, we can use a shift followed by a single
- // shXadd. First check if this a sum of two power of 2s because that's
- // easy. Then count how many zeros are up to the first bit.
- if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
- unsigned ScaleShift = llvm::countr_zero(MulAmt);
- if (ScaleShift >= 1 && ScaleShift < 4) {
- unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
- SDLoc DL(N);
- SDValue Shift1 =
- DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT));
- return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getConstant(ScaleShift, DL, VT), Shift1);
+ // If this is a power 2 + 2/4/8, we can use a shift followed by a single
+ // shXadd. First check if this a sum of two power of 2s because that's
+ // easy. Then count how many zeros are up to the first bit.
+ if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
+ unsigned ScaleShift = llvm::countr_zero(MulAmt);
+ if (ScaleShift >= 1 && ScaleShift < 4) {
+ unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
+ SDLoc DL(N);
+ SDValue Shift1 =
+ DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT));
+ return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+ DAG.getConstant(ScaleShift, DL, VT), Shift1);
+ }
}
- }
- // 2^(1,2,3) * 3,5,9 + 1 -> (shXadd (shYadd x, x), x)
- // This is the two instruction form, there are also three instruction
- // variants we could implement. e.g.
- // (2^(1,2,3) * 3,5,9 + 1) << C2
- // 2^(C1>3) * 3,5,9 +/- 1
- for (uint64_t Divisor : {3, 5, 9}) {
- uint64_t C = MulAmt - 1;
- if (C <= Divisor)
- continue;
- unsigned TZ = llvm::countr_zero(C);
- if ((C >> TZ) == Divisor && (TZ == 1 || TZ == 2 || TZ == 3)) {
- SDLoc DL(N);
- SDValue Mul359 =
- DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
- return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
- DAG.getConstant(TZ, DL, VT), X);
+ // 2^(1,2,3) * 3,5,9 + 1 -> (shXadd (shYadd x, x), x)
+ // This is the two instruction form, there are also three instruction
+ // variants we could implement. e.g.
+ // (2^(1,2,3) * 3,5,9 + 1) << C2
+ // 2^(C1>3) * 3,5,9 +/- 1
+ for (uint64_t Divisor : {3, 5, 9}) {
+ uint64_t C = MulAmt - 1;
+ if (C <= Divisor)
+ continue;
+ unsigned TZ = llvm::countr_zero(C);
+ if ((C >> TZ) == Divisor && (TZ == 1 || TZ == 2 || TZ == 3)) {
+ SDLoc DL(N);
+ SDValue Mul359 =
+ DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+ DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
+ return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
+ DAG.getConstant(TZ, DL, VT), X);
+ }
}
- }
- // 2^n + 2/4/8 + 1 -> (add (shl X, C1), (shXadd X, X))
- if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) {
- unsigned ScaleShift = llvm::countr_zero(MulAmt - 1);
- if (ScaleShift >= 1 && ScaleShift < 4) {
- unsigned ShiftAmt = Log2_64(((MulAmt - 1) & (MulAmt - 2)));
- SDLoc DL(N);
- SDValue Shift1 =
- DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT));
- return DAG.getNode(ISD::ADD, DL, VT, Shift1,
- DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getConstant(ScaleShift, DL, VT), X));
+ // 2^n + 2/4/8 + 1 -> (add (shl X, C1), (shXadd X, X))
+ if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) {
+ unsigned ScaleShift = llvm::countr_zero(MulAmt - 1);
+ if (ScaleShift >= 1 && ScaleShift < 4) {
+ unsigned ShiftAmt = Log2_64(((MulAmt - 1) & (MulAmt - 2)));
+ SDLoc DL(N);
+ SDValue Shift1 =
+ DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT));
+ return DAG.getNode(ISD::ADD, DL, VT, Shift1,
+ DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+ DAG.getConstant(ScaleShift, DL, VT), X));
+ }
}
- }
- // 2^N - 3/5/9 --> (sub (shl X, C1), (shXadd X, x))
- for (uint64_t Offset : {3, 5, 9}) {
- if (isPowerOf2_64(MulAmt + Offset)) {
- SDLoc DL(N);
- SDValue Shift1 =
- DAG.getNode(ISD::SHL, DL, VT, X,
- DAG.getConstant(Log2_64(MulAmt + Offset), DL, VT));
- SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getConstant(Log2_64(Offset - 1), DL, VT),
- X);
- return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359);
+ // 2^N - 3/5/9 --> (sub (shl X, C1), (shXadd X, x))
+ for (uint64_t Offset : {3, 5, 9}) {
+ if (isPowerOf2_64(MulAmt + Offset)) {
+ SDLoc DL(N);
+ SDValue Shift1 =
+ DAG.getNode(ISD::SHL, DL, VT, X,
+ DAG.getConstant(Log2_64(MulAmt + Offset), DL, VT));
+ SDValue Mul359 =
+ DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+ DAG.getConstant(Log2_64(Offset - 1), DL, VT), X);
+ return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359);
+ }
}
}
+ // 2^N - 2^M -> (sub (shl X, C1), (shl X, C2))
+ uint64_t MulAmtLowBit = MulAmt & (-MulAmt);
+ if (isPowerOf2_64(MulAmt + MulAmtLowBit)) {
+ uint64_t ShiftAmt1 = MulAmt + MulAmtLowBit;
+ SDLoc DL(N);
+ SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(ShiftAmt1), DL, VT));
+ SDValue Shift2 =
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(MulAmtLowBit), DL, VT));
+ return DAG.getNode(ISD::SUB, DL, VT, Shift1, Shift2);
+ }
+
return SDValue();
}
@@ -16829,7 +16894,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
DAG.getNode(ISD::FNEG, DL, VT, NewFPExtRound));
}
case ISD::MGATHER: {
- const auto *MGN = dyn_cast<MaskedGatherSDNode>(N);
+ const auto *MGN = cast<MaskedGatherSDNode>(N);
const EVT VT = N->getValueType(0);
SDValue Index = MGN->getIndex();
SDValue ScaleOp = MGN->getScale();
@@ -16929,7 +16994,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
case ISD::MSCATTER:{
- const auto *MSN = dyn_cast<MaskedScatterSDNode>(N);
+ const auto *MSN = cast<MaskedScatterSDNode>(N);
SDValue Index = MSN->getIndex();
SDValue ScaleOp = MSN->getScale();
ISD::MemIndexType IndexType = MSN->getIndexType();
@@ -16965,7 +17030,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
case ISD::VP_GATHER: {
- const auto *VPGN = dyn_cast<VPGatherSDNode>(N);
+ const auto *VPGN = cast<VPGatherSDNode>(N);
SDValue Index = VPGN->getIndex();
SDValue ScaleOp = VPGN->getScale();
ISD::MemIndexType IndexType = VPGN->getIndexType();
@@ -16990,7 +17055,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
case ISD::VP_SCATTER: {
- const auto *VPSN = dyn_cast<VPScatterSDNode>(N);
+ const auto *VPSN = cast<VPScatterSDNode>(N);
SDValue Index = VPSN->getIndex();
SDValue ScaleOp = VPSN->getScale();
ISD::MemIndexType IndexType = VPSN->getIndexType();
@@ -21113,14 +21178,13 @@ bool RISCVTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned)
bool RISCVTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
SDValue C) const {
// Check integral scalar types.
- const bool HasExtMOrZmmul =
- Subtarget.hasStdExtM() || Subtarget.hasStdExtZmmul();
+ const bool HasZmmul = Subtarget.hasStdExtZmmul();
if (!VT.isScalarInteger())
return false;
// Omit the optimization if the sub target has the M extension and the data
// size exceeds XLen.
- if (HasExtMOrZmmul && VT.getSizeInBits() > Subtarget.getXLen())
+ if (HasZmmul && VT.getSizeInBits() > Subtarget.getXLen())
return false;
if (auto *ConstNode = dyn_cast<ConstantSDNode>(C.getNode())) {
@@ -21755,7 +21819,7 @@ bool RISCVTargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
Op == Instruction::And || Op == Instruction::Or ||
Op == Instruction::Xor || Op == Instruction::InsertElement ||
Op == Instruction::ShuffleVector || Op == Instruction::Load ||
- Op == Instruction::Freeze)
+ Op == Instruction::Freeze || Op == Instruction::Store)
return false;
if (Inst.getType()->isScalableTy())
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 3b8eb3c88901..7d8bceb5cb34 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -1037,6 +1037,9 @@ private:
const APInt &AndMask) const override;
unsigned getMinimumJumpTableEntries() const override;
+
+ SDValue emitFlushICache(SelectionDAG &DAG, SDValue InChain, SDValue Start,
+ SDValue End, SDValue Flags, SDLoc DL) const;
};
/// As per the spec, the rules for passing vector arguments are as follows:
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 82358cdd45ed..101f188374e0 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -48,10 +48,13 @@ static cl::opt<bool> DisableInsertVSETVLPHIOpt(
namespace {
/// Given a virtual register \p Reg, return the corresponding VNInfo for it.
-/// This will return nullptr if the virtual register is an implicit_def.
+/// This will return nullptr if the virtual register is an implicit_def or
+/// if LiveIntervals is not available.
static VNInfo *getVNInfoFromReg(Register Reg, const MachineInstr &MI,
const LiveIntervals *LIS) {
assert(Reg.isVirtual());
+ if (!LIS)
+ return nullptr;
auto &LI = LIS->getInterval(Reg);
SlotIndex SI = LIS->getSlotIndexes()->getInstructionIndex(MI);
return LI.getVNInfoBefore(SI);
@@ -249,6 +252,13 @@ struct DemandedFields {
VLZeroness = true;
}
+ static DemandedFields all() {
+ DemandedFields DF;
+ DF.demandVTYPE();
+ DF.demandVL();
+ return DF;
+ }
+
// Make this the result of demanding both the fields in this and B.
void doUnion(const DemandedFields &B) {
VLAny |= B.VLAny;
@@ -397,7 +407,9 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) {
if (RISCVII::hasSEWOp(TSFlags)) {
Res.demandVTYPE();
if (RISCVII::hasVLOp(TSFlags))
- Res.demandVL();
+ if (const MachineOperand &VLOp = MI.getOperand(getVLOpNum(MI));
+ !VLOp.isReg() || !VLOp.isUndef())
+ Res.demandVL();
// Behavior is independent of mask policy.
if (!RISCVII::usesMaskPolicy(TSFlags))
@@ -503,7 +515,8 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) {
/// values of the VL and VTYPE registers after insertion.
class VSETVLIInfo {
struct AVLDef {
- // Every AVLDef should have a VNInfo.
+ // Every AVLDef should have a VNInfo, unless we're running without
+ // LiveIntervals in which case this will be nullptr.
const VNInfo *ValNo;
Register DefReg;
};
@@ -517,8 +530,7 @@ class VSETVLIInfo {
AVLIsReg,
AVLIsImm,
AVLIsVLMAX,
- AVLIsIgnored,
- Unknown,
+ Unknown, // AVL and VTYPE are fully unknown
} State = Uninitialized;
// Fields from VTYPE.
@@ -544,7 +556,7 @@ public:
bool isUnknown() const { return State == Unknown; }
void setAVLRegDef(const VNInfo *VNInfo, Register AVLReg) {
- assert(VNInfo && AVLReg.isVirtual());
+ assert(AVLReg.isVirtual());
AVLRegDef.ValNo = VNInfo;
AVLRegDef.DefReg = AVLReg;
State = AVLIsReg;
@@ -557,12 +569,9 @@ public:
void setAVLVLMAX() { State = AVLIsVLMAX; }
- void setAVLIgnored() { State = AVLIsIgnored; }
-
bool hasAVLImm() const { return State == AVLIsImm; }
bool hasAVLReg() const { return State == AVLIsReg; }
bool hasAVLVLMAX() const { return State == AVLIsVLMAX; }
- bool hasAVLIgnored() const { return State == AVLIsIgnored; }
Register getAVLReg() const {
assert(hasAVLReg() && AVLRegDef.DefReg.isVirtual());
return AVLRegDef.DefReg;
@@ -577,9 +586,11 @@ public:
}
// Most AVLIsReg infos will have a single defining MachineInstr, unless it was
// a PHI node. In that case getAVLVNInfo()->def will point to the block
- // boundary slot.
+ // boundary slot. If LiveIntervals isn't available, then nullptr is returned.
const MachineInstr *getAVLDefMI(const LiveIntervals *LIS) const {
assert(hasAVLReg());
+ if (!LIS)
+ return nullptr;
auto *MI = LIS->getInstructionFromIndex(getAVLVNInfo()->def);
assert(!(getAVLVNInfo()->isPHIDef() && MI));
return MI;
@@ -593,8 +604,6 @@ public:
setAVLRegDef(Info.getAVLVNInfo(), Info.getAVLReg());
else if (Info.hasAVLVLMAX())
setAVLVLMAX();
- else if (Info.hasAVLIgnored())
- setAVLIgnored();
else {
assert(Info.hasAVLImm());
setAVLImm(Info.getAVLImm());
@@ -615,8 +624,6 @@ public:
}
if (hasAVLVLMAX())
return true;
- if (hasAVLIgnored())
- return false;
return false;
}
@@ -627,10 +634,15 @@ public:
return (hasNonZeroAVL(LIS) && Other.hasNonZeroAVL(LIS));
}
- bool hasSameAVL(const VSETVLIInfo &Other) const {
- if (hasAVLReg() && Other.hasAVLReg())
+ bool hasSameAVLLatticeValue(const VSETVLIInfo &Other) const {
+ if (hasAVLReg() && Other.hasAVLReg()) {
+ assert(!getAVLVNInfo() == !Other.getAVLVNInfo() &&
+ "we either have intervals or we don't");
+ if (!getAVLVNInfo())
+ return getAVLReg() == Other.getAVLReg();
return getAVLVNInfo()->id == Other.getAVLVNInfo()->id &&
getAVLReg() == Other.getAVLReg();
+ }
if (hasAVLImm() && Other.hasAVLImm())
return getAVLImm() == Other.getAVLImm();
@@ -638,12 +650,24 @@ public:
if (hasAVLVLMAX())
return Other.hasAVLVLMAX() && hasSameVLMAX(Other);
- if (hasAVLIgnored())
- return Other.hasAVLIgnored();
-
return false;
}
+ // Return true if the two lattice values are guaranteed to have
+ // the same AVL value at runtime.
+ bool hasSameAVL(const VSETVLIInfo &Other) const {
+ // Without LiveIntervals, we don't know which instruction defines a
+ // register. Since a register may be redefined, this means all AVLIsReg
+ // states must be treated as possibly distinct.
+ if (hasAVLReg() && Other.hasAVLReg()) {
+ assert(!getAVLVNInfo() == !Other.getAVLVNInfo() &&
+ "we either have intervals or we don't");
+ if (!getAVLVNInfo())
+ return false;
+ }
+ return hasSameAVLLatticeValue(Other);
+ }
+
void setVTYPE(unsigned VType) {
assert(isValid() && !isUnknown() &&
"Can't set VTYPE for uninitialized or unknown");
@@ -713,14 +737,12 @@ public:
const LiveIntervals *LIS) const {
assert(isValid() && Require.isValid() &&
"Can't compare invalid VSETVLIInfos");
- assert(!Require.SEWLMULRatioOnly &&
- "Expected a valid VTYPE for instruction!");
// Nothing is compatible with Unknown.
if (isUnknown() || Require.isUnknown())
return false;
// If only our VLMAX ratio is valid, then this isn't compatible.
- if (SEWLMULRatioOnly)
+ if (SEWLMULRatioOnly || Require.SEWLMULRatioOnly)
return false;
if (Used.VLAny && !(hasSameAVL(Require) && hasSameVLMAX(Require)))
@@ -745,7 +767,7 @@ public:
if (Other.isUnknown())
return isUnknown();
- if (!hasSameAVL(Other))
+ if (!hasSameAVLLatticeValue(Other))
return false;
// If the SEWLMULRatioOnly bits are different, then they aren't equal.
@@ -811,13 +833,11 @@ public:
if (isUnknown())
OS << "unknown";
if (hasAVLReg())
- OS << "AVLReg=" << (unsigned)getAVLReg();
+ OS << "AVLReg=" << llvm::printReg(getAVLReg());
if (hasAVLImm())
OS << "AVLImm=" << (unsigned)AVLImm;
if (hasAVLVLMAX())
OS << "AVLVLMAX";
- if (hasAVLIgnored())
- OS << "AVLIgnored";
OS << ", "
<< "VLMul=" << (unsigned)VLMul << ", "
<< "SEW=" << (unsigned)SEW << ", "
@@ -855,6 +875,7 @@ class RISCVInsertVSETVLI : public MachineFunctionPass {
const RISCVSubtarget *ST;
const TargetInstrInfo *TII;
MachineRegisterInfo *MRI;
+ // Possibly null!
LiveIntervals *LIS;
std::vector<BlockData> BlockInfo;
@@ -869,9 +890,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<LiveIntervals>();
+ AU.addUsedIfAvailable<LiveIntervals>();
AU.addPreserved<LiveIntervals>();
- AU.addRequired<SlotIndexes>();
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveDebugVariables>();
AU.addPreserved<LiveStacks>();
@@ -933,7 +953,8 @@ RISCVInsertVSETVLI::getInfoForVSETVLI(const MachineInstr &MI) const {
if (AVLReg == RISCV::X0)
NewInfo.setAVLVLMAX();
else if (MI.getOperand(1).isUndef())
- NewInfo.setAVLIgnored();
+ // Otherwise use an AVL of 1 to avoid depending on previous vl.
+ NewInfo.setAVLImm(1);
else {
VNInfo *VNI = getVNInfoFromReg(AVLReg, MI, LIS);
NewInfo.setAVLRegDef(VNI, AVLReg);
@@ -941,6 +962,17 @@ RISCVInsertVSETVLI::getInfoForVSETVLI(const MachineInstr &MI) const {
}
NewInfo.setVTYPE(MI.getOperand(2).getImm());
+ // If AVL is defined by a vsetvli with the same VLMAX, we can replace the
+ // AVL operand with the AVL of the defining vsetvli.
+ if (NewInfo.hasAVLReg()) {
+ if (const MachineInstr *DefMI = NewInfo.getAVLDefMI(LIS);
+ DefMI && isVectorConfigInstr(*DefMI)) {
+ VSETVLIInfo DefInstrInfo = getInfoForVSETVLI(*DefMI);
+ if (DefInstrInfo.hasSameVLMAX(NewInfo))
+ NewInfo.setAVL(DefInstrInfo);
+ }
+ }
+
return NewInfo;
}
@@ -1009,17 +1041,17 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const {
else
InstrInfo.setAVLImm(Imm);
} else if (VLOp.isUndef()) {
- InstrInfo.setAVLIgnored();
+ // Otherwise use an AVL of 1 to avoid depending on previous vl.
+ InstrInfo.setAVLImm(1);
} else {
VNInfo *VNI = getVNInfoFromReg(VLOp.getReg(), MI, LIS);
InstrInfo.setAVLRegDef(VNI, VLOp.getReg());
}
} else {
assert(isScalarExtractInstr(MI));
- // TODO: If we are more clever about x0,x0 insertion then we should be able
- // to deduce that the VL is ignored based off of DemandedFields, and remove
- // the AVLIsIgnored state. Then we can just use an arbitrary immediate AVL.
- InstrInfo.setAVLIgnored();
+ // Pick a random value for state tracking purposes, will be ignored via
+ // the demanded fields mechanism
+ InstrInfo.setAVLImm(1);
}
#ifndef NDEBUG
if (std::optional<unsigned> EEW = getEEWForLoadStore(MI)) {
@@ -1029,15 +1061,12 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const {
InstrInfo.setVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic);
// If AVL is defined by a vsetvli with the same VLMAX, we can replace the
- // AVL operand with the AVL of the defining vsetvli. We avoid general
- // register AVLs to avoid extending live ranges without being sure we can
- // kill the original source reg entirely.
+ // AVL operand with the AVL of the defining vsetvli.
if (InstrInfo.hasAVLReg()) {
if (const MachineInstr *DefMI = InstrInfo.getAVLDefMI(LIS);
DefMI && isVectorConfigInstr(*DefMI)) {
VSETVLIInfo DefInstrInfo = getInfoForVSETVLI(*DefMI);
- if (DefInstrInfo.hasSameVLMAX(InstrInfo) &&
- (DefInstrInfo.hasAVLImm() || DefInstrInfo.hasAVLVLMAX()))
+ if (DefInstrInfo.hasSameVLMAX(InstrInfo))
InstrInfo.setAVL(DefInstrInfo);
}
}
@@ -1066,7 +1095,8 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
.addReg(RISCV::X0, RegState::Kill)
.addImm(Info.encodeVTYPE())
.addReg(RISCV::VL, RegState::Implicit);
- LIS->InsertMachineInstrInMaps(*MI);
+ if (LIS)
+ LIS->InsertMachineInstrInMaps(*MI);
return;
}
@@ -1083,7 +1113,8 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
.addReg(RISCV::X0, RegState::Kill)
.addImm(Info.encodeVTYPE())
.addReg(RISCV::VL, RegState::Implicit);
- LIS->InsertMachineInstrInMaps(*MI);
+ if (LIS)
+ LIS->InsertMachineInstrInMaps(*MI);
return;
}
}
@@ -1095,29 +1126,8 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
.addReg(RISCV::X0, RegState::Define | RegState::Dead)
.addImm(Info.getAVLImm())
.addImm(Info.encodeVTYPE());
- LIS->InsertMachineInstrInMaps(*MI);
- return;
- }
-
- if (Info.hasAVLIgnored()) {
- // We can only use x0, x0 if there's no chance of the vtype change causing
- // the previous vl to become invalid.
- if (PrevInfo.isValid() && !PrevInfo.isUnknown() &&
- Info.hasSameVLMAX(PrevInfo)) {
- auto MI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
- .addReg(RISCV::X0, RegState::Define | RegState::Dead)
- .addReg(RISCV::X0, RegState::Kill)
- .addImm(Info.encodeVTYPE())
- .addReg(RISCV::VL, RegState::Implicit);
+ if (LIS)
LIS->InsertMachineInstrInMaps(*MI);
- return;
- }
- // Otherwise use an AVL of 1 to avoid depending on previous vl.
- auto MI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETIVLI))
- .addReg(RISCV::X0, RegState::Define | RegState::Dead)
- .addImm(1)
- .addImm(Info.encodeVTYPE());
- LIS->InsertMachineInstrInMaps(*MI);
return;
}
@@ -1127,8 +1137,10 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
.addReg(DestReg, RegState::Define | RegState::Dead)
.addReg(RISCV::X0, RegState::Kill)
.addImm(Info.encodeVTYPE());
- LIS->InsertMachineInstrInMaps(*MI);
- LIS->createAndComputeVirtRegInterval(DestReg);
+ if (LIS) {
+ LIS->InsertMachineInstrInMaps(*MI);
+ LIS->createAndComputeVirtRegInterval(DestReg);
+ }
return;
}
@@ -1138,12 +1150,18 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
.addReg(RISCV::X0, RegState::Define | RegState::Dead)
.addReg(AVLReg)
.addImm(Info.encodeVTYPE());
- LIS->InsertMachineInstrInMaps(*MI);
- // Normally the AVL's live range will already extend past the inserted vsetvli
- // because the pseudos below will already use the AVL. But this isn't always
- // the case, e.g. PseudoVMV_X_S doesn't have an AVL operand.
- LIS->getInterval(AVLReg).extendInBlock(
- LIS->getMBBStartIdx(&MBB), LIS->getInstructionIndex(*MI).getRegSlot());
+ if (LIS) {
+ LIS->InsertMachineInstrInMaps(*MI);
+ // Normally the AVL's live range will already extend past the inserted
+ // vsetvli because the pseudos below will already use the AVL. But this
+ // isn't always the case, e.g. PseudoVMV_X_S doesn't have an AVL operand or
+ // we've taken the AVL from the VL output of another vsetvli.
+ LiveInterval &LI = LIS->getInterval(AVLReg);
+ // Need to get non-const VNInfo
+ VNInfo *VNI = LI.getValNumInfo(Info.getAVLVNInfo()->id);
+ LI.addSegment(LiveInterval::Segment(
+ VNI->def, LIS->getInstructionIndex(*MI).getRegSlot(), VNI));
+ }
}
/// Return true if a VSETVLI is required to transition from CurInfo to Require
@@ -1157,19 +1175,6 @@ bool RISCVInsertVSETVLI::needVSETVLI(const DemandedFields &Used,
if (CurInfo.isCompatible(Used, Require, LIS))
return false;
- // We didn't find a compatible value. If our AVL is a virtual register,
- // it might be defined by a VSET(I)VLI. If it has the same VLMAX we need
- // and the last VL/VTYPE we observed is the same, we don't need a
- // VSETVLI here.
- if (Require.hasAVLReg() && CurInfo.hasCompatibleVTYPE(Used, Require)) {
- if (const MachineInstr *DefMI = Require.getAVLDefMI(LIS);
- DefMI && isVectorConfigInstr(*DefMI)) {
- VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI);
- if (DefInfo.hasSameAVL(CurInfo) && DefInfo.hasSameVLMAX(CurInfo))
- return false;
- }
- }
-
return true;
}
@@ -1257,10 +1262,14 @@ void RISCVInsertVSETVLI::transferAfter(VSETVLIInfo &Info,
if (RISCV::isFaultFirstLoad(MI)) {
// Update AVL to vl-output of the fault first load.
assert(MI.getOperand(1).getReg().isVirtual());
- auto &LI = LIS->getInterval(MI.getOperand(1).getReg());
- SlotIndex SI = LIS->getSlotIndexes()->getInstructionIndex(MI).getRegSlot();
- VNInfo *VNI = LI.getVNInfoAt(SI);
- Info.setAVLRegDef(VNI, MI.getOperand(1).getReg());
+ if (LIS) {
+ auto &LI = LIS->getInterval(MI.getOperand(1).getReg());
+ SlotIndex SI =
+ LIS->getSlotIndexes()->getInstructionIndex(MI).getRegSlot();
+ VNInfo *VNI = LI.getVNInfoAt(SI);
+ Info.setAVLRegDef(VNI, MI.getOperand(1).getReg());
+ } else
+ Info.setAVLRegDef(nullptr, MI.getOperand(1).getReg());
return;
}
@@ -1354,6 +1363,9 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
if (!Require.hasAVLReg())
return true;
+ if (!LIS)
+ return true;
+
// We need the AVL to have been produced by a PHI node in this basic block.
const VNInfo *Valno = Require.getAVLVNInfo();
if (!Valno->isPHIDef() || LIS->getMBBFromIndex(Valno->def) != &MBB)
@@ -1412,7 +1424,7 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
uint64_t TSFlags = MI.getDesc().TSFlags;
if (RISCVII::hasSEWOp(TSFlags)) {
- if (PrevInfo != CurInfo) {
+ if (!PrevInfo.isCompatible(DemandedFields::all(), CurInfo, LIS)) {
// If this is the first implicit state change, and the state change
// requested can be proven to produce the same register contents, we
// can skip emitting the actual state change and continue as if we
@@ -1429,27 +1441,29 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
MachineOperand &VLOp = MI.getOperand(getVLOpNum(MI));
if (VLOp.isReg()) {
Register Reg = VLOp.getReg();
- LiveInterval &LI = LIS->getInterval(Reg);
// Erase the AVL operand from the instruction.
VLOp.setReg(RISCV::NoRegister);
VLOp.setIsKill(false);
- SmallVector<MachineInstr *> DeadMIs;
- LIS->shrinkToUses(&LI, &DeadMIs);
- // We might have separate components that need split due to
- // needVSETVLIPHI causing us to skip inserting a new VL def.
- SmallVector<LiveInterval *> SplitLIs;
- LIS->splitSeparateComponents(LI, SplitLIs);
-
- // If the AVL was an immediate > 31, then it would have been emitted
- // as an ADDI. However, the ADDI might not have been used in the
- // vsetvli, or a vsetvli might not have been emitted, so it may be
- // dead now.
- for (MachineInstr *DeadMI : DeadMIs) {
- if (!TII->isAddImmediate(*DeadMI, Reg))
- continue;
- LIS->RemoveMachineInstrFromMaps(*DeadMI);
- DeadMI->eraseFromParent();
+ if (LIS) {
+ LiveInterval &LI = LIS->getInterval(Reg);
+ SmallVector<MachineInstr *> DeadMIs;
+ LIS->shrinkToUses(&LI, &DeadMIs);
+ // We might have separate components that need split due to
+ // needVSETVLIPHI causing us to skip inserting a new VL def.
+ SmallVector<LiveInterval *> SplitLIs;
+ LIS->splitSeparateComponents(LI, SplitLIs);
+
+ // If the AVL was an immediate > 31, then it would have been emitted
+ // as an ADDI. However, the ADDI might not have been used in the
+ // vsetvli, or a vsetvli might not have been emitted, so it may be
+ // dead now.
+ for (MachineInstr *DeadMI : DeadMIs) {
+ if (!TII->isAddImmediate(*DeadMI, Reg))
+ continue;
+ LIS->RemoveMachineInstrFromMaps(*DeadMI);
+ DeadMI->eraseFromParent();
+ }
}
}
MI.addOperand(MachineOperand::CreateReg(RISCV::VL, /*isDef*/ false,
@@ -1506,6 +1520,9 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) {
if (!UnavailablePred || !AvailableInfo.isValid())
return;
+ if (!LIS)
+ return;
+
// If we don't know the exact VTYPE, we can't copy the vsetvli to the exit of
// the unavailable pred.
if (AvailableInfo.hasSEWLMULRatioOnly())
@@ -1529,11 +1546,6 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) {
return;
}
- // If the AVL isn't used in its predecessors then bail, since we have no AVL
- // to insert a vsetvli with.
- if (AvailableInfo.hasAVLIgnored())
- return;
-
// Model the effect of changing the input state of the block MBB to
// AvailableInfo. We're looking for two issues here; one legality,
// one profitability.
@@ -1657,7 +1669,7 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const {
// The def of DefReg moved to MI, so extend the LiveInterval up to
// it.
- if (DefReg.isVirtual()) {
+ if (DefReg.isVirtual() && LIS) {
LiveInterval &DefLI = LIS->getInterval(DefReg);
SlotIndex MISlot = LIS->getInstructionIndex(MI).getRegSlot();
VNInfo *DefVNI = DefLI.getVNInfoAt(DefLI.beginIndex());
@@ -1686,13 +1698,15 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const {
if (OldVLReg && OldVLReg.isVirtual()) {
// NextMI no longer uses OldVLReg so shrink its LiveInterval.
- LIS->shrinkToUses(&LIS->getInterval(OldVLReg));
+ if (LIS)
+ LIS->shrinkToUses(&LIS->getInterval(OldVLReg));
MachineInstr *VLOpDef = MRI->getUniqueVRegDef(OldVLReg);
if (VLOpDef && TII->isAddImmediate(*VLOpDef, OldVLReg) &&
MRI->use_nodbg_empty(OldVLReg)) {
VLOpDef->eraseFromParent();
- LIS->removeInterval(OldVLReg);
+ if (LIS)
+ LIS->removeInterval(OldVLReg);
}
}
MI.setDesc(NextMI->getDesc());
@@ -1708,7 +1722,8 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const {
NumCoalescedVSETVL += ToDelete.size();
for (auto *MI : ToDelete) {
- LIS->RemoveMachineInstrFromMaps(*MI);
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*MI);
MI->eraseFromParent();
}
}
@@ -1723,12 +1738,14 @@ void RISCVInsertVSETVLI::insertReadVL(MachineBasicBlock &MBB) {
auto ReadVLMI = BuildMI(MBB, I, MI.getDebugLoc(),
TII->get(RISCV::PseudoReadVL), VLOutput);
// Move the LiveInterval's definition down to PseudoReadVL.
- SlotIndex NewDefSI =
- LIS->InsertMachineInstrInMaps(*ReadVLMI).getRegSlot();
- LiveInterval &DefLI = LIS->getInterval(VLOutput);
- VNInfo *DefVNI = DefLI.getVNInfoAt(DefLI.beginIndex());
- DefLI.removeSegment(DefLI.beginIndex(), NewDefSI);
- DefVNI->def = NewDefSI;
+ if (LIS) {
+ SlotIndex NewDefSI =
+ LIS->InsertMachineInstrInMaps(*ReadVLMI).getRegSlot();
+ LiveInterval &DefLI = LIS->getInterval(VLOutput);
+ VNInfo *DefVNI = DefLI.getVNInfoAt(DefLI.beginIndex());
+ DefLI.removeSegment(DefLI.beginIndex(), NewDefSI);
+ DefVNI->def = NewDefSI;
+ }
}
// We don't use the vl output of the VLEFF/VLSEGFF anymore.
MI.getOperand(1).setReg(RISCV::X0);
@@ -1746,7 +1763,7 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
TII = ST->getInstrInfo();
MRI = &MF.getRegInfo();
- LIS = &getAnalysis<LiveIntervals>();
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
assert(BlockInfo.empty() && "Expect empty block infos");
BlockInfo.resize(MF.getNumBlockIDs());
@@ -1795,11 +1812,6 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF)
emitVSETVLIs(MBB);
- // Insert PseudoReadVL after VLEFF/VLSEGFF and replace it with the vl output
- // of VLEFF/VLSEGFF.
- for (MachineBasicBlock &MBB : MF)
- insertReadVL(MBB);
-
// Now that all vsetvlis are explicit, go through and do block local
// DSE and peephole based demanded fields based transforms. Note that
// this *must* be done outside the main dataflow so long as we allow
@@ -1809,6 +1821,11 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF)
coalesceVSETVLIs(MBB);
+ // Insert PseudoReadVL after VLEFF/VLSEGFF and replace it with the vl output
+ // of VLEFF/VLSEGFF.
+ for (MachineBasicBlock &MBB : MF)
+ insertReadVL(MBB);
+
BlockInfo.clear();
return HaveVectorOp;
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 444b9076005c..00eb83da652b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -3695,7 +3695,7 @@ void RISCVInstrInfo::mulImm(MachineFunction &MF, MachineBasicBlock &MBB,
.addReg(ScaledRegister, RegState::Kill)
.addReg(DestReg, RegState::Kill)
.setMIFlag(Flag);
- } else if (STI.hasStdExtM() || STI.hasStdExtZmmul()) {
+ } else if (STI.hasStdExtZmmul()) {
Register N = MRI.createVirtualRegister(&RISCV::GPRRegClass);
movImm(MBB, II, DL, N, Amount, Flag);
BuildMI(MBB, II, DL, get(RISCV::MUL), DestReg)
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
index 814e0ddf111e..493e1a5fdc74 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
@@ -166,25 +166,25 @@ let Predicates = !listconcat([HasStdExtA, HasStdExtZtso], ExtraPreds) in {
}
}
-defm : AMOPat<"atomic_swap_32", "AMOSWAP_W">;
-defm : AMOPat<"atomic_load_add_32", "AMOADD_W">;
-defm : AMOPat<"atomic_load_and_32", "AMOAND_W">;
-defm : AMOPat<"atomic_load_or_32", "AMOOR_W">;
-defm : AMOPat<"atomic_load_xor_32", "AMOXOR_W">;
-defm : AMOPat<"atomic_load_max_32", "AMOMAX_W">;
-defm : AMOPat<"atomic_load_min_32", "AMOMIN_W">;
-defm : AMOPat<"atomic_load_umax_32", "AMOMAXU_W">;
-defm : AMOPat<"atomic_load_umin_32", "AMOMINU_W">;
-
-defm : AMOPat<"atomic_swap_64", "AMOSWAP_D", i64, [IsRV64]>;
-defm : AMOPat<"atomic_load_add_64", "AMOADD_D", i64, [IsRV64]>;
-defm : AMOPat<"atomic_load_and_64", "AMOAND_D", i64, [IsRV64]>;
-defm : AMOPat<"atomic_load_or_64", "AMOOR_D", i64, [IsRV64]>;
-defm : AMOPat<"atomic_load_xor_64", "AMOXOR_D", i64, [IsRV64]>;
-defm : AMOPat<"atomic_load_max_64", "AMOMAX_D", i64, [IsRV64]>;
-defm : AMOPat<"atomic_load_min_64", "AMOMIN_D", i64, [IsRV64]>;
-defm : AMOPat<"atomic_load_umax_64", "AMOMAXU_D", i64, [IsRV64]>;
-defm : AMOPat<"atomic_load_umin_64", "AMOMINU_D", i64, [IsRV64]>;
+defm : AMOPat<"atomic_swap_i32", "AMOSWAP_W">;
+defm : AMOPat<"atomic_load_add_i32", "AMOADD_W">;
+defm : AMOPat<"atomic_load_and_i32", "AMOAND_W">;
+defm : AMOPat<"atomic_load_or_i32", "AMOOR_W">;
+defm : AMOPat<"atomic_load_xor_i32", "AMOXOR_W">;
+defm : AMOPat<"atomic_load_max_i32", "AMOMAX_W">;
+defm : AMOPat<"atomic_load_min_i32", "AMOMIN_W">;
+defm : AMOPat<"atomic_load_umax_i32", "AMOMAXU_W">;
+defm : AMOPat<"atomic_load_umin_i32", "AMOMINU_W">;
+
+defm : AMOPat<"atomic_swap_i64", "AMOSWAP_D", i64, [IsRV64]>;
+defm : AMOPat<"atomic_load_add_i64", "AMOADD_D", i64, [IsRV64]>;
+defm : AMOPat<"atomic_load_and_i64", "AMOAND_D", i64, [IsRV64]>;
+defm : AMOPat<"atomic_load_or_i64", "AMOOR_D", i64, [IsRV64]>;
+defm : AMOPat<"atomic_load_xor_i64", "AMOXOR_D", i64, [IsRV64]>;
+defm : AMOPat<"atomic_load_max_i64", "AMOMAX_D", i64, [IsRV64]>;
+defm : AMOPat<"atomic_load_min_i64", "AMOMIN_D", i64, [IsRV64]>;
+defm : AMOPat<"atomic_load_umax_i64", "AMOMAXU_D", i64, [IsRV64]>;
+defm : AMOPat<"atomic_load_umin_i64", "AMOMINU_D", i64, [IsRV64]>;
/// Pseudo AMOs
@@ -243,15 +243,15 @@ let Size = 20 in
def PseudoAtomicLoadNand32 : PseudoAMO;
// Ordering constants must be kept in sync with the AtomicOrdering enum in
// AtomicOrdering.h.
-def : Pat<(XLenVT (atomic_load_nand_32_monotonic GPR:$addr, GPR:$incr)),
+def : Pat<(XLenVT (atomic_load_nand_i32_monotonic GPR:$addr, GPR:$incr)),
(PseudoAtomicLoadNand32 GPR:$addr, GPR:$incr, 2)>;
-def : Pat<(XLenVT (atomic_load_nand_32_acquire GPR:$addr, GPR:$incr)),
+def : Pat<(XLenVT (atomic_load_nand_i32_acquire GPR:$addr, GPR:$incr)),
(PseudoAtomicLoadNand32 GPR:$addr, GPR:$incr, 4)>;
-def : Pat<(XLenVT (atomic_load_nand_32_release GPR:$addr, GPR:$incr)),
+def : Pat<(XLenVT (atomic_load_nand_i32_release GPR:$addr, GPR:$incr)),
(PseudoAtomicLoadNand32 GPR:$addr, GPR:$incr, 5)>;
-def : Pat<(XLenVT (atomic_load_nand_32_acq_rel GPR:$addr, GPR:$incr)),
+def : Pat<(XLenVT (atomic_load_nand_i32_acq_rel GPR:$addr, GPR:$incr)),
(PseudoAtomicLoadNand32 GPR:$addr, GPR:$incr, 6)>;
-def : Pat<(XLenVT (atomic_load_nand_32_seq_cst GPR:$addr, GPR:$incr)),
+def : Pat<(XLenVT (atomic_load_nand_i32_seq_cst GPR:$addr, GPR:$incr)),
(PseudoAtomicLoadNand32 GPR:$addr, GPR:$incr, 7)>;
let Size = 28 in
@@ -294,15 +294,15 @@ let Size = 20 in
def PseudoAtomicLoadNand64 : PseudoAMO;
// Ordering constants must be kept in sync with the AtomicOrdering enum in
// AtomicOrdering.h.
-def : Pat<(i64 (atomic_load_nand_64_monotonic GPR:$addr, GPR:$incr)),
+def : Pat<(i64 (atomic_load_nand_i64_monotonic GPR:$addr, GPR:$incr)),
(PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 2)>;
-def : Pat<(i64 (atomic_load_nand_64_acquire GPR:$addr, GPR:$incr)),
+def : Pat<(i64 (atomic_load_nand_i64_acquire GPR:$addr, GPR:$incr)),
(PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 4)>;
-def : Pat<(i64 (atomic_load_nand_64_release GPR:$addr, GPR:$incr)),
+def : Pat<(i64 (atomic_load_nand_i64_release GPR:$addr, GPR:$incr)),
(PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 5)>;
-def : Pat<(i64 (atomic_load_nand_64_acq_rel GPR:$addr, GPR:$incr)),
+def : Pat<(i64 (atomic_load_nand_i64_acq_rel GPR:$addr, GPR:$incr)),
(PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 6)>;
-def : Pat<(i64 (atomic_load_nand_64_seq_cst GPR:$addr, GPR:$incr)),
+def : Pat<(i64 (atomic_load_nand_i64_seq_cst GPR:$addr, GPR:$incr)),
(PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 7)>;
def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_xchg_i64,
@@ -354,12 +354,12 @@ multiclass PseudoCmpXchgPat<string Op, Pseudo CmpXchgInst,
let Predicates = [HasStdExtA, NoStdExtZacas] in {
def PseudoCmpXchg32 : PseudoCmpXchg;
-defm : PseudoCmpXchgPat<"atomic_cmp_swap_32", PseudoCmpXchg32>;
+defm : PseudoCmpXchgPat<"atomic_cmp_swap_i32", PseudoCmpXchg32>;
}
let Predicates = [HasStdExtA, NoStdExtZacas, IsRV64] in {
def PseudoCmpXchg64 : PseudoCmpXchg;
-defm : PseudoCmpXchgPat<"atomic_cmp_swap_64", PseudoCmpXchg64, i64>;
+defm : PseudoCmpXchgPat<"atomic_cmp_swap_i64", PseudoCmpXchg64, i64>;
}
let Predicates = [HasStdExtA] in {
@@ -422,18 +422,18 @@ let Predicates = !listconcat([HasStdExtA, HasStdExtZtso], ExtraPreds) in {
}
}
-defm : AMOPat2<"atomic_swap_32", "AMOSWAP_W", i32>;
-defm : AMOPat2<"atomic_load_add_32", "AMOADD_W", i32>;
-defm : AMOPat2<"atomic_load_and_32", "AMOAND_W", i32>;
-defm : AMOPat2<"atomic_load_or_32", "AMOOR_W", i32>;
-defm : AMOPat2<"atomic_load_xor_32", "AMOXOR_W", i32>;
-defm : AMOPat2<"atomic_load_max_32", "AMOMAX_W", i32>;
-defm : AMOPat2<"atomic_load_min_32", "AMOMIN_W", i32>;
-defm : AMOPat2<"atomic_load_umax_32", "AMOMAXU_W", i32>;
-defm : AMOPat2<"atomic_load_umin_32", "AMOMINU_W", i32>;
+defm : AMOPat2<"atomic_swap_i32", "AMOSWAP_W", i32>;
+defm : AMOPat2<"atomic_load_add_i32", "AMOADD_W", i32>;
+defm : AMOPat2<"atomic_load_and_i32", "AMOAND_W", i32>;
+defm : AMOPat2<"atomic_load_or_i32", "AMOOR_W", i32>;
+defm : AMOPat2<"atomic_load_xor_i32", "AMOXOR_W", i32>;
+defm : AMOPat2<"atomic_load_max_i32", "AMOMAX_W", i32>;
+defm : AMOPat2<"atomic_load_min_i32", "AMOMIN_W", i32>;
+defm : AMOPat2<"atomic_load_umax_i32", "AMOMAXU_W", i32>;
+defm : AMOPat2<"atomic_load_umin_i32", "AMOMINU_W", i32>;
let Predicates = [HasStdExtA, IsRV64] in
-defm : PseudoCmpXchgPat<"atomic_cmp_swap_32", PseudoCmpXchg32, i32>;
+defm : PseudoCmpXchgPat<"atomic_cmp_swap_i32", PseudoCmpXchg32, i32>;
let Predicates = [HasAtomicLdSt] in {
def : LdPat<atomic_load_8, LB, i32>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
index 8ea1560e5b37..8a2b32081dc5 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
@@ -24,7 +24,7 @@ def riscv_remuw : SDNode<"RISCVISD::REMUW", SDT_RISCVIntBinOpW>;
// Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtMOrZmmul] in {
+let Predicates = [HasStdExtZmmul] in {
def MUL : ALU_rr<0b0000001, 0b000, "mul", Commutable=1>,
Sched<[WriteIMul, ReadIMul, ReadIMul]>;
def MULH : ALU_rr<0b0000001, 0b001, "mulh", Commutable=1>,
@@ -33,7 +33,7 @@ def MULHSU : ALU_rr<0b0000001, 0b010, "mulhsu">,
Sched<[WriteIMul, ReadIMul, ReadIMul]>;
def MULHU : ALU_rr<0b0000001, 0b011, "mulhu", Commutable=1>,
Sched<[WriteIMul, ReadIMul, ReadIMul]>;
-} // Predicates = [HasStdExtMOrZmmul]
+} // Predicates = [HasStdExtZmmul]
let Predicates = [HasStdExtM] in {
def DIV : ALU_rr<0b0000001, 0b100, "div">,
@@ -46,10 +46,10 @@ def REMU : ALU_rr<0b0000001, 0b111, "remu">,
Sched<[WriteIRem, ReadIRem, ReadIRem]>;
} // Predicates = [HasStdExtM]
-let Predicates = [HasStdExtMOrZmmul, IsRV64], IsSignExtendingOpW = 1 in {
+let Predicates = [HasStdExtZmmul, IsRV64], IsSignExtendingOpW = 1 in {
def MULW : ALUW_rr<0b0000001, 0b000, "mulw", Commutable=1>,
Sched<[WriteIMul32, ReadIMul32, ReadIMul32]>;
-} // Predicates = [HasStdExtMOrZmmul, IsRV64]
+} // Predicates = [HasStdExtZmmul, IsRV64]
let Predicates = [HasStdExtM, IsRV64], IsSignExtendingOpW = 1 in {
def DIVW : ALUW_rr<0b0000001, 0b100, "divw">,
@@ -66,12 +66,12 @@ def REMUW : ALUW_rr<0b0000001, 0b111, "remuw">,
// Pseudo-instructions and codegen patterns
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtMOrZmmul] in {
+let Predicates = [HasStdExtZmmul] in {
def : PatGprGpr<mul, MUL>;
def : PatGprGpr<mulhs, MULH>;
def : PatGprGpr<mulhu, MULHU>;
def : PatGprGpr<riscv_mulhsu, MULHSU>;
-} // Predicates = [HasStdExtMOrZmmul]
+} // Predicates = [HasStdExtZmmul]
let Predicates = [HasStdExtM] in {
def : PatGprGpr<sdiv, DIV>;
@@ -81,7 +81,7 @@ def : PatGprGpr<urem, REMU>;
} // Predicates = [HasStdExtM]
// Select W instructions if only the lower 32-bits of the result are used.
-let Predicates = [HasStdExtMOrZmmul, IsRV64] in
+let Predicates = [HasStdExtZmmul, IsRV64] in
def : PatGprGpr<binop_allwusers<mul>, MULW>;
let Predicates = [HasStdExtM, IsRV64] in {
@@ -106,20 +106,20 @@ def : Pat<(srem (sexti32 (i64 GPR:$rs1)), (sexti32 (i64 GPR:$rs2))),
(REMW GPR:$rs1, GPR:$rs2)>;
} // Predicates = [HasStdExtM, IsRV64]
-let Predicates = [HasStdExtMOrZmmul, IsRV64, NotHasStdExtZba] in {
+let Predicates = [HasStdExtZmmul, IsRV64, NotHasStdExtZba] in {
// Special case for calculating the full 64-bit product of a 32x32 unsigned
// multiply where the inputs aren't known to be zero extended. We can shift the
// inputs left by 32 and use a MULHU. This saves two SRLIs needed to finish
// zeroing the upper 32 bits.
def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), (and GPR:$rs2, 0xffffffff))),
(MULHU (i64 (SLLI GPR:$rs1, 32)), (i64 (SLLI GPR:$rs2, 32)))>;
-} // Predicates = [HasStdExtMOrZmmul, IsRV64, NotHasStdExtZba]
+} // Predicates = [HasStdExtZmmul, IsRV64, NotHasStdExtZba]
//===----------------------------------------------------------------------===//
// Experimental RV64 i32 legalization patterns.
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtMOrZmmul, IsRV64] in {
+let Predicates = [HasStdExtZmmul, IsRV64] in {
def : PatGprGpr<mul, MULW, i32, i32>;
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 7a95ebad364c..45a57d117081 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -67,11 +67,6 @@
/// that terminology in code frequently refers to these as "TA" which is
/// confusing. We're in the process of migrating away from this
/// representation.
-/// * _TU w/o policy operand -- Has a passthrough operand, and always
-/// represents the tail undisturbed state.
-/// * _TU w/policy operand - Can represent all three policy states. If
-/// passthrough is IMPLICIT_DEF (or NoReg), then represents "undefined".
-/// Otherwise, policy operand and tablegen flags drive the interpretation.
///
//===----------------------------------------------------------------------===//
@@ -387,17 +382,6 @@ class GetIntVTypeInfo<VTypeInfo vti> {
!cast<string>(vti))));
}
-// This functor is used to obtain the fp vector type that has the same SEW and
-// multiplier as the input parameter type.
-class GetFpVTypeInfo<VTypeInfo vti> {
- // Equivalent integer vector type. Eg.
- // VF16M1 → VF16M1 (identity)
- // VBF16M1 → VF16M1
- VTypeInfo Vti = !cast<VTypeInfo>(!subst("VBF", "VF",
- !subst("VI", "VF",
- !cast<string>(vti))));
-}
-
class MTypeInfo<ValueType Mas, LMULInfo M, string Bx> {
ValueType Mask = Mas;
// {SEW, VLMul} values set a valid VType to deal with this mask type.
@@ -774,11 +758,6 @@ class GetVTypePredicates<VTypeInfo vti> {
true : [HasVInstructions]);
}
-class GetVTypeScalarPredicates<VTypeInfo vti> {
- list<Predicate> Predicates = !cond(!eq(vti.Scalar, bf16) : [HasStdExtZfbfmin],
- true : []);
-}
-
class VPseudoUSLoadNoMask<VReg RetClass,
int EEW> :
Pseudo<(outs RetClass:$rd),
@@ -1373,23 +1352,6 @@ class VPseudoIStoreMask<VReg StClass, VReg IdxClass, int EEW, bits<3> LMUL,
let HasSEWOp = 1;
}
-class VPseudoBinaryMask<VReg RetClass,
- RegisterClass Op1Class,
- DAGOperand Op2Class,
- string Constraint> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$merge,
- Op1Class:$rs2, Op2Class:$rs1,
- VMaskOp:$vm, AVL:$vl, ixlenimm:$sew), []>,
- RISCVVPseudo {
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
- let HasVLOp = 1;
- let HasSEWOp = 1;
-}
-
class VPseudoBinaryMaskPolicy<VReg RetClass,
RegisterClass Op1Class,
DAGOperand Op2Class,
@@ -1449,7 +1411,7 @@ class VPseudoTernaryMaskPolicyRoundingMode<VReg RetClass,
let UsesVXRM = 0;
}
-// Like VPseudoBinaryMask, but output can be V0.
+// Like VPseudoBinaryMaskPolicy, but output can be V0 and there is no policy.
class VPseudoBinaryMOutMask<VReg RetClass,
RegisterClass Op1Class,
DAGOperand Op2Class,
@@ -1470,8 +1432,8 @@ class VPseudoBinaryMOutMask<VReg RetClass,
let UsesMaskPolicy = 1;
}
-// Special version of VPseudoBinaryMask where we pretend the first source is
-// tied to the destination so we can workaround the earlyclobber constraint.
+// Special version of VPseudoBinaryMaskPolicy where we pretend the first source
+// is tied to the destination so we can workaround the earlyclobber constraint.
// This allows maskedoff and rs2 to be the same register.
class VPseudoTiedBinaryMask<VReg RetClass,
DAGOperand Op2Class,
@@ -4214,16 +4176,16 @@ class VPatBinaryMaskPolicy<string intrinsic_name,
(op2_type op2_kind:$rs2),
(mask_type V0), GPR:$vl, sew, (XLenVT timm:$policy))>;
-class VPatBinaryMaskTARoundingMode<string intrinsic_name,
- string inst,
- ValueType result_type,
- ValueType op1_type,
- ValueType op2_type,
- ValueType mask_type,
- int sew,
- VReg result_reg_class,
- VReg op1_reg_class,
- DAGOperand op2_kind> :
+class VPatBinaryMaskPolicyRoundingMode<string intrinsic_name,
+ string inst,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType op2_type,
+ ValueType mask_type,
+ int sew,
+ VReg result_reg_class,
+ VReg op1_reg_class,
+ DAGOperand op2_kind> :
Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
(result_type result_reg_class:$merge),
(op1_type op1_reg_class:$rs1),
@@ -4375,28 +4337,6 @@ class VPatTiedBinaryMaskRoundingMode<string intrinsic_name,
(XLenVT timm:$round),
GPR:$vl, sew, (XLenVT timm:$policy))>;
-class VPatTernaryNoMask<string intrinsic,
- string inst,
- string kind,
- ValueType result_type,
- ValueType op1_type,
- ValueType op2_type,
- int sew,
- LMULInfo vlmul,
- VReg result_reg_class,
- RegisterClass op1_reg_class,
- DAGOperand op2_kind> :
- Pat<(result_type (!cast<Intrinsic>(intrinsic)
- (result_type result_reg_class:$rs3),
- (op1_type op1_reg_class:$rs1),
- (op2_type op2_kind:$rs2),
- VLOpFrag)),
- (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
- result_reg_class:$rs3,
- (op1_type op1_reg_class:$rs1),
- op2_kind:$rs2,
- GPR:$vl, sew)>;
-
class VPatTernaryNoMaskTU<string intrinsic,
string inst,
string kind,
@@ -4492,31 +4432,6 @@ class VPatTernaryNoMaskWithPolicyRoundingMode<string intrinsic,
(XLenVT timm:$round),
GPR:$vl, log2sew, (XLenVT timm:$policy))>;
-class VPatTernaryMask<string intrinsic,
- string inst,
- string kind,
- ValueType result_type,
- ValueType op1_type,
- ValueType op2_type,
- ValueType mask_type,
- int sew,
- LMULInfo vlmul,
- VReg result_reg_class,
- RegisterClass op1_reg_class,
- DAGOperand op2_kind> :
- Pat<(result_type (!cast<Intrinsic>(intrinsic#"_mask")
- (result_type result_reg_class:$rs3),
- (op1_type op1_reg_class:$rs1),
- (op2_type op2_kind:$rs2),
- (mask_type V0),
- VLOpFrag)),
- (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX # "_MASK")
- result_reg_class:$rs3,
- (op1_type op1_reg_class:$rs1),
- op2_kind:$rs2,
- (mask_type V0),
- GPR:$vl, sew)>;
-
class VPatTernaryMaskPolicy<string intrinsic,
string inst,
string kind,
@@ -4784,9 +4699,9 @@ multiclass VPatBinaryRoundingMode<string intrinsic,
DAGOperand op2_kind> {
def : VPatBinaryNoMaskTURoundingMode<intrinsic, inst, result_type, op1_type, op2_type,
sew, result_reg_class, op1_reg_class, op2_kind>;
- def : VPatBinaryMaskTARoundingMode<intrinsic, inst, result_type, op1_type, op2_type,
- mask_type, sew, result_reg_class, op1_reg_class,
- op2_kind>;
+ def : VPatBinaryMaskPolicyRoundingMode<intrinsic, inst, result_type, op1_type, op2_type,
+ mask_type, sew, result_reg_class, op1_reg_class,
+ op2_kind>;
}
multiclass VPatBinaryMSwapped<string intrinsic,
@@ -5158,10 +5073,10 @@ multiclass VPatBinaryW_WV_RM<string intrinsic, string instruction,
Wti.Vector, Vti.Vector, Vti.Mask,
Vti.Log2SEW, Wti.RegClass, Vti.RegClass>;
}
- def : VPatBinaryMaskTARoundingMode<intrinsic, name,
- Wti.Vector, Wti.Vector, Vti.Vector, Vti.Mask,
- Vti.Log2SEW, Wti.RegClass,
- Wti.RegClass, Vti.RegClass>;
+ def : VPatBinaryMaskPolicyRoundingMode<intrinsic, name,
+ Wti.Vector, Wti.Vector, Vti.Vector, Vti.Mask,
+ Vti.Log2SEW, Wti.RegClass,
+ Wti.RegClass, Vti.RegClass>;
}
}
}
@@ -5513,46 +5428,6 @@ multiclass VPatBinaryM_V_X<string intrinsic, string instruction>
: VPatBinaryV_V<intrinsic, instruction>,
VPatBinaryV_X<intrinsic, instruction>;
-multiclass VPatTernary<string intrinsic,
- string inst,
- string kind,
- ValueType result_type,
- ValueType op1_type,
- ValueType op2_type,
- ValueType mask_type,
- int sew,
- LMULInfo vlmul,
- VReg result_reg_class,
- RegisterClass op1_reg_class,
- DAGOperand op2_kind> {
- def : VPatTernaryNoMask<intrinsic, inst, kind, result_type, op1_type, op2_type,
- sew, vlmul, result_reg_class, op1_reg_class,
- op2_kind>;
- def : VPatTernaryMask<intrinsic, inst, kind, result_type, op1_type, op2_type,
- mask_type, sew, vlmul, result_reg_class, op1_reg_class,
- op2_kind>;
-}
-
-multiclass VPatTernaryNoMaskNoPolicy<string intrinsic,
- string inst,
- string kind,
- ValueType result_type,
- ValueType op1_type,
- ValueType op2_type,
- ValueType mask_type,
- int sew,
- LMULInfo vlmul,
- VReg result_reg_class,
- RegisterClass op1_reg_class,
- DAGOperand op2_kind> {
- def : VPatTernaryNoMask<intrinsic, inst, kind, result_type, op1_type, op2_type,
- sew, vlmul, result_reg_class, op1_reg_class,
- op2_kind>;
- def : VPatTernaryMaskPolicy<intrinsic, inst, kind, result_type, op1_type, op2_type,
- mask_type, sew, vlmul, result_reg_class, op1_reg_class,
- op2_kind>;
-}
-
multiclass VPatTernaryWithPolicy<string intrinsic,
string inst,
string kind,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index 497c4aadf753..e82625f085be 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -1405,16 +1405,23 @@ foreach fvti = !listconcat(AllFloatVectors, AllBFloatVectors) in {
fvti.AVL, fvti.Log2SEW)>;
def : Pat<(fvti.Vector (vselect (fvti.Mask V0),
+ (SplatFPOp (SelectFPImm (XLenVT GPR:$imm))),
+ fvti.RegClass:$rs2)),
+ (!cast<Instruction>("PseudoVMERGE_VXM_"#fvti.LMul.MX)
+ (fvti.Vector (IMPLICIT_DEF)),
+ fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask V0), fvti.AVL, fvti.Log2SEW)>;
+
+ def : Pat<(fvti.Vector (vselect (fvti.Mask V0),
(SplatFPOp (fvti.Scalar fpimm0)),
fvti.RegClass:$rs2)),
(!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX)
(fvti.Vector (IMPLICIT_DEF)),
fvti.RegClass:$rs2, 0, (fvti.Mask V0), fvti.AVL, fvti.Log2SEW)>;
-
}
+}
- let Predicates = !listconcat(GetVTypePredicates<GetFpVTypeInfo<fvti>.Vti>.Predicates,
- GetVTypeScalarPredicates<fvti>.Predicates) in
+foreach fvti = AllFloatVectors in {
+ let Predicates = GetVTypePredicates<fvti>.Predicates in
def : Pat<(fvti.Vector (vselect (fvti.Mask V0),
(SplatFPOp fvti.ScalarRegClass:$rs1),
fvti.RegClass:$rs2)),
@@ -1473,26 +1480,6 @@ foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in {
}
//===----------------------------------------------------------------------===//
-// Vector Splats
-//===----------------------------------------------------------------------===//
-
-foreach fvti = !listconcat(AllFloatVectors, AllBFloatVectors) in {
- let Predicates = !listconcat(GetVTypePredicates<GetFpVTypeInfo<fvti>.Vti>.Predicates,
- GetVTypeScalarPredicates<fvti>.Predicates) in
- def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl undef, fvti.ScalarRegClass:$rs1, srcvalue)),
- (!cast<Instruction>("PseudoVFMV_V_"#fvti.ScalarSuffix#"_"#fvti.LMul.MX)
- (fvti.Vector (IMPLICIT_DEF)),
- (fvti.Scalar fvti.ScalarRegClass:$rs1),
- fvti.AVL, fvti.Log2SEW, TA_MA)>;
- defvar ivti = GetIntVTypeInfo<fvti>.Vti;
- let Predicates = GetVTypePredicates<ivti>.Predicates in
- def : Pat<(fvti.Vector (SplatFPOp (fvti.Scalar fpimm0))),
- (!cast<Instruction>("PseudoVMV_V_I_"#fvti.LMul.MX)
- (fvti.Vector (IMPLICIT_DEF)),
- 0, fvti.AVL, fvti.Log2SEW, TA_MA)>;
-}
-
-//===----------------------------------------------------------------------===//
// Vector Element Extracts
//===----------------------------------------------------------------------===//
foreach vti = AllFloatVectors in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index 70d8265e7be4..a7945f2ee6c1 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -2638,9 +2638,10 @@ foreach fvti = !listconcat(AllFloatVectors, AllBFloatVectors) in {
fvti.RegClass:$merge, fvti.RegClass:$rs2, 0, (fvti.Mask V0),
GPR:$vl, fvti.Log2SEW)>;
}
+}
- let Predicates = !listconcat(GetVTypePredicates<GetFpVTypeInfo<fvti>.Vti>.Predicates,
- GetVTypeScalarPredicates<fvti>.Predicates) in {
+foreach fvti = AllFloatVectors in {
+ let Predicates = GetVTypePredicates<fvti>.Predicates in {
def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),
(SplatFPOp fvti.ScalarRegClass:$rs1),
fvti.RegClass:$rs2,
@@ -2654,8 +2655,8 @@ foreach fvti = !listconcat(AllFloatVectors, AllBFloatVectors) in {
}
foreach fvti = !listconcat(AllFloatVectors, AllBFloatVectors) in {
- let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates,
- GetVTypeScalarPredicates<fvti>.Predicates) in {
+ defvar ivti = GetIntVTypeInfo<fvti>.Vti;
+ let Predicates = GetVTypePredicates<ivti>.Predicates in {
// 13.16. Vector Floating-Point Move Instruction
// If we're splatting fpimm0, use vmv.v.x vd, x0.
def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl
@@ -2666,7 +2667,11 @@ foreach fvti = !listconcat(AllFloatVectors, AllBFloatVectors) in {
fvti.Vector:$passthru, (fvti.Scalar (SelectFPImm (XLenVT GPR:$imm))), VLOpFrag)),
(!cast<Instruction>("PseudoVMV_V_X_"#fvti.LMul.MX)
$passthru, GPR:$imm, GPR:$vl, fvti.Log2SEW, TU_MU)>;
+ }
+}
+foreach fvti = AllFloatVectors in {
+ let Predicates = GetVTypePredicates<fvti>.Predicates in {
def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl
fvti.Vector:$passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)),
(!cast<Instruction>("PseudoVFMV_V_" # fvti.ScalarSuffix # "_" #
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td
index 0cd41cac218f..1ee78359bc4a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td
@@ -116,8 +116,8 @@ multiclass AMOCASPat<string AtomicOp, string BaseInst, ValueType vt = XLenVT,
} // Predicates = !listconcat([HasStdExtZacas, HasStdExtZtso], ExtraPreds)
}
-defm : AMOCASPat<"atomic_cmp_swap_32", "AMOCAS_W">;
-defm : AMOCASPat<"atomic_cmp_swap_64", "AMOCAS_D_RV64", i64, [IsRV64]>;
+defm : AMOCASPat<"atomic_cmp_swap_i32", "AMOCAS_W">;
+defm : AMOCASPat<"atomic_cmp_swap_i64", "AMOCAS_D_RV64", i64, [IsRV64]>;
//===----------------------------------------------------------------------===//
// Zawrs (Wait-on-Reservation-Set)
@@ -188,27 +188,27 @@ defm AMOCAS_H : AMO_cas_aq_rl<0b00101, 0b001, "amocas.h", GPR>;
/// AMOs
-defm : AMOPat<"atomic_swap_8", "AMOSWAP_B", XLenVT, [HasStdExtZabha]>;
-defm : AMOPat<"atomic_load_add_8", "AMOADD_B", XLenVT, [HasStdExtZabha]>;
-defm : AMOPat<"atomic_load_and_8", "AMOAND_B", XLenVT, [HasStdExtZabha]>;
-defm : AMOPat<"atomic_load_or_8", "AMOOR_B", XLenVT, [HasStdExtZabha]>;
-defm : AMOPat<"atomic_load_xor_8", "AMOXOR_B", XLenVT, [HasStdExtZabha]>;
-defm : AMOPat<"atomic_load_max_8", "AMOMAX_B", XLenVT, [HasStdExtZabha]>;
-defm : AMOPat<"atomic_load_min_8", "AMOMIN_B", XLenVT, [HasStdExtZabha]>;
-defm : AMOPat<"atomic_load_umax_8", "AMOMAXU_B", XLenVT, [HasStdExtZabha]>;
-defm : AMOPat<"atomic_load_umin_8", "AMOMINU_B", XLenVT, [HasStdExtZabha]>;
-
-defm : AMOPat<"atomic_swap_16", "AMOSWAP_H", XLenVT, [HasStdExtZabha]>;
-defm : AMOPat<"atomic_load_add_16", "AMOADD_H", XLenVT, [HasStdExtZabha]>;
-defm : AMOPat<"atomic_load_and_16", "AMOAND_H", XLenVT, [HasStdExtZabha]>;
-defm : AMOPat<"atomic_load_or_16", "AMOOR_H", XLenVT, [HasStdExtZabha]>;
-defm : AMOPat<"atomic_load_xor_16", "AMOXOR_H", XLenVT, [HasStdExtZabha]>;
-defm : AMOPat<"atomic_load_max_16", "AMOMAX_H", XLenVT, [HasStdExtZabha]>;
-defm : AMOPat<"atomic_load_min_16", "AMOMIN_H", XLenVT, [HasStdExtZabha]>;
-defm : AMOPat<"atomic_load_umax_16", "AMOMAXU_H", XLenVT, [HasStdExtZabha]>;
-defm : AMOPat<"atomic_load_umin_16", "AMOMINU_H", XLenVT, [HasStdExtZabha]>;
+defm : AMOPat<"atomic_swap_i8", "AMOSWAP_B", XLenVT, [HasStdExtZabha]>;
+defm : AMOPat<"atomic_load_add_i8", "AMOADD_B", XLenVT, [HasStdExtZabha]>;
+defm : AMOPat<"atomic_load_and_i8", "AMOAND_B", XLenVT, [HasStdExtZabha]>;
+defm : AMOPat<"atomic_load_or_i8", "AMOOR_B", XLenVT, [HasStdExtZabha]>;
+defm : AMOPat<"atomic_load_xor_i8", "AMOXOR_B", XLenVT, [HasStdExtZabha]>;
+defm : AMOPat<"atomic_load_max_i8", "AMOMAX_B", XLenVT, [HasStdExtZabha]>;
+defm : AMOPat<"atomic_load_min_i8", "AMOMIN_B", XLenVT, [HasStdExtZabha]>;
+defm : AMOPat<"atomic_load_umax_i8", "AMOMAXU_B", XLenVT, [HasStdExtZabha]>;
+defm : AMOPat<"atomic_load_umin_i8", "AMOMINU_B", XLenVT, [HasStdExtZabha]>;
+
+defm : AMOPat<"atomic_swap_i16", "AMOSWAP_H", XLenVT, [HasStdExtZabha]>;
+defm : AMOPat<"atomic_load_add_i16", "AMOADD_H", XLenVT, [HasStdExtZabha]>;
+defm : AMOPat<"atomic_load_and_i16", "AMOAND_H", XLenVT, [HasStdExtZabha]>;
+defm : AMOPat<"atomic_load_or_i16", "AMOOR_H", XLenVT, [HasStdExtZabha]>;
+defm : AMOPat<"atomic_load_xor_i16", "AMOXOR_H", XLenVT, [HasStdExtZabha]>;
+defm : AMOPat<"atomic_load_max_i16", "AMOMAX_H", XLenVT, [HasStdExtZabha]>;
+defm : AMOPat<"atomic_load_min_i16", "AMOMIN_H", XLenVT, [HasStdExtZabha]>;
+defm : AMOPat<"atomic_load_umax_i16", "AMOMAXU_H", XLenVT, [HasStdExtZabha]>;
+defm : AMOPat<"atomic_load_umin_i16", "AMOMINU_H", XLenVT, [HasStdExtZabha]>;
/// AMOCAS
-defm : AMOCASPat<"atomic_cmp_swap_8", "AMOCAS_B", XLenVT, [HasStdExtZabha]>;
-defm : AMOCASPat<"atomic_cmp_swap_16", "AMOCAS_H", XLenVT, [HasStdExtZabha]>;
+defm : AMOCASPat<"atomic_cmp_swap_i8", "AMOCAS_B", XLenVT, [HasStdExtZabha]>;
+defm : AMOCASPat<"atomic_cmp_swap_i16", "AMOCAS_H", XLenVT, [HasStdExtZabha]>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
index 2a4448d7881f..11c2695a5985 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
@@ -183,7 +183,7 @@ let Predicates = [HasStdExtZcb] in
def C_ZEXT_B : RVZcArith_r<0b11000 , "c.zext.b">,
Sched<[WriteIALU, ReadIALU]>;
-let Predicates = [HasStdExtZcb, HasStdExtMOrZmmul] in
+let Predicates = [HasStdExtZcb, HasStdExtZmmul] in
def C_MUL : CA_ALU<0b100111, 0b10, "c.mul", GPRC>,
Sched<[WriteIMul, ReadIMul, ReadIMul]>;
@@ -270,13 +270,13 @@ def CM_JALT : RVInst16CJ<0b101, 0b10, (outs), (ins uimm8ge32:$index),
} // DecoderNamespace = "RVZcmt", Predicates = [HasStdExtZcmt]...
-let Predicates = [HasStdExtZcb, HasStdExtMOrZmmul] in{
+let Predicates = [HasStdExtZcb, HasStdExtZmmul] in{
def : CompressPat<(MUL GPRC:$rs1, GPRC:$rs1, GPRC:$rs2),
(C_MUL GPRC:$rs1, GPRC:$rs2)>;
let isCompressOnly = true in
def : CompressPat<(MUL GPRC:$rs1, GPRC:$rs2, GPRC:$rs1),
(C_MUL GPRC:$rs1, GPRC:$rs2)>;
-} // Predicates = [HasStdExtZcb, HasStdExtMOrZmmul]
+} // Predicates = [HasStdExtZcb, HasStdExtZmmul]
let Predicates = [HasStdExtZcb, HasStdExtZbb] in{
def : CompressPat<(SEXT_B GPRC:$rs1, GPRC:$rs1),
diff --git a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
index fcc20c17c6b4..779c652b4d8f 100644
--- a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
@@ -135,7 +135,7 @@ public:
bool isPushable(const MachineFunction &MF) const {
// We cannot use fixed locations for the callee saved spill slots if the
// function uses a varargs save area.
- // TODO: Use a seperate placement for vararg registers to enable Zcmp.
+ // TODO: Use a separate placement for vararg registers to enable Zcmp.
return MF.getSubtarget<RISCVSubtarget>().hasStdExtZcmp() &&
!MF.getTarget().Options.DisableFramePointerElim(MF) &&
VarArgsSaveSize == 0;
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 6ebf9f1eb045..13a2491116b5 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -326,6 +326,27 @@ def SYNTACORE_SCR1_MAX : RISCVProcessorModel<"syntacore-scr1-max",
FeatureStdExtC],
[TuneNoDefaultUnroll]>;
+def SYNTACORE_SCR3_RV32 : RISCVProcessorModel<"syntacore-scr3-rv32",
+ SyntacoreSCR3RV32Model,
+ [Feature32Bit,
+ FeatureStdExtI,
+ FeatureStdExtZicsr,
+ FeatureStdExtZifencei,
+ FeatureStdExtM,
+ FeatureStdExtC],
+ [TuneNoDefaultUnroll, FeaturePostRAScheduler]>;
+
+def SYNTACORE_SCR3_RV64 : RISCVProcessorModel<"syntacore-scr3-rv64",
+ SyntacoreSCR3RV64Model,
+ [Feature64Bit,
+ FeatureStdExtI,
+ FeatureStdExtZicsr,
+ FeatureStdExtZifencei,
+ FeatureStdExtM,
+ FeatureStdExtA,
+ FeatureStdExtC],
+ [TuneNoDefaultUnroll, FeaturePostRAScheduler]>;
+
def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1",
NoSchedModel,
[Feature64Bit,
@@ -381,3 +402,19 @@ def XIANGSHAN_NANHU : RISCVProcessorModel<"xiangshan-nanhu",
TuneZExtHFusion,
TuneZExtWFusion,
TuneShiftedZExtWFusion]>;
+
+def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60",
+ NoSchedModel,
+ !listconcat(RVA22S64Features,
+ [FeatureStdExtV,
+ FeatureStdExtSscofpmf,
+ FeatureStdExtSstc,
+ FeatureStdExtSvnapot,
+ FeatureStdExtZbc,
+ FeatureStdExtZbkc,
+ FeatureStdExtZfh,
+ FeatureStdExtZicond,
+ FeatureStdExtZvfh,
+ FeatureStdExtZvkt,
+ FeatureStdExtZvl256b]),
+ [TuneDLenFactor2]>;
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index caa5dbc15f8b..760d12103c36 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -104,14 +104,17 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
auto &Subtarget = MF.getSubtarget<RISCVSubtarget>();
- // Mark any registers requested to be reserved as such
for (size_t Reg = 0; Reg < getNumRegs(); Reg++) {
+ // Mark any GPRs requested to be reserved as such
if (Subtarget.isRegisterReservedByUser(Reg))
markSuperRegs(Reserved, Reg);
+
+ // Mark all the registers defined as constant in TableGen as reserved.
+ if (isConstantPhysReg(Reg))
+ markSuperRegs(Reserved, Reg);
}
// Use markSuperRegs to ensure any register aliases are also reserved
- markSuperRegs(Reserved, RISCV::X0); // zero
markSuperRegs(Reserved, RISCV::X2); // sp
markSuperRegs(Reserved, RISCV::X3); // gp
markSuperRegs(Reserved, RISCV::X4); // tp
@@ -136,7 +139,6 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
markSuperRegs(Reserved, RISCV::VTYPE);
markSuperRegs(Reserved, RISCV::VXSAT);
markSuperRegs(Reserved, RISCV::VXRM);
- markSuperRegs(Reserved, RISCV::VLENB); // vlenb (constant)
// Floating point environment registers.
markSuperRegs(Reserved, RISCV::FRM);
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td
index 31112d140cde..dc20fdcea4d7 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td
@@ -76,58 +76,6 @@ def : WriteRes<WriteLDW, [SCR1_LSU]>;
def : WriteRes<WriteLDD, [SCR1_LSU]>;
}
-let Unsupported = true in {
-// Atomic memory
-def : WriteRes<WriteAtomicW, [SCR1_LSU]>;
-def : WriteRes<WriteAtomicD, [SCR1_LSU]>;
-def : WriteRes<WriteAtomicLDW, [SCR1_LSU]>;
-def : WriteRes<WriteAtomicLDD, [SCR1_LSU]>;
-def : WriteRes<WriteAtomicSTW, [SCR1_LSU]>;
-def : WriteRes<WriteAtomicSTD, [SCR1_LSU]>;
-
-// FP load/store
-def : WriteRes<WriteFST32, [SCR1_LSU]>;
-def : WriteRes<WriteFST64, [SCR1_LSU]>;
-def : WriteRes<WriteFLD32, [SCR1_LSU]>;
-def : WriteRes<WriteFLD64, [SCR1_LSU]>;
-
-// FP instructions
-def : WriteRes<WriteFAdd32, []>;
-def : WriteRes<WriteFSGNJ32, []>;
-def : WriteRes<WriteFMinMax32, []>;
-def : WriteRes<WriteFAdd64, []>;
-def : WriteRes<WriteFSGNJ64, []>;
-def : WriteRes<WriteFMinMax64, []>;
-def : WriteRes<WriteFCvtI32ToF32, []>;
-def : WriteRes<WriteFCvtI32ToF64, []>;
-def : WriteRes<WriteFCvtI64ToF32, []>;
-def : WriteRes<WriteFCvtI64ToF64, []>;
-def : WriteRes<WriteFCvtF32ToI32, []>;
-def : WriteRes<WriteFCvtF32ToI64, []>;
-def : WriteRes<WriteFCvtF64ToI32, []>;
-def : WriteRes<WriteFCvtF64ToI64, []>;
-def : WriteRes<WriteFCvtF32ToF64, []>;
-def : WriteRes<WriteFCvtF64ToF32, []>;
-def : WriteRes<WriteFClass32, []>;
-def : WriteRes<WriteFClass64, []>;
-def : WriteRes<WriteFCmp32, []>;
-def : WriteRes<WriteFCmp64, []>;
-def : WriteRes<WriteFMovF32ToI32, []>;
-def : WriteRes<WriteFMovI32ToF32, []>;
-def : WriteRes<WriteFMovF64ToI64, []>;
-def : WriteRes<WriteFMovI64ToF64, []>;
-def : WriteRes<WriteFMul32, []>;
-def : WriteRes<WriteFMA32, []>;
-def : WriteRes<WriteFMul64, []>;
-def : WriteRes<WriteFMA64, []>;
-def : WriteRes<WriteFDiv32, []>;
-def : WriteRes<WriteFDiv64, []>;
-def : WriteRes<WriteFSqrt32, []>;
-def : WriteRes<WriteFSqrt64, []>;
-
-def : WriteRes<WriteSFB, []>;
-}
-
// Others
def : WriteRes<WriteCSR, []>;
def : WriteRes<WriteNop, []>;
@@ -153,55 +101,13 @@ def : ReadAdvance<ReadIRem, 0>;
def : ReadAdvance<ReadIRem32, 0>;
def : ReadAdvance<ReadIMul, 0>;
def : ReadAdvance<ReadIMul32, 0>;
-def : ReadAdvance<ReadAtomicWA, 0>;
-def : ReadAdvance<ReadAtomicWD, 0>;
-def : ReadAdvance<ReadAtomicDA, 0>;
-def : ReadAdvance<ReadAtomicDD, 0>;
-def : ReadAdvance<ReadAtomicLDW, 0>;
-def : ReadAdvance<ReadAtomicLDD, 0>;
-def : ReadAdvance<ReadAtomicSTW, 0>;
-def : ReadAdvance<ReadAtomicSTD, 0>;
-def : ReadAdvance<ReadFStoreData, 0>;
-def : ReadAdvance<ReadFMemBase, 0>;
-def : ReadAdvance<ReadFAdd32, 0>;
-def : ReadAdvance<ReadFAdd64, 0>;
-def : ReadAdvance<ReadFMul32, 0>;
-def : ReadAdvance<ReadFMul64, 0>;
-def : ReadAdvance<ReadFMA32, 0>;
-def : ReadAdvance<ReadFMA32Addend, 0>;
-def : ReadAdvance<ReadFMA64, 0>;
-def : ReadAdvance<ReadFMA64Addend, 0>;
-def : ReadAdvance<ReadFDiv32, 0>;
-def : ReadAdvance<ReadFDiv64, 0>;
-def : ReadAdvance<ReadFSqrt32, 0>;
-def : ReadAdvance<ReadFSqrt64, 0>;
-def : ReadAdvance<ReadFCmp32, 0>;
-def : ReadAdvance<ReadFCmp64, 0>;
-def : ReadAdvance<ReadFSGNJ32, 0>;
-def : ReadAdvance<ReadFSGNJ64, 0>;
-def : ReadAdvance<ReadFMinMax32, 0>;
-def : ReadAdvance<ReadFMinMax64, 0>;
-def : ReadAdvance<ReadFCvtF32ToI32, 0>;
-def : ReadAdvance<ReadFCvtF32ToI64, 0>;
-def : ReadAdvance<ReadFCvtF64ToI32, 0>;
-def : ReadAdvance<ReadFCvtF64ToI64, 0>;
-def : ReadAdvance<ReadFCvtI32ToF32, 0>;
-def : ReadAdvance<ReadFCvtI32ToF64, 0>;
-def : ReadAdvance<ReadFCvtI64ToF32, 0>;
-def : ReadAdvance<ReadFCvtI64ToF64, 0>;
-def : ReadAdvance<ReadFCvtF32ToF64, 0>;
-def : ReadAdvance<ReadFCvtF64ToF32, 0>;
-def : ReadAdvance<ReadFMovF32ToI32, 0>;
-def : ReadAdvance<ReadFMovI32ToF32, 0>;
-def : ReadAdvance<ReadFMovF64ToI64, 0>;
-def : ReadAdvance<ReadFMovI64ToF64, 0>;
-def : ReadAdvance<ReadFClass32, 0>;
-def : ReadAdvance<ReadFClass64, 0>;
-def : ReadAdvance<ReadSFBJmp, 0>;
-def : ReadAdvance<ReadSFBALU, 0>;
//===----------------------------------------------------------------------===//
// Unsupported extensions
+defm : UnsupportedSchedA;
+defm : UnsupportedSchedD;
+defm : UnsupportedSchedF;
+defm : UnsupportedSchedSFB;
defm : UnsupportedSchedV;
defm : UnsupportedSchedZabha;
defm : UnsupportedSchedZba;
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR3.td b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR3.td
new file mode 100644
index 000000000000..607637bc0de5
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR3.td
@@ -0,0 +1,189 @@
+//==- RISCVSchedSyntacoreSCR3.td - Syntacore SCR3 Scheduling Definitions -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+
+// This model covers SYNTACORE_SCR3_RV32IMC and SYNTACORE_RV64IMAC
+// configurations (syntacore-scr3-rv32/64).
+// Overview: https://syntacore.com/products/scr3
+
+// SCR3 is single-issue in-order processor
+class SyntacoreSCR3Model : SchedMachineModel {
+ let MicroOpBufferSize = 0;
+ let IssueWidth = 1;
+ let LoadLatency = 2;
+ let MispredictPenalty = 3;
+ let CompleteModel = 0;
+ let UnsupportedFeatures = [HasStdExtD, HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx,
+ HasStdExtZknd, HasStdExtZkne, HasStdExtZknh,
+ HasStdExtZksed, HasStdExtZksh, HasStdExtZkr,
+ HasVInstructions];
+}
+
+// Branching
+multiclass SCR3_Branching<ProcResourceKind BRU> {
+ def : WriteRes<WriteJmp, [BRU]>;
+ def : WriteRes<WriteJal, [BRU]>;
+ def : WriteRes<WriteJalr, [BRU]>;
+}
+
+// Single-cycle integer arithmetic and logic
+multiclass SCR3_IntALU<ProcResourceKind ALU> {
+ def : WriteRes<WriteIALU, [ALU]>;
+ def : WriteRes<WriteIALU32, [ALU]>;
+ def : WriteRes<WriteShiftImm, [ALU]>;
+ def : WriteRes<WriteShiftImm32, [ALU]>;
+ def : WriteRes<WriteShiftReg, [ALU]>;
+ def : WriteRes<WriteShiftReg32, [ALU]>;
+}
+
+// Integer multiplication
+multiclass SCR3_IntMul<ProcResourceKind MUL> {
+ let Latency = 2 in {
+ def : WriteRes<WriteIMul, [MUL]>;
+ def : WriteRes<WriteIMul32, [MUL]>;
+ }
+}
+
+// Integer division
+multiclass SCR3_IntDiv<ProcResourceKind DIV, int DivLatency> {
+ let Latency = DivLatency, ReleaseAtCycles = [DivLatency] in {
+ def : WriteRes<WriteIDiv, [DIV]>;
+ def : WriteRes<WriteIDiv32, [DIV]>;
+ def : WriteRes<WriteIRem, [DIV]>;
+ def : WriteRes<WriteIRem32, [DIV]>;
+ }
+}
+
+// Load/store instructions on SCR3 have latency 2
+multiclass SCR3_Memory<ProcResourceKind LSU> {
+ let Latency = 2 in {
+ def : WriteRes<WriteSTB, [LSU]>;
+ def : WriteRes<WriteSTH, [LSU]>;
+ def : WriteRes<WriteSTW, [LSU]>;
+ def : WriteRes<WriteSTD, [LSU]>;
+ def : WriteRes<WriteLDB, [LSU]>;
+ def : WriteRes<WriteLDH, [LSU]>;
+ def : WriteRes<WriteLDW, [LSU]>;
+ def : WriteRes<WriteLDD, [LSU]>;
+ }
+}
+
+// Atomic memory
+multiclass SCR3_AtomicMemory<ProcResourceKind LSU> {
+ let Latency = 20 in {
+ def : WriteRes<WriteAtomicLDW, [LSU]>;
+ def : WriteRes<WriteAtomicLDD, [LSU]>;
+ def : WriteRes<WriteAtomicW, [LSU]>;
+ def : WriteRes<WriteAtomicD, [LSU]>;
+ def : WriteRes<WriteAtomicSTW, [LSU]>;
+ def : WriteRes<WriteAtomicSTD, [LSU]>;
+ }
+}
+
+// Others
+multiclass SCR3_Other {
+ def : WriteRes<WriteCSR, []>;
+ def : WriteRes<WriteNop, []>;
+
+ def : InstRW<[WriteIALU], (instrs COPY)>;
+}
+
+
+multiclass SCR3_Unsupported {
+ defm : UnsupportedSchedD;
+ defm : UnsupportedSchedF;
+ defm : UnsupportedSchedSFB;
+ defm : UnsupportedSchedV;
+ defm : UnsupportedSchedXsfvcp;
+ defm : UnsupportedSchedZabha;
+ defm : UnsupportedSchedZba;
+ defm : UnsupportedSchedZbb;
+ defm : UnsupportedSchedZbc;
+ defm : UnsupportedSchedZbs;
+ defm : UnsupportedSchedZbkb;
+ defm : UnsupportedSchedZbkx;
+ defm : UnsupportedSchedZfa;
+ defm : UnsupportedSchedZfh;
+ defm : UnsupportedSchedZvk;
+}
+
+// Bypasses (none)
+multiclass SCR3_NoReadAdvances {
+ def : ReadAdvance<ReadJmp, 0>;
+ def : ReadAdvance<ReadJalr, 0>;
+ def : ReadAdvance<ReadCSR, 0>;
+ def : ReadAdvance<ReadStoreData, 0>;
+ def : ReadAdvance<ReadMemBase, 0>;
+ def : ReadAdvance<ReadIALU, 0>;
+ def : ReadAdvance<ReadIALU32, 0>;
+ def : ReadAdvance<ReadShiftImm, 0>;
+ def : ReadAdvance<ReadShiftImm32, 0>;
+ def : ReadAdvance<ReadShiftReg, 0>;
+ def : ReadAdvance<ReadShiftReg32, 0>;
+ def : ReadAdvance<ReadIDiv, 0>;
+ def : ReadAdvance<ReadIDiv32, 0>;
+ def : ReadAdvance<ReadIRem, 0>;
+ def : ReadAdvance<ReadIRem32, 0>;
+ def : ReadAdvance<ReadIMul, 0>;
+ def : ReadAdvance<ReadIMul32, 0>;
+ def : ReadAdvance<ReadAtomicWA, 0>;
+ def : ReadAdvance<ReadAtomicWD, 0>;
+ def : ReadAdvance<ReadAtomicDA, 0>;
+ def : ReadAdvance<ReadAtomicDD, 0>;
+ def : ReadAdvance<ReadAtomicLDW, 0>;
+ def : ReadAdvance<ReadAtomicLDD, 0>;
+ def : ReadAdvance<ReadAtomicSTW, 0>;
+ def : ReadAdvance<ReadAtomicSTD, 0>;
+}
+
+def SyntacoreSCR3RV32Model : SyntacoreSCR3Model;
+
+let SchedModel = SyntacoreSCR3RV32Model in {
+ let BufferSize = 0 in {
+ def SCR3RV32_ALU : ProcResource<1>;
+ def SCR3RV32_MUL : ProcResource<1>;
+ def SCR3RV32_DIV : ProcResource<1>;
+ def SCR3RV32_LSU : ProcResource<1>;
+ def SCR3RV32_CFU : ProcResource<1>;
+ }
+
+ defm : SCR3_Branching<SCR3RV32_CFU>;
+ defm : SCR3_IntALU<SCR3RV32_ALU>;
+ defm : SCR3_IntMul<SCR3RV32_MUL>;
+ defm : SCR3_IntDiv<SCR3RV32_DIV, /* div latency = */ 8>;
+ defm : SCR3_Memory<SCR3RV32_LSU>;
+ defm : SCR3_AtomicMemory<SCR3RV32_LSU>;
+ defm : SCR3_Other;
+
+ defm : SCR3_Unsupported;
+ defm : SCR3_NoReadAdvances;
+}
+
+def SyntacoreSCR3RV64Model : SyntacoreSCR3Model;
+
+let SchedModel = SyntacoreSCR3RV64Model in {
+ let BufferSize = 0 in {
+ def SCR3RV64_ALU : ProcResource<1>;
+ def SCR3RV64_MUL : ProcResource<1>;
+ def SCR3RV64_DIV : ProcResource<1>;
+ def SCR3RV64_LSU : ProcResource<1>;
+ def SCR3RV64_CFU : ProcResource<1>;
+ }
+
+ defm : SCR3_Branching<SCR3RV64_CFU>;
+ defm : SCR3_IntALU<SCR3RV64_ALU>;
+ defm : SCR3_IntMul<SCR3RV64_MUL>;
+ defm : SCR3_IntDiv<SCR3RV64_DIV, /* div latency = */ 11>;
+ defm : SCR3_Memory<SCR3RV64_LSU>;
+ defm : SCR3_AtomicMemory<SCR3RV64_LSU>;
+ defm : SCR3_Other;
+
+ defm : SCR3_Unsupported;
+ defm : SCR3_NoReadAdvances;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVSchedule.td b/llvm/lib/Target/RISCV/RISCVSchedule.td
index d9a2e38c0e9d..1fdbc7cbcbaf 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedule.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedule.td
@@ -257,6 +257,90 @@ def : ReadAdvance<ReadFSqrt16, 0>;
} // Unsupported = true
}
+multiclass UnsupportedSchedF {
+let Unsupported = true in {
+def : WriteRes<WriteFST32, []>;
+def : WriteRes<WriteFLD32, []>;
+def : WriteRes<WriteFAdd32, []>;
+def : WriteRes<WriteFSGNJ32, []>;
+def : WriteRes<WriteFMinMax32, []>;
+def : WriteRes<WriteFCvtI32ToF32, []>;
+def : WriteRes<WriteFCvtI64ToF32, []>;
+def : WriteRes<WriteFCvtF32ToI32, []>;
+def : WriteRes<WriteFCvtF32ToI64, []>;
+def : WriteRes<WriteFClass32, []>;
+def : WriteRes<WriteFCmp32, []>;
+def : WriteRes<WriteFMovF32ToI32, []>;
+def : WriteRes<WriteFMovI32ToF32, []>;
+def : WriteRes<WriteFMul32, []>;
+def : WriteRes<WriteFMA32, []>;
+def : WriteRes<WriteFDiv32, []>;
+def : WriteRes<WriteFSqrt32, []>;
+
+def : ReadAdvance<ReadFAdd32, 0>;
+def : ReadAdvance<ReadFMul32, 0>;
+def : ReadAdvance<ReadFMA32, 0>;
+def : ReadAdvance<ReadFMA32Addend, 0>;
+def : ReadAdvance<ReadFDiv32, 0>;
+def : ReadAdvance<ReadFSqrt32, 0>;
+def : ReadAdvance<ReadFCmp32, 0>;
+def : ReadAdvance<ReadFSGNJ32, 0>;
+def : ReadAdvance<ReadFMinMax32, 0>;
+def : ReadAdvance<ReadFCvtF32ToI32, 0>;
+def : ReadAdvance<ReadFCvtF32ToI64, 0>;
+def : ReadAdvance<ReadFCvtI32ToF32, 0>;
+def : ReadAdvance<ReadFCvtI64ToF32, 0>;
+def : ReadAdvance<ReadFMovF32ToI32, 0>;
+def : ReadAdvance<ReadFMovI32ToF32, 0>;
+def : ReadAdvance<ReadFClass32, 0>;
+def : ReadAdvance<ReadFStoreData, 0>;
+def : ReadAdvance<ReadFMemBase, 0>;
+} // Unsupported = true
+}
+
+multiclass UnsupportedSchedD {
+let Unsupported = true in {
+def : WriteRes<WriteFST64, []>;
+def : WriteRes<WriteFLD64, []>;
+def : WriteRes<WriteFAdd64, []>;
+def : WriteRes<WriteFSGNJ64, []>;
+def : WriteRes<WriteFMinMax64, []>;
+def : WriteRes<WriteFCvtI32ToF64, []>;
+def : WriteRes<WriteFCvtI64ToF64, []>;
+def : WriteRes<WriteFCvtF64ToI32, []>;
+def : WriteRes<WriteFCvtF64ToI64, []>;
+def : WriteRes<WriteFCvtF32ToF64, []>;
+def : WriteRes<WriteFCvtF64ToF32, []>;
+def : WriteRes<WriteFClass64, []>;
+def : WriteRes<WriteFCmp64, []>;
+def : WriteRes<WriteFMovF64ToI64, []>;
+def : WriteRes<WriteFMovI64ToF64, []>;
+def : WriteRes<WriteFMul64, []>;
+def : WriteRes<WriteFMA64, []>;
+def : WriteRes<WriteFDiv64, []>;
+def : WriteRes<WriteFSqrt64, []>;
+
+def : ReadAdvance<ReadFAdd64, 0>;
+def : ReadAdvance<ReadFMul64, 0>;
+def : ReadAdvance<ReadFMA64, 0>;
+def : ReadAdvance<ReadFMA64Addend, 0>;
+def : ReadAdvance<ReadFDiv64, 0>;
+def : ReadAdvance<ReadFSqrt64, 0>;
+def : ReadAdvance<ReadFCmp64, 0>;
+def : ReadAdvance<ReadFSGNJ64, 0>;
+def : ReadAdvance<ReadFMinMax64, 0>;
+def : ReadAdvance<ReadFCvtF64ToI32, 0>;
+def : ReadAdvance<ReadFCvtF64ToI64, 0>;
+def : ReadAdvance<ReadFCvtI32ToF64, 0>;
+def : ReadAdvance<ReadFCvtI64ToF64, 0>;
+def : ReadAdvance<ReadFCvtF32ToF64, 0>;
+def : ReadAdvance<ReadFCvtF64ToF32, 0>;
+def : ReadAdvance<ReadFMovF64ToI64, 0>;
+def : ReadAdvance<ReadFMovI64ToF64, 0>;
+def : ReadAdvance<ReadFClass64, 0>;
+} // Unsupported = true
+}
+
multiclass UnsupportedSchedSFB {
let Unsupported = true in {
def : WriteRes<WriteSFB, []>;
@@ -293,6 +377,26 @@ def : ReadAdvance<ReadAtomicHD, 0>;
} // Unsupported = true
}
+multiclass UnsupportedSchedA {
+let Unsupported = true in {
+def : WriteRes<WriteAtomicW, []>;
+def : WriteRes<WriteAtomicD, []>;
+def : WriteRes<WriteAtomicLDW, []>;
+def : WriteRes<WriteAtomicLDD, []>;
+def : WriteRes<WriteAtomicSTW, []>;
+def : WriteRes<WriteAtomicSTD, []>;
+
+def : ReadAdvance<ReadAtomicWA, 0>;
+def : ReadAdvance<ReadAtomicWD, 0>;
+def : ReadAdvance<ReadAtomicDA, 0>;
+def : ReadAdvance<ReadAtomicDD, 0>;
+def : ReadAdvance<ReadAtomicLDW, 0>;
+def : ReadAdvance<ReadAtomicLDD, 0>;
+def : ReadAdvance<ReadAtomicSTW, 0>;
+def : ReadAdvance<ReadAtomicSTD, 0>;
+} // Unsupported = true
+}
+
// Include the scheduler resources for other instruction extensions.
include "RISCVScheduleZb.td"
include "RISCVScheduleV.td"
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
index d3236bb07d56..e84ddc65e2b7 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -39,9 +39,6 @@ namespace llvm::RISCVTuneInfoTable {
#include "RISCVGenSearchableTables.inc"
} // namespace llvm::RISCVTuneInfoTable
-static cl::opt<bool> EnableSubRegLiveness("riscv-enable-subreg-liveness",
- cl::init(true), cl::Hidden);
-
static cl::opt<unsigned> RVVVectorLMULMax(
"riscv-v-fixed-length-vector-lmul-max",
cl::desc("The maximum LMUL value to use for fixed length vectors. "
@@ -183,11 +180,7 @@ bool RISCVSubtarget::useRVVForFixedLengthVectors() const {
return hasVInstructions() && getMinRVVVectorSizeInBits() != 0;
}
-bool RISCVSubtarget::enableSubRegLiveness() const {
- // FIXME: Enable subregister liveness by default for RVV to better handle
- // LMUL>1 and segment load/store.
- return EnableSubRegLiveness;
-}
+bool RISCVSubtarget::enableSubRegLiveness() const { return true; }
void RISCVSubtarget::getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
diff --git a/llvm/lib/Target/RISCV/RISCVSystemOperands.td b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
index 01c276711950..db840b302749 100644
--- a/llvm/lib/Target/RISCV/RISCVSystemOperands.td
+++ b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
@@ -311,6 +311,11 @@ foreach i = 3...31 in {
}
//===----------------------------------------------------------------------===//
+// Supervisor Counter Setup
+//===----------------------------------------------------------------------===//
+def : SysReg<"scountinhibit", 0x120>;
+
+//===----------------------------------------------------------------------===//
// Debug/ Trace Registers (shared with Debug Mode)
//===----------------------------------------------------------------------===//
def : SysReg<"tselect", 0x7A0>;
@@ -377,6 +382,12 @@ def SEED : SysReg<"seed", 0x015>;
// Machine-level CSRs
def : SysReg<"miselect", 0x350>;
def : SysReg<"mireg", 0x351>;
+foreach i = 2...3 in {
+ def : SysReg<"mireg"#i, !add(0x350, i)>;
+}
+foreach i = 4...6 in {
+ def : SysReg<"mireg"#i, !add(0x351, i)>;
+}
def : SysReg<"mtopei", 0x35C>;
def : SysReg<"mtopi", 0xFB0>;
def : SysReg<"mvien", 0x308>;
@@ -392,6 +403,12 @@ def : SysReg<"miph", 0x354>;
// Supervisor-level CSRs
def : SysReg<"siselect", 0x150>;
def : SysReg<"sireg", 0x151>;
+foreach i = 2...3 in {
+ def : SysReg<"sireg"#i, !add(0x150, i)>;
+}
+foreach i = 4...6 in {
+ def : SysReg<"sireg"#i, !add(0x151, i)>;
+}
def : SysReg<"stopei", 0x15C>;
def : SysReg<"stopi", 0xDB0>;
let isRV32Only = 1 in {
@@ -406,6 +423,12 @@ def : SysReg<"hviprio1", 0x646>;
def : SysReg<"hviprio2", 0x647>;
def : SysReg<"vsiselect", 0x250>;
def : SysReg<"vsireg", 0x251>;
+foreach i = 2...3 in {
+ def : SysReg<"vsireg"#i, !add(0x250, i)>;
+}
+foreach i = 4...6 in {
+ def : SysReg<"vsireg"#i, !add(0x251, i)>;
+}
def : SysReg<"vstopei", 0x25C>;
def : SysReg<"vstopi", 0xEB0>;
let isRV32Only = 1 in {
diff --git a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp
index 7f5f7d0b1e4d..25e285e35f93 100644
--- a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp
@@ -138,7 +138,7 @@ ConvergenceRegion::ConvergenceRegion(
SmallPtrSet<BasicBlock *, 8> &&Blocks, SmallPtrSet<BasicBlock *, 2> &&Exits)
: DT(DT), LI(LI), ConvergenceToken(ConvergenceToken), Entry(Entry),
Exits(std::move(Exits)), Blocks(std::move(Blocks)) {
- for (auto *BB : this->Exits)
+ for ([[maybe_unused]] auto *BB : this->Exits)
assert(this->Blocks.count(BB) != 0);
assert(this->Blocks.count(this->Entry) != 0);
}
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp
index d96d2bf31b62..0f9a2a69e073 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp
@@ -198,6 +198,8 @@ std::string getExtInstSetName(SPIRV::InstructionSet::InstructionSet Set) {
return "OpenCL.std";
case SPIRV::InstructionSet::GLSL_std_450:
return "GLSL.std.450";
+ case SPIRV::InstructionSet::NonSemantic_Shader_DebugInfo_100:
+ return "NonSemantic.Shader.DebugInfo.100";
case SPIRV::InstructionSet::SPV_AMD_shader_trinary_minmax:
return "SPV_AMD_shader_trinary_minmax";
}
@@ -206,8 +208,9 @@ std::string getExtInstSetName(SPIRV::InstructionSet::InstructionSet Set) {
SPIRV::InstructionSet::InstructionSet
getExtInstSetFromString(std::string SetName) {
- for (auto Set : {SPIRV::InstructionSet::GLSL_std_450,
- SPIRV::InstructionSet::OpenCL_std}) {
+ for (auto Set :
+ {SPIRV::InstructionSet::GLSL_std_450, SPIRV::InstructionSet::OpenCL_std,
+ SPIRV::InstructionSet::NonSemantic_Shader_DebugInfo_100}) {
if (SetName == getExtInstSetName(Set))
return Set;
}
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h
index 990eb1d230bc..44625793e941 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h
@@ -197,6 +197,11 @@ namespace GLSLExtInst {
#include "SPIRVGenTables.inc"
} // namespace GLSLExtInst
+namespace NonSemanticExtInst {
+#define GET_NonSemanticExtInst_DECL
+#include "SPIRVGenTables.inc"
+} // namespace NonSemanticExtInst
+
namespace Opcode {
#define GET_Opcode_DECL
#include "SPIRVGenTables.inc"
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.h
index 842958695e10..a6dd7138edf3 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.h
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.h
@@ -21,7 +21,7 @@ public:
~SPIRVTargetStreamer() override;
void changeSection(const MCSection *CurSection, MCSection *Section,
- const MCExpr *SubSection, raw_ostream &OS) override {}
+ uint32_t SubSection, raw_ostream &OS) override {}
};
} // namespace llvm
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 49838e685a6d..0b93a4d85eed 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -300,6 +300,72 @@ lookupBuiltin(StringRef DemangledCall,
return nullptr;
}
+static MachineInstr *getBlockStructInstr(Register ParamReg,
+ MachineRegisterInfo *MRI) {
+ // We expect the following sequence of instructions:
+ // %0:_(pN) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.spv.alloca)
+ // or = G_GLOBAL_VALUE @block_literal_global
+ // %1:_(pN) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.spv.bitcast), %0
+ // %2:_(p4) = G_ADDRSPACE_CAST %1:_(pN)
+ MachineInstr *MI = MRI->getUniqueVRegDef(ParamReg);
+ assert(MI->getOpcode() == TargetOpcode::G_ADDRSPACE_CAST &&
+ MI->getOperand(1).isReg());
+ Register BitcastReg = MI->getOperand(1).getReg();
+ MachineInstr *BitcastMI = MRI->getUniqueVRegDef(BitcastReg);
+ assert(isSpvIntrinsic(*BitcastMI, Intrinsic::spv_bitcast) &&
+ BitcastMI->getOperand(2).isReg());
+ Register ValueReg = BitcastMI->getOperand(2).getReg();
+ MachineInstr *ValueMI = MRI->getUniqueVRegDef(ValueReg);
+ return ValueMI;
+}
+
+// Return an integer constant corresponding to the given register and
+// defined in spv_track_constant.
+// TODO: maybe unify with prelegalizer pass.
+static unsigned getConstFromIntrinsic(Register Reg, MachineRegisterInfo *MRI) {
+ MachineInstr *DefMI = MRI->getUniqueVRegDef(Reg);
+ assert(isSpvIntrinsic(*DefMI, Intrinsic::spv_track_constant) &&
+ DefMI->getOperand(2).isReg());
+ MachineInstr *DefMI2 = MRI->getUniqueVRegDef(DefMI->getOperand(2).getReg());
+ assert(DefMI2->getOpcode() == TargetOpcode::G_CONSTANT &&
+ DefMI2->getOperand(1).isCImm());
+ return DefMI2->getOperand(1).getCImm()->getValue().getZExtValue();
+}
+
+// Return type of the instruction result from spv_assign_type intrinsic.
+// TODO: maybe unify with prelegalizer pass.
+static const Type *getMachineInstrType(MachineInstr *MI) {
+ MachineInstr *NextMI = MI->getNextNode();
+ if (!NextMI)
+ return nullptr;
+ if (isSpvIntrinsic(*NextMI, Intrinsic::spv_assign_name))
+ if ((NextMI = NextMI->getNextNode()) == nullptr)
+ return nullptr;
+ Register ValueReg = MI->getOperand(0).getReg();
+ if ((!isSpvIntrinsic(*NextMI, Intrinsic::spv_assign_type) &&
+ !isSpvIntrinsic(*NextMI, Intrinsic::spv_assign_ptr_type)) ||
+ NextMI->getOperand(1).getReg() != ValueReg)
+ return nullptr;
+ Type *Ty = getMDOperandAsType(NextMI->getOperand(2).getMetadata(), 0);
+ assert(Ty && "Type is expected");
+ return Ty;
+}
+
+static const Type *getBlockStructType(Register ParamReg,
+ MachineRegisterInfo *MRI) {
+ // In principle, this information should be passed to us from Clang via
+ // an elementtype attribute. However, said attribute requires that
+ // the function call be an intrinsic, which is not. Instead, we rely on being
+ // able to trace this to the declaration of a variable: OpenCL C specification
+ // section 6.12.5 should guarantee that we can do this.
+ MachineInstr *MI = getBlockStructInstr(ParamReg, MRI);
+ if (MI->getOpcode() == TargetOpcode::G_GLOBAL_VALUE)
+ return MI->getOperand(1).getGlobal()->getType();
+ assert(isSpvIntrinsic(*MI, Intrinsic::spv_alloca) &&
+ "Blocks in OpenCL C must be traceable to allocation site");
+ return getMachineInstrType(MI);
+}
+
//===----------------------------------------------------------------------===//
// Helper functions for building misc instructions
//===----------------------------------------------------------------------===//
@@ -492,16 +558,21 @@ static Register buildMemSemanticsReg(Register SemanticsRegister,
static bool buildOpFromWrapper(MachineIRBuilder &MIRBuilder, unsigned Opcode,
const SPIRV::IncomingCall *Call,
- Register TypeReg = Register(0)) {
+ Register TypeReg,
+ ArrayRef<uint32_t> ImmArgs = {}) {
MachineRegisterInfo *MRI = MIRBuilder.getMRI();
auto MIB = MIRBuilder.buildInstr(Opcode);
if (TypeReg.isValid())
MIB.addDef(Call->ReturnRegister).addUse(TypeReg);
- for (Register ArgReg : Call->Arguments) {
+ unsigned Sz = Call->Arguments.size() - ImmArgs.size();
+ for (unsigned i = 0; i < Sz; ++i) {
+ Register ArgReg = Call->Arguments[i];
if (!MRI->getRegClassOrNull(ArgReg))
MRI->setRegClass(ArgReg, &SPIRV::IDRegClass);
MIB.addUse(ArgReg);
}
+ for (uint32_t ImmArg : ImmArgs)
+ MIB.addImm(ImmArg);
return true;
}
@@ -509,7 +580,7 @@ static bool buildOpFromWrapper(MachineIRBuilder &MIRBuilder, unsigned Opcode,
static bool buildAtomicInitInst(const SPIRV::IncomingCall *Call,
MachineIRBuilder &MIRBuilder) {
if (Call->isSpirvOp())
- return buildOpFromWrapper(MIRBuilder, SPIRV::OpStore, Call);
+ return buildOpFromWrapper(MIRBuilder, SPIRV::OpStore, Call, Register(0));
assert(Call->Arguments.size() == 2 &&
"Need 2 arguments for atomic init translation");
@@ -567,7 +638,7 @@ static bool buildAtomicStoreInst(const SPIRV::IncomingCall *Call,
MachineIRBuilder &MIRBuilder,
SPIRVGlobalRegistry *GR) {
if (Call->isSpirvOp())
- return buildOpFromWrapper(MIRBuilder, SPIRV::OpAtomicStore, Call);
+ return buildOpFromWrapper(MIRBuilder, SPIRV::OpAtomicStore, Call, Register(0));
Register ScopeRegister =
buildConstantIntReg(SPIRV::Scope::Device, MIRBuilder, GR);
@@ -694,7 +765,7 @@ static bool buildAtomicCompareExchangeInst(
return true;
}
-/// Helper function for building an atomic load instruction.
+/// Helper function for building atomic instructions.
static bool buildAtomicRMWInst(const SPIRV::IncomingCall *Call, unsigned Opcode,
MachineIRBuilder &MIRBuilder,
SPIRVGlobalRegistry *GR) {
@@ -719,13 +790,36 @@ static bool buildAtomicRMWInst(const SPIRV::IncomingCall *Call, unsigned Opcode,
MemSemanticsReg = buildMemSemanticsReg(MemSemanticsReg, PtrRegister,
Semantics, MIRBuilder, GR);
MRI->setRegClass(Call->Arguments[1], &SPIRV::IDRegClass);
+ Register ValueReg = Call->Arguments[1];
+ Register ValueTypeReg = GR->getSPIRVTypeID(Call->ReturnType);
+ // support cl_ext_float_atomics
+ if (Call->ReturnType->getOpcode() == SPIRV::OpTypeFloat) {
+ if (Opcode == SPIRV::OpAtomicIAdd) {
+ Opcode = SPIRV::OpAtomicFAddEXT;
+ } else if (Opcode == SPIRV::OpAtomicISub) {
+ // Translate OpAtomicISub applied to a floating type argument to
+ // OpAtomicFAddEXT with the negative value operand
+ Opcode = SPIRV::OpAtomicFAddEXT;
+ Register NegValueReg =
+ MRI->createGenericVirtualRegister(MRI->getType(ValueReg));
+ MRI->setRegClass(NegValueReg, &SPIRV::IDRegClass);
+ GR->assignSPIRVTypeToVReg(Call->ReturnType, NegValueReg,
+ MIRBuilder.getMF());
+ MIRBuilder.buildInstr(TargetOpcode::G_FNEG)
+ .addDef(NegValueReg)
+ .addUse(ValueReg);
+ insertAssignInstr(NegValueReg, nullptr, Call->ReturnType, GR, MIRBuilder,
+ MIRBuilder.getMF().getRegInfo());
+ ValueReg = NegValueReg;
+ }
+ }
MIRBuilder.buildInstr(Opcode)
.addDef(Call->ReturnRegister)
- .addUse(GR->getSPIRVTypeID(Call->ReturnType))
+ .addUse(ValueTypeReg)
.addUse(PtrRegister)
.addUse(ScopeRegister)
.addUse(MemSemanticsReg)
- .addUse(Call->Arguments[1]);
+ .addUse(ValueReg);
return true;
}
@@ -804,7 +898,7 @@ static bool buildBarrierInst(const SPIRV::IncomingCall *Call, unsigned Opcode,
MachineIRBuilder &MIRBuilder,
SPIRVGlobalRegistry *GR) {
if (Call->isSpirvOp())
- return buildOpFromWrapper(MIRBuilder, Opcode, Call);
+ return buildOpFromWrapper(MIRBuilder, Opcode, Call, Register(0));
MachineRegisterInfo *MRI = MIRBuilder.getMRI();
unsigned MemFlags = getIConstVal(Call->Arguments[0], MRI);
@@ -949,7 +1043,35 @@ static bool generateGroupInst(const SPIRV::IncomingCall *Call,
const SPIRV::DemangledBuiltin *Builtin = Call->Builtin;
const SPIRV::GroupBuiltin *GroupBuiltin =
SPIRV::lookupGroupBuiltin(Builtin->Name);
+
MachineRegisterInfo *MRI = MIRBuilder.getMRI();
+ if (Call->isSpirvOp()) {
+ if (GroupBuiltin->NoGroupOperation)
+ return buildOpFromWrapper(MIRBuilder, GroupBuiltin->Opcode, Call,
+ GR->getSPIRVTypeID(Call->ReturnType));
+
+ // Group Operation is a literal
+ Register GroupOpReg = Call->Arguments[1];
+ const MachineInstr *MI = getDefInstrMaybeConstant(GroupOpReg, MRI);
+ if (!MI || MI->getOpcode() != TargetOpcode::G_CONSTANT)
+ report_fatal_error(
+ "Group Operation parameter must be an integer constant");
+ uint64_t GrpOp = MI->getOperand(1).getCImm()->getValue().getZExtValue();
+ Register ScopeReg = Call->Arguments[0];
+ if (!MRI->getRegClassOrNull(ScopeReg))
+ MRI->setRegClass(ScopeReg, &SPIRV::IDRegClass);
+ Register ValueReg = Call->Arguments[2];
+ if (!MRI->getRegClassOrNull(ValueReg))
+ MRI->setRegClass(ValueReg, &SPIRV::IDRegClass);
+ MIRBuilder.buildInstr(GroupBuiltin->Opcode)
+ .addDef(Call->ReturnRegister)
+ .addUse(GR->getSPIRVTypeID(Call->ReturnType))
+ .addUse(ScopeReg)
+ .addImm(GrpOp)
+ .addUse(ValueReg);
+ return true;
+ }
+
Register Arg0;
if (GroupBuiltin->HasBoolArg) {
Register ConstRegister = Call->Arguments[0];
@@ -1371,6 +1493,14 @@ static bool generateBarrierInst(const SPIRV::IncomingCall *Call,
return buildBarrierInst(Call, Opcode, MIRBuilder, GR);
}
+static bool generateCastToPtrInst(const SPIRV::IncomingCall *Call,
+ MachineIRBuilder &MIRBuilder) {
+ MIRBuilder.buildInstr(TargetOpcode::G_ADDRSPACE_CAST)
+ .addDef(Call->ReturnRegister)
+ .addUse(Call->Arguments[0]);
+ return true;
+}
+
static bool generateDotOrFMulInst(const SPIRV::IncomingCall *Call,
MachineIRBuilder &MIRBuilder,
SPIRVGlobalRegistry *GR) {
@@ -1722,6 +1852,45 @@ static bool generateSelectInst(const SPIRV::IncomingCall *Call,
return true;
}
+static bool generateConstructInst(const SPIRV::IncomingCall *Call,
+ MachineIRBuilder &MIRBuilder,
+ SPIRVGlobalRegistry *GR) {
+ return buildOpFromWrapper(MIRBuilder, SPIRV::OpCompositeConstruct, Call,
+ GR->getSPIRVTypeID(Call->ReturnType));
+}
+
+static bool generateCoopMatrInst(const SPIRV::IncomingCall *Call,
+ MachineIRBuilder &MIRBuilder,
+ SPIRVGlobalRegistry *GR) {
+ const SPIRV::DemangledBuiltin *Builtin = Call->Builtin;
+ unsigned Opcode =
+ SPIRV::lookupNativeBuiltin(Builtin->Name, Builtin->Set)->Opcode;
+ bool IsSet = Opcode != SPIRV::OpCooperativeMatrixStoreKHR;
+ unsigned ArgSz = Call->Arguments.size();
+ unsigned LiteralIdx = 0;
+ if (Opcode == SPIRV::OpCooperativeMatrixLoadKHR && ArgSz > 3)
+ LiteralIdx = 3;
+ else if (Opcode == SPIRV::OpCooperativeMatrixStoreKHR && ArgSz > 4)
+ LiteralIdx = 4;
+ SmallVector<uint32_t, 1> ImmArgs;
+ MachineRegisterInfo *MRI = MIRBuilder.getMRI();
+ if (LiteralIdx > 0)
+ ImmArgs.push_back(getConstFromIntrinsic(Call->Arguments[LiteralIdx], MRI));
+ Register TypeReg = GR->getSPIRVTypeID(Call->ReturnType);
+ if (Opcode == SPIRV::OpCooperativeMatrixLengthKHR) {
+ SPIRVType *CoopMatrType = GR->getSPIRVTypeForVReg(Call->Arguments[0]);
+ if (!CoopMatrType)
+ report_fatal_error("Can't find a register's type definition");
+ MIRBuilder.buildInstr(Opcode)
+ .addDef(Call->ReturnRegister)
+ .addUse(TypeReg)
+ .addUse(CoopMatrType->getOperand(0).getReg());
+ return true;
+ }
+ return buildOpFromWrapper(MIRBuilder, Opcode, Call,
+ IsSet ? TypeReg : Register(0), ImmArgs);
+}
+
static bool generateSpecConstantInst(const SPIRV::IncomingCall *Call,
MachineIRBuilder &MIRBuilder,
SPIRVGlobalRegistry *GR) {
@@ -1826,7 +1995,10 @@ static bool buildNDRange(const SPIRV::IncomingCall *Call,
.addDef(GlobalWorkSize)
.addUse(GR->getSPIRVTypeID(SpvFieldTy))
.addUse(GWSPtr);
- Const = GR->getOrCreateConsIntArray(0, MIRBuilder, SpvFieldTy);
+ const SPIRVSubtarget &ST =
+ cast<SPIRVSubtarget>(MIRBuilder.getMF().getSubtarget());
+ Const = GR->getOrCreateConstIntArray(0, Size, *MIRBuilder.getInsertPt(),
+ SpvFieldTy, *ST.getInstrInfo());
} else {
Const = GR->buildConstantInt(0, MIRBuilder, SpvTy);
}
@@ -1847,68 +2019,6 @@ static bool buildNDRange(const SPIRV::IncomingCall *Call,
.addUse(TmpReg);
}
-static MachineInstr *getBlockStructInstr(Register ParamReg,
- MachineRegisterInfo *MRI) {
- // We expect the following sequence of instructions:
- // %0:_(pN) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.spv.alloca)
- // or = G_GLOBAL_VALUE @block_literal_global
- // %1:_(pN) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.spv.bitcast), %0
- // %2:_(p4) = G_ADDRSPACE_CAST %1:_(pN)
- MachineInstr *MI = MRI->getUniqueVRegDef(ParamReg);
- assert(MI->getOpcode() == TargetOpcode::G_ADDRSPACE_CAST &&
- MI->getOperand(1).isReg());
- Register BitcastReg = MI->getOperand(1).getReg();
- MachineInstr *BitcastMI = MRI->getUniqueVRegDef(BitcastReg);
- assert(isSpvIntrinsic(*BitcastMI, Intrinsic::spv_bitcast) &&
- BitcastMI->getOperand(2).isReg());
- Register ValueReg = BitcastMI->getOperand(2).getReg();
- MachineInstr *ValueMI = MRI->getUniqueVRegDef(ValueReg);
- return ValueMI;
-}
-
-// Return an integer constant corresponding to the given register and
-// defined in spv_track_constant.
-// TODO: maybe unify with prelegalizer pass.
-static unsigned getConstFromIntrinsic(Register Reg, MachineRegisterInfo *MRI) {
- MachineInstr *DefMI = MRI->getUniqueVRegDef(Reg);
- assert(isSpvIntrinsic(*DefMI, Intrinsic::spv_track_constant) &&
- DefMI->getOperand(2).isReg());
- MachineInstr *DefMI2 = MRI->getUniqueVRegDef(DefMI->getOperand(2).getReg());
- assert(DefMI2->getOpcode() == TargetOpcode::G_CONSTANT &&
- DefMI2->getOperand(1).isCImm());
- return DefMI2->getOperand(1).getCImm()->getValue().getZExtValue();
-}
-
-// Return type of the instruction result from spv_assign_type intrinsic.
-// TODO: maybe unify with prelegalizer pass.
-static const Type *getMachineInstrType(MachineInstr *MI) {
- MachineInstr *NextMI = MI->getNextNode();
- if (isSpvIntrinsic(*NextMI, Intrinsic::spv_assign_name))
- NextMI = NextMI->getNextNode();
- Register ValueReg = MI->getOperand(0).getReg();
- if (!isSpvIntrinsic(*NextMI, Intrinsic::spv_assign_type) ||
- NextMI->getOperand(1).getReg() != ValueReg)
- return nullptr;
- Type *Ty = getMDOperandAsType(NextMI->getOperand(2).getMetadata(), 0);
- assert(Ty && "Type is expected");
- return Ty;
-}
-
-static const Type *getBlockStructType(Register ParamReg,
- MachineRegisterInfo *MRI) {
- // In principle, this information should be passed to us from Clang via
- // an elementtype attribute. However, said attribute requires that
- // the function call be an intrinsic, which is not. Instead, we rely on being
- // able to trace this to the declaration of a variable: OpenCL C specification
- // section 6.12.5 should guarantee that we can do this.
- MachineInstr *MI = getBlockStructInstr(ParamReg, MRI);
- if (MI->getOpcode() == TargetOpcode::G_GLOBAL_VALUE)
- return MI->getOperand(1).getGlobal()->getType();
- assert(isSpvIntrinsic(*MI, Intrinsic::spv_alloca) &&
- "Blocks in OpenCL C must be traceable to allocation site");
- return getMachineInstrType(MI);
-}
-
// TODO: maybe move to the global register.
static SPIRVType *
getOrCreateSPIRVDeviceEventPointer(MachineIRBuilder &MIRBuilder,
@@ -2322,6 +2432,8 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall,
return generateAtomicFloatingInst(Call.get(), MIRBuilder, GR);
case SPIRV::Barrier:
return generateBarrierInst(Call.get(), MIRBuilder, GR);
+ case SPIRV::CastToPtr:
+ return generateCastToPtrInst(Call.get(), MIRBuilder);
case SPIRV::Dot:
return generateDotOrFMulInst(Call.get(), MIRBuilder, GR);
case SPIRV::Wave:
@@ -2340,6 +2452,8 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall,
return generateSampleImageInst(DemangledCall, Call.get(), MIRBuilder, GR);
case SPIRV::Select:
return generateSelectInst(Call.get(), MIRBuilder);
+ case SPIRV::Construct:
+ return generateConstructInst(Call.get(), MIRBuilder, GR);
case SPIRV::SpecConstant:
return generateSpecConstantInst(Call.get(), MIRBuilder, GR);
case SPIRV::Enqueue:
@@ -2358,6 +2472,8 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall,
return generateGroupUniformInst(Call.get(), MIRBuilder, GR);
case SPIRV::KernelClock:
return generateKernelClockInst(Call.get(), MIRBuilder, GR);
+ case SPIRV::CoopMatr:
+ return generateCoopMatrInst(Call.get(), MIRBuilder, GR);
}
return false;
}
@@ -2376,7 +2492,7 @@ Type *parseBuiltinCallArgumentBaseType(const StringRef DemangledCall,
if (hasBuiltinTypePrefix(TypeStr)) {
// OpenCL builtin types in demangled call strings have the following format:
// e.g. ocl_image2d_ro
- bool IsOCLBuiltinType = TypeStr.consume_front("ocl_");
+ [[maybe_unused]] bool IsOCLBuiltinType = TypeStr.consume_front("ocl_");
assert(IsOCLBuiltinType && "Invalid OpenCL builtin prefix");
// Check if this is pointer to a builtin type and not just pointer
@@ -2482,6 +2598,22 @@ static SPIRVType *getPipeType(const TargetExtType *ExtensionType,
ExtensionType->getIntParameter(0)));
}
+static SPIRVType *getCoopMatrType(const TargetExtType *ExtensionType,
+ MachineIRBuilder &MIRBuilder,
+ SPIRVGlobalRegistry *GR) {
+ assert(ExtensionType->getNumIntParameters() == 4 &&
+ "Invalid number of parameters for SPIR-V coop matrices builtin!");
+ assert(ExtensionType->getNumTypeParameters() == 1 &&
+ "SPIR-V coop matrices builtin type must have a type parameter!");
+ const SPIRVType *ElemType =
+ GR->getOrCreateSPIRVType(ExtensionType->getTypeParameter(0), MIRBuilder);
+ // Create or get an existing type from GlobalRegistry.
+ return GR->getOrCreateOpTypeCoopMatr(
+ MIRBuilder, ExtensionType, ElemType, ExtensionType->getIntParameter(0),
+ ExtensionType->getIntParameter(1), ExtensionType->getIntParameter(2),
+ ExtensionType->getIntParameter(3));
+}
+
static SPIRVType *
getImageType(const TargetExtType *ExtensionType,
const SPIRV::AccessQualifier::AccessQualifier Qualifier,
@@ -2612,6 +2744,9 @@ SPIRVType *lowerBuiltinType(const Type *OpaqueType,
case SPIRV::OpTypeSampledImage:
TargetType = getSampledImageType(BuiltinType, MIRBuilder, GR);
break;
+ case SPIRV::OpTypeCooperativeMatrixKHR:
+ TargetType = getCoopMatrType(BuiltinType, MIRBuilder, GR);
+ break;
default:
TargetType =
getNonParameterizedType(BuiltinType, TypeRecord, MIRBuilder, GR);
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index edc9e1a33d9f..fb88332ab890 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -26,6 +26,7 @@ class InstructionSet<bits<32> value> {
def OpenCL_std : InstructionSet<0>;
def GLSL_std_450 : InstructionSet<1>;
def SPV_AMD_shader_trinary_minmax : InstructionSet<2>;
+def NonSemantic_Shader_DebugInfo_100 : InstructionSet<3>;
// Define various builtin groups
def BuiltinGroup : GenericEnum {
@@ -59,6 +60,9 @@ def IntelSubgroups : BuiltinGroup;
def AtomicFloating : BuiltinGroup;
def GroupUniform : BuiltinGroup;
def KernelClock : BuiltinGroup;
+def CastToPtr : BuiltinGroup;
+def Construct : BuiltinGroup;
+def CoopMatr : BuiltinGroup;
//===----------------------------------------------------------------------===//
// Class defining a demangled builtin record. The information in the record
@@ -113,6 +117,9 @@ def : DemangledBuiltin<"__spirv_ImageSampleExplicitLod", OpenCL_std, SampleImage
// Select builtin record:
def : DemangledBuiltin<"__spirv_Select", OpenCL_std, Select, 3, 3>;
+// Composite Construct builtin record:
+def : DemangledBuiltin<"__spirv_CompositeConstruct", OpenCL_std, Construct, 1, 0>;
+
//===----------------------------------------------------------------------===//
// Class defining an extended builtin record used for lowering into an
// OpExtInst instruction.
@@ -170,6 +177,17 @@ class GLSLExtInst<string name, bits<32> value> {
bits<32> Value = value;
}
+def NonSemanticExtInst : GenericEnum {
+ let FilterClass = "NonSemanticExtInst";
+ let NameField = "Name";
+ let ValueField = "Value";
+}
+
+class NonSemanticExtInst<string name, bits<32> value> {
+ string Name = name;
+ bits<32> Value = value;
+}
+
// Multiclass used to define at the same time both a demangled builtin record
// and a corresponding extended builtin record.
multiclass DemangledExtendedBuiltin<string name, InstructionSet set, int number> {
@@ -183,6 +201,10 @@ multiclass DemangledExtendedBuiltin<string name, InstructionSet set, int number>
if !eq(set, GLSL_std_450) then {
def : GLSLExtInst<name, number>;
}
+
+ if !eq(set, NonSemantic_Shader_DebugInfo_100) then {
+ def : NonSemanticExtInst<name, number>;
+ }
}
// Extended builtin records:
@@ -430,6 +452,50 @@ defm : DemangledExtendedBuiltin<"NMin", GLSL_std_450, 79>;
defm : DemangledExtendedBuiltin<"NMax", GLSL_std_450, 80>;
defm : DemangledExtendedBuiltin<"NClamp", GLSL_std_450, 81>;
+defm : DemangledExtendedBuiltin<"DebugInfoNone", NonSemantic_Shader_DebugInfo_100, 0>;
+defm : DemangledExtendedBuiltin<"DebugCompilationUnit", NonSemantic_Shader_DebugInfo_100, 1>;
+defm : DemangledExtendedBuiltin<"DebugTypeBasic", NonSemantic_Shader_DebugInfo_100, 2>;
+defm : DemangledExtendedBuiltin<"DebugTypePointer", NonSemantic_Shader_DebugInfo_100, 3>;
+defm : DemangledExtendedBuiltin<"DebugTypeQualifier", NonSemantic_Shader_DebugInfo_100, 4>;
+defm : DemangledExtendedBuiltin<"DebugTypeArray", NonSemantic_Shader_DebugInfo_100, 5>;
+defm : DemangledExtendedBuiltin<"DebugTypeVector", NonSemantic_Shader_DebugInfo_100, 6>;
+defm : DemangledExtendedBuiltin<"DebugTypedef", NonSemantic_Shader_DebugInfo_100, 7>;
+defm : DemangledExtendedBuiltin<"DebugTypeFunction", NonSemantic_Shader_DebugInfo_100, 8>;
+defm : DemangledExtendedBuiltin<"DebugTypeEnum", NonSemantic_Shader_DebugInfo_100, 9>;
+defm : DemangledExtendedBuiltin<"DebugTypeComposite", NonSemantic_Shader_DebugInfo_100, 10>;
+defm : DemangledExtendedBuiltin<"DebugTypeMember", NonSemantic_Shader_DebugInfo_100, 11>;
+defm : DemangledExtendedBuiltin<"DebugTypeInheritance", NonSemantic_Shader_DebugInfo_100, 12>;
+defm : DemangledExtendedBuiltin<"DebugTypePtrToMember", NonSemantic_Shader_DebugInfo_100, 13>;
+defm : DemangledExtendedBuiltin<"DebugTypeTemplate", NonSemantic_Shader_DebugInfo_100, 14>;
+defm : DemangledExtendedBuiltin<"DebugTypeTemplateParameter", NonSemantic_Shader_DebugInfo_100, 15>;
+defm : DemangledExtendedBuiltin<"DebugTypeTemplateTemplateParameter", NonSemantic_Shader_DebugInfo_100, 16>;
+defm : DemangledExtendedBuiltin<"DebugTypeTemplateParameterPack", NonSemantic_Shader_DebugInfo_100, 17>;
+defm : DemangledExtendedBuiltin<"DebugGlobalVariable", NonSemantic_Shader_DebugInfo_100, 18>;
+defm : DemangledExtendedBuiltin<"DebugFunctionDeclaration", NonSemantic_Shader_DebugInfo_100, 19>;
+defm : DemangledExtendedBuiltin<"DebugFunction", NonSemantic_Shader_DebugInfo_100, 20>;
+defm : DemangledExtendedBuiltin<"DebugLexicalBlock", NonSemantic_Shader_DebugInfo_100, 21>;
+defm : DemangledExtendedBuiltin<"DebugLexicalBlockDiscriminator", NonSemantic_Shader_DebugInfo_100, 22>;
+defm : DemangledExtendedBuiltin<"DebugScope", NonSemantic_Shader_DebugInfo_100, 23>;
+defm : DemangledExtendedBuiltin<"DebugNoScope", NonSemantic_Shader_DebugInfo_100, 24>;
+defm : DemangledExtendedBuiltin<"DebugInlinedAt", NonSemantic_Shader_DebugInfo_100, 25>;
+defm : DemangledExtendedBuiltin<"DebugLocalVariable", NonSemantic_Shader_DebugInfo_100, 26>;
+defm : DemangledExtendedBuiltin<"DebugInlinedVariable", NonSemantic_Shader_DebugInfo_100, 27>;
+defm : DemangledExtendedBuiltin<"DebugDeclare", NonSemantic_Shader_DebugInfo_100, 28>;
+defm : DemangledExtendedBuiltin<"DebugValue", NonSemantic_Shader_DebugInfo_100, 29>;
+defm : DemangledExtendedBuiltin<"DebugOperation", NonSemantic_Shader_DebugInfo_100, 30>;
+defm : DemangledExtendedBuiltin<"DebugExpression", NonSemantic_Shader_DebugInfo_100, 31>;
+defm : DemangledExtendedBuiltin<"DebugMacroDef", NonSemantic_Shader_DebugInfo_100, 32>;
+defm : DemangledExtendedBuiltin<"DebugMacroUndef", NonSemantic_Shader_DebugInfo_100, 33>;
+defm : DemangledExtendedBuiltin<"DebugImportedEntity", NonSemantic_Shader_DebugInfo_100, 34>;
+defm : DemangledExtendedBuiltin<"DebugSource", NonSemantic_Shader_DebugInfo_100, 35>;
+defm : DemangledExtendedBuiltin<"DebugFunctionDefinition", NonSemantic_Shader_DebugInfo_100, 101>;
+defm : DemangledExtendedBuiltin<"DebugSourceContinued", NonSemantic_Shader_DebugInfo_100, 102>;
+defm : DemangledExtendedBuiltin<"DebugLine", NonSemantic_Shader_DebugInfo_100, 103>;
+defm : DemangledExtendedBuiltin<"DebugNoLine", NonSemantic_Shader_DebugInfo_100, 104>;
+defm : DemangledExtendedBuiltin<"DebugBuildIdentifier", NonSemantic_Shader_DebugInfo_100, 105>;
+defm : DemangledExtendedBuiltin<"DebugStoragePath", NonSemantic_Shader_DebugInfo_100, 106>;
+defm : DemangledExtendedBuiltin<"DebugEntryPoint", NonSemantic_Shader_DebugInfo_100, 107>;
+defm : DemangledExtendedBuiltin<"DebugTypeMatrix", NonSemantic_Shader_DebugInfo_100, 108>;
//===----------------------------------------------------------------------===//
// Class defining an native builtin record used for direct translation into a
// SPIR-V instruction.
@@ -532,6 +598,7 @@ defm : DemangledNativeBuiltin<"__spirv_AtomicAnd", OpenCL_std, Atomic, 4, 4, OpA
defm : DemangledNativeBuiltin<"atomic_exchange", OpenCL_std, Atomic, 2, 4, OpAtomicExchange>;
defm : DemangledNativeBuiltin<"atomic_exchange_explicit", OpenCL_std, Atomic, 2, 4, OpAtomicExchange>;
defm : DemangledNativeBuiltin<"AtomicEx__spirv_change", OpenCL_std, Atomic, 2, 4, OpAtomicExchange>;
+defm : DemangledNativeBuiltin<"__spirv_AtomicExchange", OpenCL_std, Atomic, 4, 4, OpAtomicExchange>;
defm : DemangledNativeBuiltin<"atomic_work_item_fence", OpenCL_std, Atomic, 1, 3, OpMemoryBarrier>;
defm : DemangledNativeBuiltin<"__spirv_MemoryBarrier", OpenCL_std, Atomic, 2, 2, OpMemoryBarrier>;
defm : DemangledNativeBuiltin<"atomic_fetch_add", OpenCL_std, Atomic, 2, 4, OpAtomicIAdd>;
@@ -539,11 +606,11 @@ defm : DemangledNativeBuiltin<"atomic_fetch_sub", OpenCL_std, Atomic, 2, 4, OpAt
defm : DemangledNativeBuiltin<"atomic_fetch_or", OpenCL_std, Atomic, 2, 4, OpAtomicOr>;
defm : DemangledNativeBuiltin<"atomic_fetch_xor", OpenCL_std, Atomic, 2, 4, OpAtomicXor>;
defm : DemangledNativeBuiltin<"atomic_fetch_and", OpenCL_std, Atomic, 2, 4, OpAtomicAnd>;
-defm : DemangledNativeBuiltin<"atomic_fetch_add_explicit", OpenCL_std, Atomic, 4, 6, OpAtomicIAdd>;
-defm : DemangledNativeBuiltin<"atomic_fetch_sub_explicit", OpenCL_std, Atomic, 4, 6, OpAtomicISub>;
-defm : DemangledNativeBuiltin<"atomic_fetch_or_explicit", OpenCL_std, Atomic, 4, 6, OpAtomicOr>;
-defm : DemangledNativeBuiltin<"atomic_fetch_xor_explicit", OpenCL_std, Atomic, 4, 6, OpAtomicXor>;
-defm : DemangledNativeBuiltin<"atomic_fetch_and_explicit", OpenCL_std, Atomic, 4, 6, OpAtomicAnd>;
+defm : DemangledNativeBuiltin<"atomic_fetch_add_explicit", OpenCL_std, Atomic, 3, 4, OpAtomicIAdd>;
+defm : DemangledNativeBuiltin<"atomic_fetch_sub_explicit", OpenCL_std, Atomic, 3, 4, OpAtomicISub>;
+defm : DemangledNativeBuiltin<"atomic_fetch_or_explicit", OpenCL_std, Atomic, 3, 4, OpAtomicOr>;
+defm : DemangledNativeBuiltin<"atomic_fetch_xor_explicit", OpenCL_std, Atomic, 3, 4, OpAtomicXor>;
+defm : DemangledNativeBuiltin<"atomic_fetch_and_explicit", OpenCL_std, Atomic, 3, 4, OpAtomicAnd>;
defm : DemangledNativeBuiltin<"atomic_flag_test_and_set", OpenCL_std, Atomic, 1, 1, OpAtomicFlagTestAndSet>;
defm : DemangledNativeBuiltin<"__spirv_AtomicFlagTestAndSet", OpenCL_std, Atomic, 3, 3, OpAtomicFlagTestAndSet>;
defm : DemangledNativeBuiltin<"atomic_flag_test_and_set_explicit", OpenCL_std, Atomic, 2, 3, OpAtomicFlagTestAndSet>;
@@ -595,6 +662,23 @@ defm : DemangledNativeBuiltin<"__spirv_GroupWaitEvents", OpenCL_std, AsyncCopy,
defm : DemangledNativeBuiltin<"__spirv_Load", OpenCL_std, LoadStore, 1, 3, OpLoad>;
defm : DemangledNativeBuiltin<"__spirv_Store", OpenCL_std, LoadStore, 2, 4, OpStore>;
+// Address Space Qualifier Functions/Pointers Conversion Instructions:
+defm : DemangledNativeBuiltin<"to_global", OpenCL_std, CastToPtr, 1, 1, OpGenericCastToPtr>;
+defm : DemangledNativeBuiltin<"to_local", OpenCL_std, CastToPtr, 1, 1, OpGenericCastToPtr>;
+defm : DemangledNativeBuiltin<"to_private", OpenCL_std, CastToPtr, 1, 1, OpGenericCastToPtr>;
+defm : DemangledNativeBuiltin<"__spirv_GenericCastToPtr_ToGlobal", OpenCL_std, CastToPtr, 2, 2, OpGenericCastToPtr>;
+defm : DemangledNativeBuiltin<"__spirv_GenericCastToPtr_ToLocal", OpenCL_std, CastToPtr, 2, 2, OpGenericCastToPtr>;
+defm : DemangledNativeBuiltin<"__spirv_GenericCastToPtr_ToPrivate", OpenCL_std, CastToPtr, 2, 2, OpGenericCastToPtr>;
+defm : DemangledNativeBuiltin<"__spirv_GenericCastToPtrExplicit_ToGlobal", OpenCL_std, CastToPtr, 2, 2, OpGenericCastToPtr>;
+defm : DemangledNativeBuiltin<"__spirv_GenericCastToPtrExplicit_ToLocal", OpenCL_std, CastToPtr, 2, 2, OpGenericCastToPtr>;
+defm : DemangledNativeBuiltin<"__spirv_GenericCastToPtrExplicit_ToPrivate", OpenCL_std, CastToPtr, 2, 2, OpGenericCastToPtr>;
+
+// Cooperative Matrix builtin records:
+defm : DemangledNativeBuiltin<"__spirv_CooperativeMatrixLoadKHR", OpenCL_std, CoopMatr, 2, 0, OpCooperativeMatrixLoadKHR>;
+defm : DemangledNativeBuiltin<"__spirv_CooperativeMatrixStoreKHR", OpenCL_std, CoopMatr, 3, 0, OpCooperativeMatrixStoreKHR>;
+defm : DemangledNativeBuiltin<"__spirv_CooperativeMatrixMulAddKHR", OpenCL_std, CoopMatr, 3, 0, OpCooperativeMatrixMulAddKHR>;
+defm : DemangledNativeBuiltin<"__spirv_CooperativeMatrixLengthKHR", OpenCL_std, CoopMatr, 1, 1, OpCooperativeMatrixLengthKHR>;
+
//===----------------------------------------------------------------------===//
// Class defining a work/sub group builtin that should be translated into a
// SPIR-V instruction using the defined properties.
@@ -682,9 +766,17 @@ multiclass DemangledGroupBuiltin<string name, int level /* OnlyWork/OnlySub/...
}
}
+multiclass DemangledGroupBuiltinWrapper<string name, bits<8> minNumArgs, bits<8> maxNumArgs, Op operation> {
+ def : DemangledBuiltin<name, OpenCL_std, Group, minNumArgs, maxNumArgs>;
+ def : GroupBuiltin<name, operation>;
+}
+
defm : DemangledGroupBuiltin<"group_all", WorkOrSub, OpGroupAll>;
+defm : DemangledGroupBuiltinWrapper<"__spirv_GroupAll", 2, 2, OpGroupAll>;
defm : DemangledGroupBuiltin<"group_any", WorkOrSub, OpGroupAny>;
+defm : DemangledGroupBuiltinWrapper<"__spirv_GroupAny", 2, 2, OpGroupAny>;
defm : DemangledGroupBuiltin<"group_broadcast", WorkOrSub, OpGroupBroadcast>;
+defm : DemangledGroupBuiltinWrapper<"__spirv_GroupBroadcast", 3, 3, OpGroupBroadcast>;
defm : DemangledGroupBuiltin<"group_non_uniform_broadcast", OnlySub, OpGroupNonUniformBroadcast>;
defm : DemangledGroupBuiltin<"group_broadcast_first", OnlySub, OpGroupNonUniformBroadcastFirst>;
@@ -719,41 +811,49 @@ defm : DemangledGroupBuiltin<"group_scan_inclusive_adds", WorkOrSub, OpGroupIAdd
defm : DemangledGroupBuiltin<"group_reduce_addu", WorkOrSub, OpGroupIAdd>;
defm : DemangledGroupBuiltin<"group_scan_exclusive_addu", WorkOrSub, OpGroupIAdd>;
defm : DemangledGroupBuiltin<"group_scan_inclusive_addu", WorkOrSub, OpGroupIAdd>;
+defm : DemangledGroupBuiltinWrapper<"__spirv_GroupIAdd", 3, 3, OpGroupIAdd>;
defm : DemangledGroupBuiltin<"group_fadd", WorkOrSub, OpGroupFAdd>;
defm : DemangledGroupBuiltin<"group_reduce_addf", WorkOrSub, OpGroupFAdd>;
defm : DemangledGroupBuiltin<"group_scan_exclusive_addf", WorkOrSub, OpGroupFAdd>;
defm : DemangledGroupBuiltin<"group_scan_inclusive_addf", WorkOrSub, OpGroupFAdd>;
+defm : DemangledGroupBuiltinWrapper<"__spirv_GroupFAdd", 3, 3, OpGroupFAdd>;
defm : DemangledGroupBuiltin<"group_fmin", WorkOrSub, OpGroupFMin>;
defm : DemangledGroupBuiltin<"group_reduce_minf", WorkOrSub, OpGroupFMin>;
defm : DemangledGroupBuiltin<"group_scan_exclusive_minf", WorkOrSub, OpGroupFMin>;
defm : DemangledGroupBuiltin<"group_scan_inclusive_minf", WorkOrSub, OpGroupFMin>;
+defm : DemangledGroupBuiltinWrapper<"__spirv_GroupFMin", 3, 3, OpGroupFMin>;
defm : DemangledGroupBuiltin<"group_umin", WorkOrSub, OpGroupUMin>;
defm : DemangledGroupBuiltin<"group_reduce_minu", WorkOrSub, OpGroupUMin>;
defm : DemangledGroupBuiltin<"group_scan_exclusive_minu", WorkOrSub, OpGroupUMin>;
defm : DemangledGroupBuiltin<"group_scan_inclusive_minu", WorkOrSub, OpGroupUMin>;
+defm : DemangledGroupBuiltinWrapper<"__spirv_GroupUMin", 3, 3, OpGroupUMin>;
defm : DemangledGroupBuiltin<"group_smin", WorkOrSub, OpGroupSMin>;
defm : DemangledGroupBuiltin<"group_reduce_mins", WorkOrSub, OpGroupSMin>;
defm : DemangledGroupBuiltin<"group_scan_exclusive_mins", WorkOrSub, OpGroupSMin>;
defm : DemangledGroupBuiltin<"group_scan_inclusive_mins", WorkOrSub, OpGroupSMin>;
+defm : DemangledGroupBuiltinWrapper<"__spirv_GroupSMin", 3, 3, OpGroupSMin>;
defm : DemangledGroupBuiltin<"group_fmax", WorkOrSub, OpGroupFMax>;
defm : DemangledGroupBuiltin<"group_reduce_maxf", WorkOrSub, OpGroupFMax>;
defm : DemangledGroupBuiltin<"group_scan_exclusive_maxf", WorkOrSub, OpGroupFMax>;
defm : DemangledGroupBuiltin<"group_scan_inclusive_maxf", WorkOrSub, OpGroupFMax>;
+defm : DemangledGroupBuiltinWrapper<"__spirv_GroupFMax", 3, 3, OpGroupFMax>;
defm : DemangledGroupBuiltin<"group_umax", WorkOrSub, OpGroupUMax>;
defm : DemangledGroupBuiltin<"group_reduce_maxu", WorkOrSub, OpGroupUMax>;
defm : DemangledGroupBuiltin<"group_scan_exclusive_maxu", WorkOrSub, OpGroupUMax>;
defm : DemangledGroupBuiltin<"group_scan_inclusive_maxu", WorkOrSub, OpGroupUMax>;
+defm : DemangledGroupBuiltinWrapper<"__spirv_GroupUMax", 3, 3, OpGroupUMax>;
defm : DemangledGroupBuiltin<"group_smax", WorkOrSub, OpGroupSMax>;
defm : DemangledGroupBuiltin<"group_reduce_maxs", WorkOrSub, OpGroupSMax>;
defm : DemangledGroupBuiltin<"group_scan_exclusive_maxs", WorkOrSub, OpGroupSMax>;
defm : DemangledGroupBuiltin<"group_scan_inclusive_maxs", WorkOrSub, OpGroupSMax>;
+defm : DemangledGroupBuiltinWrapper<"__spirv_GroupSMax", 3, 3, OpGroupSMax>;
// cl_khr_subgroup_non_uniform_arithmetic
defm : DemangledGroupBuiltin<"group_non_uniform_iadd", WorkOrSub, OpGroupNonUniformIAdd>;
@@ -997,8 +1097,6 @@ multiclass DemangledAtomicFloatingBuiltin<string name, bits<8> minNumArgs, bits<
defm : DemangledAtomicFloatingBuiltin<"AddEXT", 4, 4, OpAtomicFAddEXT>;
defm : DemangledAtomicFloatingBuiltin<"MinEXT", 4, 4, OpAtomicFMinEXT>;
defm : DemangledAtomicFloatingBuiltin<"MaxEXT", 4, 4, OpAtomicFMaxEXT>;
-// TODO: add support for cl_ext_float_atomics to enable performing atomic operations
-// on floating-point numbers in memory (float arguments for atomic_fetch_add, ...)
//===----------------------------------------------------------------------===//
// Class defining a sub group builtin that should be translated into a
@@ -1407,7 +1505,7 @@ def : BuiltinType<"spirv.DeviceEvent", OpTypeDeviceEvent>;
def : BuiltinType<"spirv.Image", OpTypeImage>;
def : BuiltinType<"spirv.SampledImage", OpTypeSampledImage>;
def : BuiltinType<"spirv.Pipe", OpTypePipe>;
-
+def : BuiltinType<"spirv.CooperativeMatrixKHR", OpTypeCooperativeMatrixKHR>;
//===----------------------------------------------------------------------===//
// Class matching an OpenCL builtin type name to an equivalent SPIR-V
diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
index 75aa1823b11f..c7c244cfa897 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
@@ -66,6 +66,8 @@ static const std::map<std::string, SPIRV::Extension::Extension>
SPIRV::Extension::Extension::SPV_INTEL_function_pointers},
{"SPV_KHR_shader_clock",
SPIRV::Extension::Extension::SPV_KHR_shader_clock},
+ {"SPV_KHR_cooperative_matrix",
+ SPIRV::Extension::Extension::SPV_KHR_cooperative_matrix},
};
bool SPIRVExtensionsParser::parse(cl::Option &O, llvm::StringRef ArgName,
diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h
index 2ec3fb35ca04..a37e65a47eda 100644
--- a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h
+++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h
@@ -16,6 +16,7 @@
#include "MCTargetDesc/SPIRVBaseInfo.h"
#include "MCTargetDesc/SPIRVMCTargetDesc.h"
+#include "SPIRVUtils.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
@@ -51,152 +52,87 @@ public:
void addDep(DTSortableEntry *E) { Deps.push_back(E); }
};
-struct SpecialTypeDescriptor {
- enum SpecialTypeKind {
- STK_Empty = 0,
- STK_Image,
- STK_SampledImage,
- STK_Sampler,
- STK_Pipe,
- STK_DeviceEvent,
- STK_Pointer,
- STK_Last = -1
- };
- SpecialTypeKind Kind;
-
- unsigned Hash;
-
- SpecialTypeDescriptor() = delete;
- SpecialTypeDescriptor(SpecialTypeKind K) : Kind(K) { Hash = Kind; }
-
- unsigned getHash() const { return Hash; }
-
- virtual ~SpecialTypeDescriptor() {}
-};
-
-struct ImageTypeDescriptor : public SpecialTypeDescriptor {
- union ImageAttrs {
- struct BitFlags {
- unsigned Dim : 3;
- unsigned Depth : 2;
- unsigned Arrayed : 1;
- unsigned MS : 1;
- unsigned Sampled : 2;
- unsigned ImageFormat : 6;
- unsigned AQ : 2;
- } Flags;
- unsigned Val;
- };
-
- ImageTypeDescriptor(const Type *SampledTy, unsigned Dim, unsigned Depth,
- unsigned Arrayed, unsigned MS, unsigned Sampled,
- unsigned ImageFormat, unsigned AQ = 0)
- : SpecialTypeDescriptor(SpecialTypeKind::STK_Image) {
- ImageAttrs Attrs;
- Attrs.Val = 0;
- Attrs.Flags.Dim = Dim;
- Attrs.Flags.Depth = Depth;
- Attrs.Flags.Arrayed = Arrayed;
- Attrs.Flags.MS = MS;
- Attrs.Flags.Sampled = Sampled;
- Attrs.Flags.ImageFormat = ImageFormat;
- Attrs.Flags.AQ = AQ;
- Hash = (DenseMapInfo<Type *>().getHashValue(SampledTy) & 0xffff) ^
- ((Attrs.Val << 8) | Kind);
- }
-
- static bool classof(const SpecialTypeDescriptor *TD) {
- return TD->Kind == SpecialTypeKind::STK_Image;
- }
-};
-
-struct SampledImageTypeDescriptor : public SpecialTypeDescriptor {
- SampledImageTypeDescriptor(const Type *SampledTy, const MachineInstr *ImageTy)
- : SpecialTypeDescriptor(SpecialTypeKind::STK_SampledImage) {
- assert(ImageTy->getOpcode() == SPIRV::OpTypeImage);
- ImageTypeDescriptor TD(
- SampledTy, ImageTy->getOperand(2).getImm(),
- ImageTy->getOperand(3).getImm(), ImageTy->getOperand(4).getImm(),
- ImageTy->getOperand(5).getImm(), ImageTy->getOperand(6).getImm(),
- ImageTy->getOperand(7).getImm(), ImageTy->getOperand(8).getImm());
- Hash = TD.getHash() ^ Kind;
- }
-
- static bool classof(const SpecialTypeDescriptor *TD) {
- return TD->Kind == SpecialTypeKind::STK_SampledImage;
- }
-};
-
-struct SamplerTypeDescriptor : public SpecialTypeDescriptor {
- SamplerTypeDescriptor()
- : SpecialTypeDescriptor(SpecialTypeKind::STK_Sampler) {
- Hash = Kind;
- }
-
- static bool classof(const SpecialTypeDescriptor *TD) {
- return TD->Kind == SpecialTypeKind::STK_Sampler;
- }
+enum SpecialTypeKind {
+ STK_Empty = 0,
+ STK_Image,
+ STK_SampledImage,
+ STK_Sampler,
+ STK_Pipe,
+ STK_DeviceEvent,
+ STK_Pointer,
+ STK_Last = -1
};
-struct PipeTypeDescriptor : public SpecialTypeDescriptor {
-
- PipeTypeDescriptor(uint8_t AQ)
- : SpecialTypeDescriptor(SpecialTypeKind::STK_Pipe) {
- Hash = (AQ << 8) | Kind;
- }
-
- static bool classof(const SpecialTypeDescriptor *TD) {
- return TD->Kind == SpecialTypeKind::STK_Pipe;
+using SpecialTypeDescriptor = std::tuple<const Type *, unsigned, unsigned>;
+
+union ImageAttrs {
+ struct BitFlags {
+ unsigned Dim : 3;
+ unsigned Depth : 2;
+ unsigned Arrayed : 1;
+ unsigned MS : 1;
+ unsigned Sampled : 2;
+ unsigned ImageFormat : 6;
+ unsigned AQ : 2;
+ } Flags;
+ unsigned Val;
+
+ ImageAttrs(unsigned Dim, unsigned Depth, unsigned Arrayed, unsigned MS,
+ unsigned Sampled, unsigned ImageFormat, unsigned AQ = 0) {
+ Val = 0;
+ Flags.Dim = Dim;
+ Flags.Depth = Depth;
+ Flags.Arrayed = Arrayed;
+ Flags.MS = MS;
+ Flags.Sampled = Sampled;
+ Flags.ImageFormat = ImageFormat;
+ Flags.AQ = AQ;
}
};
-struct DeviceEventTypeDescriptor : public SpecialTypeDescriptor {
-
- DeviceEventTypeDescriptor()
- : SpecialTypeDescriptor(SpecialTypeKind::STK_DeviceEvent) {
- Hash = Kind;
- }
-
- static bool classof(const SpecialTypeDescriptor *TD) {
- return TD->Kind == SpecialTypeKind::STK_DeviceEvent;
- }
-};
-
-struct PointerTypeDescriptor : public SpecialTypeDescriptor {
- const Type *ElementType;
- unsigned AddressSpace;
-
- PointerTypeDescriptor() = delete;
- PointerTypeDescriptor(const Type *ElementType, unsigned AddressSpace)
- : SpecialTypeDescriptor(SpecialTypeKind::STK_Pointer),
- ElementType(ElementType), AddressSpace(AddressSpace) {
- Hash = (DenseMapInfo<Type *>().getHashValue(ElementType) & 0xffff) ^
- ((AddressSpace << 8) | Kind);
- }
-
- static bool classof(const SpecialTypeDescriptor *TD) {
- return TD->Kind == SpecialTypeKind::STK_Pointer;
- }
-};
+inline SpecialTypeDescriptor
+make_descr_image(const Type *SampledTy, unsigned Dim, unsigned Depth,
+ unsigned Arrayed, unsigned MS, unsigned Sampled,
+ unsigned ImageFormat, unsigned AQ = 0) {
+ return std::make_tuple(
+ SampledTy,
+ ImageAttrs(Dim, Depth, Arrayed, MS, Sampled, ImageFormat, AQ).Val,
+ SpecialTypeKind::STK_Image);
+}
+
+inline SpecialTypeDescriptor
+make_descr_sampled_image(const Type *SampledTy, const MachineInstr *ImageTy) {
+ assert(ImageTy->getOpcode() == SPIRV::OpTypeImage);
+ return std::make_tuple(
+ SampledTy,
+ ImageAttrs(
+ ImageTy->getOperand(2).getImm(), ImageTy->getOperand(3).getImm(),
+ ImageTy->getOperand(4).getImm(), ImageTy->getOperand(5).getImm(),
+ ImageTy->getOperand(6).getImm(), ImageTy->getOperand(7).getImm(),
+ ImageTy->getOperand(8).getImm())
+ .Val,
+ SpecialTypeKind::STK_SampledImage);
+}
+
+inline SpecialTypeDescriptor make_descr_sampler() {
+ return std::make_tuple(nullptr, 0U, SpecialTypeKind::STK_Sampler);
+}
+
+inline SpecialTypeDescriptor make_descr_pipe(uint8_t AQ) {
+ return std::make_tuple(nullptr, AQ, SpecialTypeKind::STK_Pipe);
+}
+
+inline SpecialTypeDescriptor make_descr_event() {
+ return std::make_tuple(nullptr, 0U, SpecialTypeKind::STK_DeviceEvent);
+}
+
+inline SpecialTypeDescriptor make_descr_pointee(const Type *ElementType,
+ unsigned AddressSpace) {
+ return std::make_tuple(ElementType, AddressSpace,
+ SpecialTypeKind::STK_Pointer);
+}
} // namespace SPIRV
-template <> struct DenseMapInfo<SPIRV::SpecialTypeDescriptor> {
- static inline SPIRV::SpecialTypeDescriptor getEmptyKey() {
- return SPIRV::SpecialTypeDescriptor(
- SPIRV::SpecialTypeDescriptor::STK_Empty);
- }
- static inline SPIRV::SpecialTypeDescriptor getTombstoneKey() {
- return SPIRV::SpecialTypeDescriptor(SPIRV::SpecialTypeDescriptor::STK_Last);
- }
- static unsigned getHashValue(SPIRV::SpecialTypeDescriptor Val) {
- return Val.getHash();
- }
- static bool isEqual(SPIRV::SpecialTypeDescriptor LHS,
- SPIRV::SpecialTypeDescriptor RHS) {
- return getHashValue(LHS) == getHashValue(RHS);
- }
-};
-
template <typename KeyTy> class SPIRVDuplicatesTrackerBase {
public:
// NOTE: using MapVector instead of DenseMap helps getting everything ordered
@@ -282,12 +218,12 @@ public:
MachineModuleInfo *MMI);
void add(const Type *Ty, const MachineFunction *MF, Register R) {
- TT.add(Ty, MF, R);
+ TT.add(unifyPtrType(Ty), MF, R);
}
- void add(const Type *PointerElementType, unsigned AddressSpace,
+ void add(const Type *PointeeTy, unsigned AddressSpace,
const MachineFunction *MF, Register R) {
- ST.add(SPIRV::PointerTypeDescriptor(PointerElementType, AddressSpace), MF,
+ ST.add(SPIRV::make_descr_pointee(unifyPtrType(PointeeTy), AddressSpace), MF,
R);
}
@@ -317,13 +253,13 @@ public:
}
Register find(const Type *Ty, const MachineFunction *MF) {
- return TT.find(const_cast<Type *>(Ty), MF);
+ return TT.find(unifyPtrType(Ty), MF);
}
- Register find(const Type *PointerElementType, unsigned AddressSpace,
+ Register find(const Type *PointeeTy, unsigned AddressSpace,
const MachineFunction *MF) {
return ST.find(
- SPIRV::PointerTypeDescriptor(PointerElementType, AddressSpace), MF);
+ SPIRV::make_descr_pointee(unifyPtrType(PointeeTy), AddressSpace), MF);
}
Register find(const Constant *C, const MachineFunction *MF) {
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index 7b8e3230bf55..dd5884096b85 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -69,7 +69,7 @@ class SPIRVEmitIntrinsics
DenseSet<Instruction *> AggrStores;
// deduce element type of untyped pointers
- Type *deduceElementType(Value *I);
+ Type *deduceElementType(Value *I, bool UnknownElemTypeI8);
Type *deduceElementTypeHelper(Value *I);
Type *deduceElementTypeHelper(Value *I, std::unordered_set<Value *> &Visited);
Type *deduceElementTypeByValueDeep(Type *ValueTy, Value *Operand,
@@ -105,7 +105,8 @@ class SPIRVEmitIntrinsics
void replaceMemInstrUses(Instruction *Old, Instruction *New, IRBuilder<> &B);
void processInstrAfterVisit(Instruction *I, IRBuilder<> &B);
- void insertAssignPtrTypeIntrs(Instruction *I, IRBuilder<> &B);
+ bool insertAssignPtrTypeIntrs(Instruction *I, IRBuilder<> &B,
+ bool UnknownElemTypeI8);
void insertAssignTypeIntrs(Instruction *I, IRBuilder<> &B);
void insertAssignPtrTypeTargetExt(TargetExtType *AssignedType, Value *V,
IRBuilder<> &B);
@@ -367,6 +368,26 @@ Type *SPIRVEmitIntrinsics::deduceElementTypeHelper(
if (Ty)
break;
}
+ } else if (auto *CI = dyn_cast<CallInst>(I)) {
+ static StringMap<unsigned> ResTypeByArg = {
+ {"to_global", 0},
+ {"to_local", 0},
+ {"to_private", 0},
+ {"__spirv_GenericCastToPtr_ToGlobal", 0},
+ {"__spirv_GenericCastToPtr_ToLocal", 0},
+ {"__spirv_GenericCastToPtr_ToPrivate", 0},
+ {"__spirv_GenericCastToPtrExplicit_ToGlobal", 0},
+ {"__spirv_GenericCastToPtrExplicit_ToLocal", 0},
+ {"__spirv_GenericCastToPtrExplicit_ToPrivate", 0}};
+ // TODO: maybe improve performance by caching demangled names
+ if (Function *CalledF = CI->getCalledFunction()) {
+ std::string DemangledName =
+ getOclOrSpirvBuiltinDemangledName(CalledF->getName());
+ auto AsArgIt = ResTypeByArg.find(DemangledName);
+ if (AsArgIt != ResTypeByArg.end())
+ Ty = deduceElementTypeHelper(CI->getArgOperand(AsArgIt->second),
+ Visited);
+ }
}
// remember the found relationship
@@ -460,10 +481,10 @@ Type *SPIRVEmitIntrinsics::deduceNestedTypeHelper(
return OrigTy;
}
-Type *SPIRVEmitIntrinsics::deduceElementType(Value *I) {
+Type *SPIRVEmitIntrinsics::deduceElementType(Value *I, bool UnknownElemTypeI8) {
if (Type *Ty = deduceElementTypeHelper(I))
return Ty;
- return IntegerType::getInt8Ty(I->getContext());
+ return UnknownElemTypeI8 ? IntegerType::getInt8Ty(I->getContext()) : nullptr;
}
// If the Instruction has Pointer operands with unresolved types, this function
@@ -652,10 +673,8 @@ void SPIRVEmitIntrinsics::preprocessCompositeConstants(IRBuilder<> &B) {
AggrConst = cast<Constant>(COp);
ResTy = B.getInt32Ty();
} else if (auto *COp = dyn_cast<ConstantAggregateZero>(Op)) {
- if (!Op->getType()->isVectorTy()) {
- AggrConst = cast<Constant>(COp);
- ResTy = B.getInt32Ty();
- }
+ AggrConst = cast<Constant>(COp);
+ ResTy = Op->getType()->isVectorTy() ? COp->getType() : B.getInt32Ty();
}
if (AggrConst) {
SmallVector<Value *> Args;
@@ -1152,16 +1171,23 @@ void SPIRVEmitIntrinsics::processGlobalValue(GlobalVariable &GV,
B.CreateIntrinsic(Intrinsic::spv_unref_global, GV.getType(), &GV);
}
-void SPIRVEmitIntrinsics::insertAssignPtrTypeIntrs(Instruction *I,
- IRBuilder<> &B) {
+// Return true, if we can't decide what is the pointee type now and will get
+// back to the question later. Return false is spv_assign_ptr_type is not needed
+// or can be inserted immediately.
+bool SPIRVEmitIntrinsics::insertAssignPtrTypeIntrs(Instruction *I,
+ IRBuilder<> &B,
+ bool UnknownElemTypeI8) {
reportFatalOnTokenType(I);
if (!isPointerTy(I->getType()) || !requireAssignType(I) ||
isa<BitCastInst>(I))
- return;
+ return false;
setInsertPointAfterDef(B, I);
- Type *ElemTy = deduceElementType(I);
- buildAssignPtr(B, ElemTy, I);
+ if (Type *ElemTy = deduceElementType(I, UnknownElemTypeI8)) {
+ buildAssignPtr(B, ElemTy, I);
+ return false;
+ }
+ return true;
}
void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I,
@@ -1199,7 +1225,7 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I,
buildAssignPtr(B, PType->getElementType(), Op);
} else if (isPointerTy(OpTy)) {
Type *ElemTy = GR->findDeducedElementType(Op);
- buildAssignPtr(B, ElemTy ? ElemTy : deduceElementType(Op), Op);
+ buildAssignPtr(B, ElemTy ? ElemTy : deduceElementType(Op, true), Op);
} else {
CallInst *AssignCI = buildIntrWithMD(Intrinsic::spv_assign_type,
{OpTy}, Op, Op, {}, B);
@@ -1235,8 +1261,7 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I,
}
bool IsPhi = isa<PHINode>(I), BPrepared = false;
for (const auto &Op : I->operands()) {
- if ((isa<ConstantAggregateZero>(Op) && Op->getType()->isVectorTy()) ||
- isa<PHINode>(I) || isa<SwitchInst>(I))
+ if (isa<PHINode>(I) || isa<SwitchInst>(I))
TrackConstants = false;
if ((isa<ConstantData>(Op) || isa<ConstantExpr>(Op)) && TrackConstants) {
unsigned OpNo = Op.getOperandNo();
@@ -1395,10 +1420,15 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) {
if (isConvergenceIntrinsic(I))
continue;
- insertAssignPtrTypeIntrs(I, B);
+ bool Postpone = insertAssignPtrTypeIntrs(I, B, false);
+ // if Postpone is true, we can't decide on pointee type yet
insertAssignTypeIntrs(I, B);
insertPtrCastOrAssignTypeInstr(I, B);
insertSpirvDecorations(I, B);
+ // if instruction requires a pointee type set, let's check if we know it
+ // already, and force it to be i8 if not
+ if (Postpone && !GR->findAssignPtrTypeInstr(I))
+ insertAssignPtrTypeIntrs(I, B, true);
}
for (auto &I : instructions(Func))
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index d434e0b5efbc..5558c7a5a4a5 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -394,7 +394,7 @@ Register SPIRVGlobalRegistry::getOrCreateCompositeOrNull(
Constant *Val, MachineInstr &I, SPIRVType *SpvType,
const SPIRVInstrInfo &TII, Constant *CA, unsigned BitWidth,
unsigned ElemCnt, bool ZeroAsNull) {
- // Find a constant vector in DT or build a new one.
+ // Find a constant vector or array in DT or build a new one.
Register Res = DT.find(CA, CurMF);
// If no values are attached, the composite is null constant.
bool IsNull = Val->isNullValue() && ZeroAsNull;
@@ -474,20 +474,28 @@ Register SPIRVGlobalRegistry::getOrCreateConstVector(APFloat Val,
ZeroAsNull);
}
-Register
-SPIRVGlobalRegistry::getOrCreateConsIntArray(uint64_t Val, MachineInstr &I,
- SPIRVType *SpvType,
- const SPIRVInstrInfo &TII) {
+Register SPIRVGlobalRegistry::getOrCreateConstIntArray(
+ uint64_t Val, size_t Num, MachineInstr &I, SPIRVType *SpvType,
+ const SPIRVInstrInfo &TII) {
const Type *LLVMTy = getTypeForSPIRVType(SpvType);
assert(LLVMTy->isArrayTy());
const ArrayType *LLVMArrTy = cast<ArrayType>(LLVMTy);
Type *LLVMBaseTy = LLVMArrTy->getElementType();
- auto *ConstInt = ConstantInt::get(LLVMBaseTy, Val);
- auto *ConstArr =
- ConstantArray::get(const_cast<ArrayType *>(LLVMArrTy), {ConstInt});
+ Constant *CI = ConstantInt::get(LLVMBaseTy, Val);
SPIRVType *SpvBaseTy = getSPIRVTypeForVReg(SpvType->getOperand(1).getReg());
unsigned BW = getScalarOrVectorBitWidth(SpvBaseTy);
- return getOrCreateCompositeOrNull(ConstInt, I, SpvType, TII, ConstArr, BW,
+ // The following is reasonably unique key that is better that [Val]. The naive
+ // alternative would be something along the lines of:
+ // SmallVector<Constant *> NumCI(Num, CI);
+ // Constant *UniqueKey =
+ // ConstantArray::get(const_cast<ArrayType*>(LLVMArrTy), NumCI);
+ // that would be a truly unique but dangerous key, because it could lead to
+ // the creation of constants of arbitrary length (that is, the parameter of
+ // memset) which were missing in the original module.
+ Constant *UniqueKey = ConstantStruct::getAnon(
+ {PoisonValue::get(const_cast<ArrayType *>(LLVMArrTy)),
+ ConstantInt::get(LLVMBaseTy, Val), ConstantInt::get(LLVMBaseTy, Num)});
+ return getOrCreateCompositeOrNull(CI, I, SpvType, TII, UniqueKey, BW,
LLVMArrTy->getNumElements());
}
@@ -546,24 +554,6 @@ SPIRVGlobalRegistry::getOrCreateConsIntVector(uint64_t Val,
}
Register
-SPIRVGlobalRegistry::getOrCreateConsIntArray(uint64_t Val,
- MachineIRBuilder &MIRBuilder,
- SPIRVType *SpvType, bool EmitIR) {
- const Type *LLVMTy = getTypeForSPIRVType(SpvType);
- assert(LLVMTy->isArrayTy());
- const ArrayType *LLVMArrTy = cast<ArrayType>(LLVMTy);
- Type *LLVMBaseTy = LLVMArrTy->getElementType();
- const auto ConstInt = ConstantInt::get(LLVMBaseTy, Val);
- auto ConstArr =
- ConstantArray::get(const_cast<ArrayType *>(LLVMArrTy), {ConstInt});
- SPIRVType *SpvBaseTy = getSPIRVTypeForVReg(SpvType->getOperand(1).getReg());
- unsigned BW = getScalarOrVectorBitWidth(SpvBaseTy);
- return getOrCreateIntCompositeOrNull(Val, MIRBuilder, SpvType, EmitIR,
- ConstArr, BW,
- LLVMArrTy->getNumElements());
-}
-
-Register
SPIRVGlobalRegistry::getOrCreateConstNullPtr(MachineIRBuilder &MIRBuilder,
SPIRVType *SpvType) {
const Type *LLVMTy = getTypeForSPIRVType(SpvType);
@@ -936,7 +926,7 @@ SPIRVType *SPIRVGlobalRegistry::restOfCreateSPIRVType(
SPIRVType *SpirvType = createSPIRVType(Ty, MIRBuilder, AccessQual, EmitIR);
TypesInProcessing.erase(Ty);
VRegToTypeMap[&MIRBuilder.getMF()][getSPIRVTypeID(SpirvType)] = SpirvType;
- SPIRVToLLVMType[SpirvType] = Ty;
+ SPIRVToLLVMType[SpirvType] = unifyPtrType(Ty);
Register Reg = DT.find(Ty, &MIRBuilder.getMF());
// Do not add OpTypeForwardPointer to DT, a corresponding normal pointer type
// will be added later. For special types it is already added to DT.
@@ -1080,12 +1070,14 @@ bool SPIRVGlobalRegistry::isScalarOrVectorSigned(const SPIRVType *Type) const {
return IntType && IntType->getOperand(2).getImm() != 0;
}
+SPIRVType *SPIRVGlobalRegistry::getPointeeType(SPIRVType *PtrType) {
+ return PtrType && PtrType->getOpcode() == SPIRV::OpTypePointer
+ ? getSPIRVTypeForVReg(PtrType->getOperand(2).getReg())
+ : nullptr;
+}
+
unsigned SPIRVGlobalRegistry::getPointeeTypeOp(Register PtrReg) {
- SPIRVType *PtrType = getSPIRVTypeForVReg(PtrReg);
- SPIRVType *ElemType =
- PtrType && PtrType->getOpcode() == SPIRV::OpTypePointer
- ? getSPIRVTypeForVReg(PtrType->getOperand(2).getReg())
- : nullptr;
+ SPIRVType *ElemType = getPointeeType(getSPIRVTypeForVReg(PtrReg));
return ElemType ? ElemType->getOpcode() : 0;
}
@@ -1122,9 +1114,9 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeImage(
uint32_t Depth, uint32_t Arrayed, uint32_t Multisampled, uint32_t Sampled,
SPIRV::ImageFormat::ImageFormat ImageFormat,
SPIRV::AccessQualifier::AccessQualifier AccessQual) {
- SPIRV::ImageTypeDescriptor TD(SPIRVToLLVMType.lookup(SampledType), Dim, Depth,
- Arrayed, Multisampled, Sampled, ImageFormat,
- AccessQual);
+ auto TD = SPIRV::make_descr_image(SPIRVToLLVMType.lookup(SampledType), Dim,
+ Depth, Arrayed, Multisampled, Sampled,
+ ImageFormat, AccessQual);
if (auto *Res = checkSpecialInstr(TD, MIRBuilder))
return Res;
Register ResVReg = createTypeVReg(MIRBuilder);
@@ -1143,7 +1135,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeImage(
SPIRVType *
SPIRVGlobalRegistry::getOrCreateOpTypeSampler(MachineIRBuilder &MIRBuilder) {
- SPIRV::SamplerTypeDescriptor TD;
+ auto TD = SPIRV::make_descr_sampler();
if (auto *Res = checkSpecialInstr(TD, MIRBuilder))
return Res;
Register ResVReg = createTypeVReg(MIRBuilder);
@@ -1154,7 +1146,7 @@ SPIRVGlobalRegistry::getOrCreateOpTypeSampler(MachineIRBuilder &MIRBuilder) {
SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypePipe(
MachineIRBuilder &MIRBuilder,
SPIRV::AccessQualifier::AccessQualifier AccessQual) {
- SPIRV::PipeTypeDescriptor TD(AccessQual);
+ auto TD = SPIRV::make_descr_pipe(AccessQual);
if (auto *Res = checkSpecialInstr(TD, MIRBuilder))
return Res;
Register ResVReg = createTypeVReg(MIRBuilder);
@@ -1166,7 +1158,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypePipe(
SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeDeviceEvent(
MachineIRBuilder &MIRBuilder) {
- SPIRV::DeviceEventTypeDescriptor TD;
+ auto TD = SPIRV::make_descr_event();
if (auto *Res = checkSpecialInstr(TD, MIRBuilder))
return Res;
Register ResVReg = createTypeVReg(MIRBuilder);
@@ -1176,7 +1168,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeDeviceEvent(
SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeSampledImage(
SPIRVType *ImageType, MachineIRBuilder &MIRBuilder) {
- SPIRV::SampledImageTypeDescriptor TD(
+ auto TD = SPIRV::make_descr_sampled_image(
SPIRVToLLVMType.lookup(MIRBuilder.getMF().getRegInfo().getVRegDef(
ImageType->getOperand(1).getReg())),
ImageType);
@@ -1189,6 +1181,26 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeSampledImage(
.addUse(getSPIRVTypeID(ImageType));
}
+SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeCoopMatr(
+ MachineIRBuilder &MIRBuilder, const TargetExtType *ExtensionType,
+ const SPIRVType *ElemType, uint32_t Scope, uint32_t Rows, uint32_t Columns,
+ uint32_t Use) {
+ Register ResVReg = DT.find(ExtensionType, &MIRBuilder.getMF());
+ if (ResVReg.isValid())
+ return MIRBuilder.getMF().getRegInfo().getUniqueVRegDef(ResVReg);
+ ResVReg = createTypeVReg(MIRBuilder);
+ SPIRVType *SpirvTy =
+ MIRBuilder.buildInstr(SPIRV::OpTypeCooperativeMatrixKHR)
+ .addDef(ResVReg)
+ .addUse(getSPIRVTypeID(ElemType))
+ .addUse(buildConstantInt(Scope, MIRBuilder, nullptr, true))
+ .addUse(buildConstantInt(Rows, MIRBuilder, nullptr, true))
+ .addUse(buildConstantInt(Columns, MIRBuilder, nullptr, true))
+ .addUse(buildConstantInt(Use, MIRBuilder, nullptr, true));
+ DT.add(ExtensionType, &MIRBuilder.getMF(), ResVReg);
+ return SpirvTy;
+}
+
SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeByOpcode(
const Type *Ty, MachineIRBuilder &MIRBuilder, unsigned Opcode) {
Register ResVReg = DT.find(Ty, &MIRBuilder.getMF());
@@ -1268,7 +1280,7 @@ SPIRVType *SPIRVGlobalRegistry::finishCreatingSPIRVType(const Type *LLVMTy,
SPIRVType *SpirvType) {
assert(CurMF == SpirvType->getMF());
VRegToTypeMap[CurMF][getSPIRVTypeID(SpirvType)] = SpirvType;
- SPIRVToLLVMType[SpirvType] = LLVMTy;
+ SPIRVToLLVMType[SpirvType] = unifyPtrType(LLVMTy);
return SpirvType;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index db01f68f48de..a45e1ccd0717 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -292,6 +292,8 @@ public:
return Res->second;
}
+ // Return a pointee's type, or nullptr otherwise.
+ SPIRVType *getPointeeType(SPIRVType *PtrType);
// Return a pointee's type op code, or 0 otherwise.
unsigned getPointeeTypeOp(Register PtrReg);
@@ -327,6 +329,12 @@ public:
return Ret;
}
+ // Return true if the type is an aggregate type.
+ bool isAggregateType(SPIRVType *Type) const {
+ return Type && (Type->getOpcode() == SPIRV::OpTypeStruct &&
+ Type->getOpcode() == SPIRV::OpTypeArray);
+ }
+
// Whether the given VReg has an OpTypeXXX instruction mapped to it with the
// given opcode (e.g. OpTypeFloat).
bool isScalarOfType(Register VReg, unsigned TypeOpcode) const;
@@ -449,13 +457,11 @@ public:
Register getOrCreateConstVector(APFloat Val, MachineInstr &I,
SPIRVType *SpvType, const SPIRVInstrInfo &TII,
bool ZeroAsNull = true);
- Register getOrCreateConsIntArray(uint64_t Val, MachineInstr &I,
- SPIRVType *SpvType,
- const SPIRVInstrInfo &TII);
+ Register getOrCreateConstIntArray(uint64_t Val, size_t Num, MachineInstr &I,
+ SPIRVType *SpvType,
+ const SPIRVInstrInfo &TII);
Register getOrCreateConsIntVector(uint64_t Val, MachineIRBuilder &MIRBuilder,
SPIRVType *SpvType, bool EmitIR = true);
- Register getOrCreateConsIntArray(uint64_t Val, MachineIRBuilder &MIRBuilder,
- SPIRVType *SpvType, bool EmitIR = true);
Register getOrCreateConstNullPtr(MachineIRBuilder &MIRBuilder,
SPIRVType *SpvType);
Register buildConstantSampler(Register Res, unsigned AddrMode, unsigned Param,
@@ -514,7 +520,11 @@ public:
SPIRVType *getOrCreateOpTypeSampledImage(SPIRVType *ImageType,
MachineIRBuilder &MIRBuilder);
-
+ SPIRVType *getOrCreateOpTypeCoopMatr(MachineIRBuilder &MIRBuilder,
+ const TargetExtType *ExtensionType,
+ const SPIRVType *ElemType,
+ uint32_t Scope, uint32_t Rows,
+ uint32_t Columns, uint32_t Use);
SPIRVType *
getOrCreateOpTypePipe(MachineIRBuilder &MIRBuilder,
SPIRV::AccessQualifier::AccessQualifier AccQual);
diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
index 5ccbaf12ddee..4383d1c5c0e2 100644
--- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
@@ -339,6 +339,7 @@ void SPIRVTargetLowering::finalizeLowering(MachineFunction &MF) const {
GR.getSPIRVTypeForVReg(MI.getOperand(1).getReg()));
break;
case SPIRV::OpPtrCastToGeneric:
+ case SPIRV::OpGenericCastToPtr:
validateAccessChain(STI, MRI, GR, MI);
break;
case SPIRV::OpInBoundsPtrAccessChain:
diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
index 6fc200abf462..77356b7512a7 100644
--- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
+++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
@@ -68,6 +68,11 @@ public:
// extra instructions required to preserve validity of SPIR-V code imposed by
// the standard.
void finalizeLowering(MachineFunction &MF) const override;
+
+ MVT getPreferredSwitchConditionType(LLVMContext &Context,
+ EVT ConditionVT) const override {
+ return ConditionVT.getSimpleVT();
+ }
};
} // namespace llvm
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index dedfd5e6e32d..63549b06e967 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -211,6 +211,9 @@ def OpTypeAccelerationStructureNV: Op<5341, (outs TYPE:$res), (ins),
def OpTypeCooperativeMatrixNV: Op<5358, (outs TYPE:$res),
(ins TYPE:$compType, ID:$scope, ID:$rows, ID:$cols),
"$res = OpTypeCooperativeMatrixNV $compType $scope $rows $cols">;
+def OpTypeCooperativeMatrixKHR: Op<4456, (outs TYPE:$res),
+ (ins TYPE:$compType, ID:$scope, ID:$rows, ID:$cols, ID:$use),
+ "$res = OpTypeCooperativeMatrixKHR $compType $scope $rows $cols $use">;
// 3.42.7 Constant-Creation Instructions
@@ -864,3 +867,16 @@ def OpAsmINTEL: Op<5610, (outs ID:$res), (ins TYPE:$type, TYPE:$asm_type, ID:$ta
"$res = OpAsmINTEL $type $asm_type $target $asm">;
def OpAsmCallINTEL: Op<5611, (outs ID:$res), (ins TYPE:$type, ID:$asm, variable_ops),
"$res = OpAsmCallINTEL $type $asm">;
+
+// SPV_KHR_cooperative_matrix
+def OpCooperativeMatrixLoadKHR: Op<4457, (outs ID:$res),
+ (ins TYPE:$resType, ID:$pointer, ID:$memory_layout, variable_ops),
+ "$res = OpCooperativeMatrixLoadKHR $resType $pointer $memory_layout">;
+def OpCooperativeMatrixStoreKHR: Op<4458, (outs),
+ (ins ID:$pointer, ID:$objectToStore, ID:$memory_layout, variable_ops),
+ "OpCooperativeMatrixStoreKHR $pointer $objectToStore $memory_layout">;
+def OpCooperativeMatrixMulAddKHR: Op<4459, (outs ID:$res),
+ (ins TYPE:$type, ID:$A, ID:$B, ID:$C, variable_ops),
+ "$res = OpCooperativeMatrixMulAddKHR $type $A $B $C">;
+def OpCooperativeMatrixLengthKHR: Op<4460, (outs ID:$res), (ins TYPE:$type, ID:$coop_matr_type),
+ "$res = OpCooperativeMatrixLengthKHR $type $coop_matr_type">;
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index db83172f7fa9..9be736ce88ce 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -173,6 +173,9 @@ private:
bool selectFmix(Register ResVReg, const SPIRVType *ResType,
MachineInstr &I) const;
+ bool selectRsqrt(Register ResVReg, const SPIRVType *ResType,
+ MachineInstr &I) const;
+
void renderImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
int OpIdx) const;
void renderFImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
@@ -469,6 +472,18 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg,
return selectExtInst(ResVReg, ResType, I, CL::sin, GL::Sin);
case TargetOpcode::G_FTAN:
return selectExtInst(ResVReg, ResType, I, CL::tan, GL::Tan);
+ case TargetOpcode::G_FACOS:
+ return selectExtInst(ResVReg, ResType, I, CL::acos, GL::Acos);
+ case TargetOpcode::G_FASIN:
+ return selectExtInst(ResVReg, ResType, I, CL::asin, GL::Asin);
+ case TargetOpcode::G_FATAN:
+ return selectExtInst(ResVReg, ResType, I, CL::atan, GL::Atan);
+ case TargetOpcode::G_FCOSH:
+ return selectExtInst(ResVReg, ResType, I, CL::cosh, GL::Cosh);
+ case TargetOpcode::G_FSINH:
+ return selectExtInst(ResVReg, ResType, I, CL::sinh, GL::Sinh);
+ case TargetOpcode::G_FTANH:
+ return selectExtInst(ResVReg, ResType, I, CL::tanh, GL::Tanh);
case TargetOpcode::G_FSQRT:
return selectExtInst(ResVReg, ResType, I, CL::sqrt, GL::Sqrt);
@@ -831,7 +846,7 @@ bool SPIRVInstructionSelector::selectMemOperation(Register ResVReg,
unsigned Num = getIConstVal(I.getOperand(2).getReg(), MRI);
SPIRVType *ValTy = GR.getOrCreateSPIRVIntegerType(8, I, TII);
SPIRVType *ArrTy = GR.getOrCreateSPIRVArrayType(ValTy, Num, I, TII);
- Register Const = GR.getOrCreateConsIntArray(Val, I, ArrTy, TII);
+ Register Const = GR.getOrCreateConstIntArray(Val, Num, I, ArrTy, TII);
SPIRVType *VarTy = GR.getOrCreateSPIRVPointerType(
ArrTy, I, TII, SPIRV::StorageClass::UniformConstant);
// TODO: check if we have such GV, add init, use buildGlobalVariable.
@@ -1102,7 +1117,7 @@ bool SPIRVInstructionSelector::selectAddrSpaceCast(Register ResVReg,
if (isGenericCastablePtr(SrcSC) && isGenericCastablePtr(DstSC)) {
Register Tmp = MRI->createVirtualRegister(&SPIRV::IDRegClass);
SPIRVType *GenericPtrTy = GR.getOrCreateSPIRVPointerType(
- SrcPtrTy, I, TII, SPIRV::StorageClass::Generic);
+ GR.getPointeeType(SrcPtrTy), I, TII, SPIRV::StorageClass::Generic);
MachineBasicBlock &BB = *I.getParent();
const DebugLoc &DL = I.getDebugLoc();
bool Success = BuildMI(BB, I, DL, TII.get(SPIRV::OpPtrCastToGeneric))
@@ -1315,6 +1330,23 @@ bool SPIRVInstructionSelector::selectFmix(Register ResVReg,
.constrainAllUses(TII, TRI, RBI);
}
+bool SPIRVInstructionSelector::selectRsqrt(Register ResVReg,
+ const SPIRVType *ResType,
+ MachineInstr &I) const {
+
+ assert(I.getNumOperands() == 3);
+ assert(I.getOperand(2).isReg());
+ MachineBasicBlock &BB = *I.getParent();
+
+ return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpExtInst))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addImm(static_cast<uint32_t>(SPIRV::InstructionSet::GLSL_std_450))
+ .addImm(GL::InverseSqrt)
+ .addUse(I.getOperand(2).getReg())
+ .constrainAllUses(TII, TRI, RBI);
+}
+
bool SPIRVInstructionSelector::selectBitreverse(Register ResVReg,
const SPIRVType *ResType,
MachineInstr &I) const {
@@ -1413,20 +1445,50 @@ static unsigned getArrayComponentCount(MachineRegisterInfo *MRI,
}
// Return true if the type represents a constant register
-static bool isConstReg(MachineRegisterInfo *MRI, SPIRVType *OpDef) {
+static bool isConstReg(MachineRegisterInfo *MRI, SPIRVType *OpDef,
+ SmallPtrSet<SPIRVType *, 4> &Visited) {
if (OpDef->getOpcode() == SPIRV::ASSIGN_TYPE &&
OpDef->getOperand(1).isReg()) {
if (SPIRVType *RefDef = MRI->getVRegDef(OpDef->getOperand(1).getReg()))
OpDef = RefDef;
}
- return OpDef->getOpcode() == TargetOpcode::G_CONSTANT ||
- OpDef->getOpcode() == TargetOpcode::G_FCONSTANT;
+
+ if (Visited.contains(OpDef))
+ return true;
+ Visited.insert(OpDef);
+
+ unsigned Opcode = OpDef->getOpcode();
+ switch (Opcode) {
+ case TargetOpcode::G_CONSTANT:
+ case TargetOpcode::G_FCONSTANT:
+ return true;
+ case TargetOpcode::G_INTRINSIC:
+ case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
+ case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
+ return cast<GIntrinsic>(*OpDef).getIntrinsicID() ==
+ Intrinsic::spv_const_composite;
+ case TargetOpcode::G_BUILD_VECTOR:
+ case TargetOpcode::G_SPLAT_VECTOR: {
+ for (unsigned i = OpDef->getNumExplicitDefs(); i < OpDef->getNumOperands();
+ i++) {
+ SPIRVType *OpNestedDef =
+ OpDef->getOperand(i).isReg()
+ ? MRI->getVRegDef(OpDef->getOperand(i).getReg())
+ : nullptr;
+ if (OpNestedDef && !isConstReg(MRI, OpNestedDef, Visited))
+ return false;
+ }
+ return true;
+ }
+ }
+ return false;
}
// Return true if the virtual register represents a constant
static bool isConstReg(MachineRegisterInfo *MRI, Register OpReg) {
+ SmallPtrSet<SPIRVType *, 4> Visited;
if (SPIRVType *OpDef = MRI->getVRegDef(OpReg))
- return isConstReg(MRI, OpDef);
+ return isConstReg(MRI, OpDef, Visited);
return false;
}
@@ -1740,15 +1802,18 @@ bool SPIRVInstructionSelector::selectOpUndef(Register ResVReg,
static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI) {
assert(MO.isReg());
const SPIRVType *TypeInst = MRI->getVRegDef(MO.getReg());
- if (TypeInst->getOpcode() != SPIRV::ASSIGN_TYPE)
- return false;
- assert(TypeInst->getOperand(1).isReg());
- MachineInstr *ImmInst = MRI->getVRegDef(TypeInst->getOperand(1).getReg());
- return ImmInst->getOpcode() == TargetOpcode::G_CONSTANT;
+ if (TypeInst->getOpcode() == SPIRV::ASSIGN_TYPE) {
+ assert(TypeInst->getOperand(1).isReg());
+ MachineInstr *ImmInst = MRI->getVRegDef(TypeInst->getOperand(1).getReg());
+ return ImmInst->getOpcode() == TargetOpcode::G_CONSTANT;
+ }
+ return TypeInst->getOpcode() == SPIRV::OpConstantI;
}
static int64_t foldImm(const MachineOperand &MO, MachineRegisterInfo *MRI) {
const SPIRVType *TypeInst = MRI->getVRegDef(MO.getReg());
+ if (TypeInst->getOpcode() == SPIRV::OpConstantI)
+ return TypeInst->getOperand(2).getImm();
MachineInstr *ImmInst = MRI->getVRegDef(TypeInst->getOperand(1).getReg());
assert(ImmInst->getOpcode() == TargetOpcode::G_CONSTANT);
return ImmInst->getOperand(1).getCImm()->getZExtValue();
@@ -1850,8 +1915,10 @@ bool SPIRVInstructionSelector::wrapIntoSpecConstantOp(
Register OpReg = I.getOperand(i).getReg();
SPIRVType *OpDefine = MRI->getVRegDef(OpReg);
SPIRVType *OpType = GR.getSPIRVTypeForVReg(OpReg);
- if (!OpDefine || !OpType || isConstReg(MRI, OpDefine) ||
- OpDefine->getOpcode() == TargetOpcode::G_ADDRSPACE_CAST) {
+ SmallPtrSet<SPIRVType *, 4> Visited;
+ if (!OpDefine || !OpType || isConstReg(MRI, OpDefine, Visited) ||
+ OpDefine->getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
+ GR.isAggregateType(OpType)) {
// The case of G_ADDRSPACE_CAST inside spv_const_composite() is processed
// by selectAddrSpaceCast()
CompositeArgs.push_back(OpReg);
@@ -1992,6 +2059,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
return selectAny(ResVReg, ResType, I);
case Intrinsic::spv_lerp:
return selectFmix(ResVReg, ResType, I);
+ case Intrinsic::spv_rsqrt:
+ return selectRsqrt(ResVReg, ResType, I);
case Intrinsic::spv_lifetime_start:
case Intrinsic::spv_lifetime_end: {
unsigned Op = IID == Intrinsic::spv_lifetime_start ? SPIRV::OpLifetimeStart
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index 57fbf3b3f8f1..6c7c3af19965 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -278,6 +278,12 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
G_FCOS,
G_FSIN,
G_FTAN,
+ G_FACOS,
+ G_FASIN,
+ G_FATAN,
+ G_FCOSH,
+ G_FSINH,
+ G_FTANH,
G_FSQRT,
G_FFLOOR,
G_FRINT,
diff --git a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
index 2744c25d1bc7..0747dd1bbaf4 100644
--- a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
@@ -17,6 +17,8 @@
#include "SPIRVSubtarget.h"
#include "SPIRVTargetMachine.h"
#include "SPIRVUtils.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/IR/CFG.h"
@@ -71,7 +73,7 @@ public:
/// terminator will take.
llvm::Value *createExitVariable(
BasicBlock *BB,
- const std::unordered_map<BasicBlock *, ConstantInt *> &TargetToValue) {
+ const DenseMap<BasicBlock *, ConstantInt *> &TargetToValue) {
auto *T = BB->getTerminator();
if (isa<ReturnInst>(T))
return nullptr;
@@ -98,12 +100,12 @@ public:
}
// TODO: add support for switch cases.
- assert(false && "Unhandled terminator type.");
+ llvm_unreachable("Unhandled terminator type.");
}
/// Replaces |BB|'s branch targets present in |ToReplace| with |NewTarget|.
void replaceBranchTargets(BasicBlock *BB,
- const std::unordered_set<BasicBlock *> ToReplace,
+ const SmallPtrSet<BasicBlock *, 4> &ToReplace,
BasicBlock *NewTarget) {
auto *T = BB->getTerminator();
if (isa<ReturnInst>(T))
@@ -133,7 +135,7 @@ public:
bool runOnConvergenceRegionNoRecurse(LoopInfo &LI,
const SPIRV::ConvergenceRegion *CR) {
// Gather all the exit targets for this region.
- std::unordered_set<BasicBlock *> ExitTargets;
+ SmallPtrSet<BasicBlock *, 4> ExitTargets;
for (BasicBlock *Exit : CR->Exits) {
for (BasicBlock *Target : gatherSuccessors(Exit)) {
if (CR->Blocks.count(Target) == 0)
@@ -164,9 +166,10 @@ public:
// Creating one constant per distinct exit target. This will be route to the
// correct target.
- std::unordered_map<BasicBlock *, ConstantInt *> TargetToValue;
+ DenseMap<BasicBlock *, ConstantInt *> TargetToValue;
for (BasicBlock *Target : SortedExitTargets)
- TargetToValue.emplace(Target, Builder.getInt32(TargetToValue.size()));
+ TargetToValue.insert(
+ std::make_pair(Target, Builder.getInt32(TargetToValue.size())));
// Creating one variable per exit node, set to the constant matching the
// targeted external block.
@@ -184,12 +187,12 @@ public:
}
// Creating the switch to jump to the correct exit target.
- std::vector<std::pair<BasicBlock *, ConstantInt *>> CasesList(
- TargetToValue.begin(), TargetToValue.end());
- llvm::SwitchInst *Sw =
- Builder.CreateSwitch(node, CasesList[0].first, CasesList.size() - 1);
- for (size_t i = 1; i < CasesList.size(); i++)
- Sw->addCase(CasesList[i].second, CasesList[i].first);
+ llvm::SwitchInst *Sw = Builder.CreateSwitch(node, SortedExitTargets[0],
+ SortedExitTargets.size() - 1);
+ for (size_t i = 1; i < SortedExitTargets.size(); i++) {
+ BasicBlock *BB = SortedExitTargets[i];
+ Sw->addCase(TargetToValue[BB], BB);
+ }
// Fix exit branches to redirect to the new exit.
for (auto Exit : CR->Exits)
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index 30a6c474f467..ac0aa682ea4b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -1168,6 +1168,15 @@ void addInstrRequirements(const MachineInstr &MI,
Reqs.addCapability(SPIRV::Capability::AsmINTEL);
}
break;
+ case SPIRV::OpTypeCooperativeMatrixKHR:
+ if (!ST.canUseExtension(SPIRV::Extension::SPV_KHR_cooperative_matrix))
+ report_fatal_error(
+ "OpTypeCooperativeMatrixKHR type requires the "
+ "following SPIR-V extension: SPV_KHR_cooperative_matrix",
+ false);
+ Reqs.addExtension(SPIRV::Extension::SPV_KHR_cooperative_matrix);
+ Reqs.addCapability(SPIRV::Capability::CooperativeMatrixKHR);
+ break;
default:
break;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index adc5b36af6f1..0ea2f176565e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -41,7 +41,8 @@ public:
static void
addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR,
const SPIRVSubtarget &STI,
- DenseMap<MachineInstr *, Type *> &TargetExtConstTypes) {
+ DenseMap<MachineInstr *, Type *> &TargetExtConstTypes,
+ SmallSet<Register, 4> &TrackedConstRegs) {
MachineRegisterInfo &MRI = MF.getRegInfo();
DenseMap<MachineInstr *, Register> RegsAlreadyAddedToDT;
SmallVector<MachineInstr *, 10> ToErase, ToEraseComposites;
@@ -80,6 +81,7 @@ addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR,
}
}
GR->add(Const, &MF, SrcReg);
+ TrackedConstRegs.insert(SrcReg);
if (Const->getType()->isTargetExtTy()) {
// remember association so that we can restore it when assign types
MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
@@ -121,7 +123,9 @@ addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR,
MI->eraseFromParent();
}
-static void foldConstantsIntoIntrinsics(MachineFunction &MF) {
+static void
+foldConstantsIntoIntrinsics(MachineFunction &MF,
+ const SmallSet<Register, 4> &TrackedConstRegs) {
SmallVector<MachineInstr *, 10> ToErase;
MachineRegisterInfo &MRI = MF.getRegInfo();
const unsigned AssignNameOperandShift = 2;
@@ -137,7 +141,8 @@ static void foldConstantsIntoIntrinsics(MachineFunction &MF) {
MI.removeOperand(NumOp);
MI.addOperand(MachineOperand::CreateImm(
ConstMI->getOperand(1).getCImm()->getZExtValue()));
- if (MRI.use_empty(ConstMI->getOperand(0).getReg()))
+ Register DefReg = ConstMI->getOperand(0).getReg();
+ if (MRI.use_empty(DefReg) && !TrackedConstRegs.contains(DefReg))
ToErase.push_back(ConstMI);
}
}
@@ -271,6 +276,21 @@ static SPIRVType *propagateSPIRVType(MachineInstr *MI, SPIRVGlobalRegistry *GR,
return SpirvTy;
}
+// To support current approach and limitations wrt. bit width here we widen a
+// scalar register with a bit width greater than 1 to valid sizes and cap it to
+// 64 width.
+static void widenScalarLLTNextPow2(Register Reg, MachineRegisterInfo &MRI) {
+ LLT RegType = MRI.getType(Reg);
+ if (!RegType.isScalar())
+ return;
+ unsigned Sz = RegType.getScalarSizeInBits();
+ if (Sz == 1)
+ return;
+ unsigned NewSz = std::min(std::max(1u << Log2_32_Ceil(Sz), 8u), 64u);
+ if (NewSz != Sz)
+ MRI.setType(Reg, LLT::scalar(NewSz));
+}
+
static std::pair<Register, unsigned>
createNewIdReg(SPIRVType *SpvType, Register SrcReg, MachineRegisterInfo &MRI,
const SPIRVGlobalRegistry &GR) {
@@ -395,6 +415,7 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<MachineInstr *, 10> ToErase;
+ DenseMap<MachineInstr *, Register> RegsAlreadyAddedToDT;
for (MachineBasicBlock *MBB : post_order(&MF)) {
if (MBB->empty())
@@ -406,6 +427,11 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
MachineInstr &MI = *MII;
unsigned MIOp = MI.getOpcode();
+ // validate bit width of scalar registers
+ for (const auto &MOP : MI.operands())
+ if (MOP.isReg())
+ widenScalarLLTNextPow2(MOP.getReg(), MRI);
+
if (isSpvIntrinsic(MI, Intrinsic::spv_assign_ptr_type)) {
Register Reg = MI.getOperand(1).getReg();
MIB.setInsertPt(*MI.getParent(), MI.getIterator());
@@ -441,6 +467,7 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
// %rctmp = G_CONSTANT ty Val
// %rc = ASSIGN_TYPE %rctmp, %cty
Register Reg = MI.getOperand(0).getReg();
+ bool NeedAssignType = true;
if (MRI.hasOneUse(Reg)) {
MachineInstr &UseMI = *MRI.use_instr_begin(Reg);
if (isSpvIntrinsic(UseMI, Intrinsic::spv_assign_type) ||
@@ -453,7 +480,20 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
Ty = TargetExtIt == TargetExtConstTypes.end()
? MI.getOperand(1).getCImm()->getType()
: TargetExtIt->second;
- GR->add(MI.getOperand(1).getCImm(), &MF, Reg);
+ const ConstantInt *OpCI = MI.getOperand(1).getCImm();
+ Register PrimaryReg = GR->find(OpCI, &MF);
+ if (!PrimaryReg.isValid()) {
+ GR->add(OpCI, &MF, Reg);
+ } else if (PrimaryReg != Reg &&
+ MRI.getType(Reg) == MRI.getType(PrimaryReg)) {
+ auto *RCReg = MRI.getRegClassOrNull(Reg);
+ auto *RCPrimary = MRI.getRegClassOrNull(PrimaryReg);
+ if (!RCReg || RCPrimary == RCReg) {
+ RegsAlreadyAddedToDT[&MI] = PrimaryReg;
+ ToErase.push_back(&MI);
+ NeedAssignType = false;
+ }
+ }
} else if (MIOp == TargetOpcode::G_FCONSTANT) {
Ty = MI.getOperand(1).getFPImm()->getType();
} else {
@@ -472,14 +512,10 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
MI.getNumExplicitOperands() - MI.getNumExplicitDefs();
Ty = VectorType::get(ElemTy, NumElts, false);
}
- insertAssignInstr(Reg, Ty, nullptr, GR, MIB, MRI);
+ if (NeedAssignType)
+ insertAssignInstr(Reg, Ty, nullptr, GR, MIB, MRI);
} else if (MIOp == TargetOpcode::G_GLOBAL_VALUE) {
propagateSPIRVType(&MI, GR, MRI, MIB);
- } else if (MIOp == TargetOpcode::G_BITREVERSE) {
- Register Reg = MI.getOperand(0).getReg();
- LLT RegType = MRI.getType(Reg);
- if (RegType.getSizeInBits() < 32)
- MRI.setType(Reg, LLT::scalar(32));
}
if (MII == Begin)
@@ -488,8 +524,12 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
--MII;
}
}
- for (MachineInstr *MI : ToErase)
+ for (MachineInstr *MI : ToErase) {
+ auto It = RegsAlreadyAddedToDT.find(MI);
+ if (RegsAlreadyAddedToDT.contains(MI))
+ MRI.replaceRegWith(MI->getOperand(0).getReg(), It->second);
MI->eraseFromParent();
+ }
// Address the case when IRTranslator introduces instructions with new
// registers without SPIRVType associated.
@@ -821,8 +861,10 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) {
MachineIRBuilder MIB(MF);
// a registry of target extension constants
DenseMap<MachineInstr *, Type *> TargetExtConstTypes;
- addConstantsToTrack(MF, GR, ST, TargetExtConstTypes);
- foldConstantsIntoIntrinsics(MF);
+ // to keep record of tracked constants
+ SmallSet<Register, 4> TrackedConstRegs;
+ addConstantsToTrack(MF, GR, ST, TargetExtConstTypes, TrackedConstRegs);
+ foldConstantsIntoIntrinsics(MF, TrackedConstRegs);
insertBitcasts(MF, GR, MIB);
generateAssignInstrs(MF, GR, MIB, TargetExtConstTypes);
processSwitches(MF, GR, MIB);
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index 318c5cebb7a4..96601dd8796c 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -302,6 +302,7 @@ defm SPV_INTEL_inline_assembly : ExtensionOperand<107>;
defm SPV_INTEL_cache_controls : ExtensionOperand<108>;
defm SPV_INTEL_global_variable_host_access : ExtensionOperand<109>;
defm SPV_INTEL_global_variable_fpga_decorations : ExtensionOperand<110>;
+defm SPV_KHR_cooperative_matrix : ExtensionOperand<111>;
//===----------------------------------------------------------------------===//
// Multiclass used to define Capabilities enum values and at the same time
@@ -478,6 +479,7 @@ defm GlobalVariableHostAccessINTEL : CapabilityOperand<6187, 0, 0, [SPV_INTEL_gl
defm HostAccessINTEL : CapabilityOperand<6188, 0, 0, [SPV_INTEL_global_variable_host_access], []>;
defm GlobalVariableFPGADecorationsINTEL : CapabilityOperand<6189, 0, 0, [SPV_INTEL_global_variable_fpga_decorations], []>;
defm CacheControlsINTEL : CapabilityOperand<6441, 0, 0, [SPV_INTEL_cache_controls], []>;
+defm CooperativeMatrixKHR : CapabilityOperand<6022, 0, 0, [SPV_KHR_cooperative_matrix], []>;
//===----------------------------------------------------------------------===//
// Multiclass used to define SourceLanguage enum values and at the same time
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
index c1b90b0e9d88..927683ad7e32 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
@@ -253,7 +253,11 @@ SPIRV::MemorySemantics::MemorySemantics getMemSemantics(AtomicOrdering Ord) {
MachineInstr *getDefInstrMaybeConstant(Register &ConstReg,
const MachineRegisterInfo *MRI) {
- MachineInstr *ConstInstr = MRI->getVRegDef(ConstReg);
+ MachineInstr *MI = MRI->getVRegDef(ConstReg);
+ MachineInstr *ConstInstr =
+ MI->getOpcode() == SPIRV::G_TRUNC || MI->getOpcode() == SPIRV::G_ZEXT
+ ? MRI->getVRegDef(MI->getOperand(1).getReg())
+ : MI;
if (auto *GI = dyn_cast<GIntrinsic>(ConstInstr)) {
if (GI->is(Intrinsic::spv_track_constant)) {
ConstReg = ConstInstr->getOperand(2).getReg();
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
index c131eecb1c13..12725d6bac14 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.h
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -160,5 +160,29 @@ inline Type *toTypedPointer(Type *Ty) {
: Ty;
}
+inline Type *toTypedFunPointer(FunctionType *FTy) {
+ Type *OrigRetTy = FTy->getReturnType();
+ Type *RetTy = toTypedPointer(OrigRetTy);
+ bool IsUntypedPtr = false;
+ for (Type *PTy : FTy->params()) {
+ if (isUntypedPointerTy(PTy)) {
+ IsUntypedPtr = true;
+ break;
+ }
+ }
+ if (!IsUntypedPtr && RetTy == OrigRetTy)
+ return FTy;
+ SmallVector<Type *> ParamTys;
+ for (Type *PTy : FTy->params())
+ ParamTys.push_back(toTypedPointer(PTy));
+ return FunctionType::get(RetTy, ParamTys, FTy->isVarArg());
+}
+
+inline const Type *unifyPtrType(const Type *Ty) {
+ if (auto FTy = dyn_cast<FunctionType>(Ty))
+ return toTypedFunPointer(const_cast<FunctionType *>(FTy));
+ return toTypedPointer(const_cast<Type *>(Ty));
+}
+
} // namespace llvm
#endif // LLVM_LIB_TARGET_SPIRV_SPIRVUTILS_H
diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 3d8637bb8c35..af634a7da71c 100644
--- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -107,10 +107,15 @@ class SparcAsmParser : public MCTargetAsmParser {
ParseStatus parseBranchModifiers(OperandVector &Operands);
+ ParseStatus parseExpression(int64_t &Val);
+
// Helper function for dealing with %lo / %hi in PIC mode.
const SparcMCExpr *adjustPICRelocation(SparcMCExpr::VariantKind VK,
const MCExpr *subExpr);
+ // Helper function to see if current token can start an expression.
+ bool isPossibleExpression(const AsmToken &Token);
+
// returns true if Tok is matched to a register and returns register in RegNo.
MCRegister matchRegisterName(const AsmToken &Tok, unsigned &RegKind);
@@ -1085,38 +1090,35 @@ ParseStatus SparcAsmParser::parseASITag(OperandVector &Operands) {
SMLoc E = Parser.getTok().getEndLoc();
int64_t ASIVal = 0;
- switch (getLexer().getKind()) {
- case AsmToken::LParen:
- case AsmToken::Integer:
- case AsmToken::Identifier:
- case AsmToken::Plus:
- case AsmToken::Minus:
- case AsmToken::Tilde:
- if (getParser().parseAbsoluteExpression(ASIVal) || !isUInt<8>(ASIVal))
- return Error(S, "invalid ASI number, must be between 0 and 255");
- break;
- case AsmToken::Hash: {
- // For now we only support named tags for 64-bit/V9 systems.
- // TODO: add support for 32-bit/V8 systems.
- SMLoc TagStart = getLexer().peekTok(false).getLoc();
- Parser.Lex(); // Eat the '#'.
- const StringRef ASIName = Parser.getTok().getString();
- const SparcASITag::ASITag *ASITag =
- SparcASITag::lookupASITagByName(ASIName);
- if (!ASITag)
- ASITag = SparcASITag::lookupASITagByAltName(ASIName);
- Parser.Lex(); // Eat the identifier token.
+ if (getLexer().getKind() != AsmToken::Hash) {
+ // If the ASI tag provided is not a named tag, then it
+ // must be a constant expression.
+ ParseStatus ParseExprStatus = parseExpression(ASIVal);
+ if (!ParseExprStatus.isSuccess())
+ return ParseExprStatus;
- if (!ASITag)
- return Error(TagStart, "unknown ASI tag");
+ if (!isUInt<8>(ASIVal))
+ return Error(S, "invalid ASI number, must be between 0 and 255");
- ASIVal = ASITag->Encoding;
- break;
- }
- default:
- return ParseStatus::NoMatch;
+ Operands.push_back(SparcOperand::CreateASITag(ASIVal, S, E));
+ return ParseStatus::Success;
}
+ // For now we only support named tags for 64-bit/V9 systems.
+ // TODO: add support for 32-bit/V8 systems.
+ SMLoc TagStart = getLexer().peekTok(false).getLoc();
+ Parser.Lex(); // Eat the '#'.
+ const StringRef ASIName = Parser.getTok().getString();
+ const SparcASITag::ASITag *ASITag = SparcASITag::lookupASITagByName(ASIName);
+ if (!ASITag)
+ ASITag = SparcASITag::lookupASITagByAltName(ASIName);
+ Parser.Lex(); // Eat the identifier token.
+
+ if (!ASITag)
+ return Error(TagStart, "unknown ASI tag");
+
+ ASIVal = ASITag->Encoding;
+
Operands.push_back(SparcOperand::CreateASITag(ASIVal, S, E));
return ParseStatus::Success;
}
@@ -1126,35 +1128,32 @@ ParseStatus SparcAsmParser::parsePrefetchTag(OperandVector &Operands) {
SMLoc E = Parser.getTok().getEndLoc();
int64_t PrefetchVal = 0;
- switch (getLexer().getKind()) {
- case AsmToken::LParen:
- case AsmToken::Integer:
- case AsmToken::Identifier:
- case AsmToken::Plus:
- case AsmToken::Minus:
- case AsmToken::Tilde:
- if (getParser().parseAbsoluteExpression(PrefetchVal) ||
- !isUInt<5>(PrefetchVal))
- return Error(S, "invalid prefetch number, must be between 0 and 31");
- break;
- case AsmToken::Hash: {
- SMLoc TagStart = getLexer().peekTok(false).getLoc();
- Parser.Lex(); // Eat the '#'.
- const StringRef PrefetchName = Parser.getTok().getString();
- const SparcPrefetchTag::PrefetchTag *PrefetchTag =
- SparcPrefetchTag::lookupPrefetchTagByName(PrefetchName);
- Parser.Lex(); // Eat the identifier token.
+ if (getLexer().getKind() != AsmToken::Hash) {
+ // If the prefetch tag provided is not a named tag, then it
+ // must be a constant expression.
+ ParseStatus ParseExprStatus = parseExpression(PrefetchVal);
+ if (!ParseExprStatus.isSuccess())
+ return ParseExprStatus;
- if (!PrefetchTag)
- return Error(TagStart, "unknown prefetch tag");
+ if (!isUInt<8>(PrefetchVal))
+ return Error(S, "invalid prefetch number, must be between 0 and 31");
- PrefetchVal = PrefetchTag->Encoding;
- break;
- }
- default:
- return ParseStatus::NoMatch;
+ Operands.push_back(SparcOperand::CreatePrefetchTag(PrefetchVal, S, E));
+ return ParseStatus::Success;
}
+ SMLoc TagStart = getLexer().peekTok(false).getLoc();
+ Parser.Lex(); // Eat the '#'.
+ const StringRef PrefetchName = Parser.getTok().getString();
+ const SparcPrefetchTag::PrefetchTag *PrefetchTag =
+ SparcPrefetchTag::lookupPrefetchTagByName(PrefetchName);
+ Parser.Lex(); // Eat the identifier token.
+
+ if (!PrefetchTag)
+ return Error(TagStart, "unknown prefetch tag");
+
+ PrefetchVal = PrefetchTag->Encoding;
+
Operands.push_back(SparcOperand::CreatePrefetchTag(PrefetchVal, S, E));
return ParseStatus::Success;
}
@@ -1370,6 +1369,15 @@ ParseStatus SparcAsmParser::parseBranchModifiers(OperandVector &Operands) {
return ParseStatus::Success;
}
+ParseStatus SparcAsmParser::parseExpression(int64_t &Val) {
+ AsmToken Tok = getLexer().getTok();
+
+ if (!isPossibleExpression(Tok))
+ return ParseStatus::NoMatch;
+
+ return getParser().parseAbsoluteExpression(Val);
+}
+
#define GET_REGISTER_MATCHER
#include "SparcGenAsmMatcher.inc"
@@ -1600,6 +1608,20 @@ bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal,
return true;
}
+bool SparcAsmParser::isPossibleExpression(const AsmToken &Token) {
+ switch (Token.getKind()) {
+ case AsmToken::LParen:
+ case AsmToken::Integer:
+ case AsmToken::Identifier:
+ case AsmToken::Plus:
+ case AsmToken::Minus:
+ case AsmToken::Tilde:
+ return true;
+ default:
+ return false;
+ }
+}
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcAsmParser() {
RegisterMCAsmParser<SparcAsmParser> A(getTheSparcTarget());
RegisterMCAsmParser<SparcAsmParser> B(getTheSparcV9Target());
diff --git a/llvm/lib/Target/Sparc/SparcInstr64Bit.td b/llvm/lib/Target/Sparc/SparcInstr64Bit.td
index 93862414fb35..6b7813745165 100644
--- a/llvm/lib/Target/Sparc/SparcInstr64Bit.td
+++ b/llvm/lib/Target/Sparc/SparcInstr64Bit.td
@@ -478,7 +478,7 @@ def : Pat<(i64 (atomic_load_64 ADDRri:$src)), (LDXri ADDRri:$src)>;
def : Pat<(atomic_store_64 i64:$val, ADDRrr:$dst), (STXrr ADDRrr:$dst, $val)>;
def : Pat<(atomic_store_64 i64:$val, ADDRri:$dst), (STXri ADDRri:$dst, $val)>;
-def : Pat<(atomic_cmp_swap_64 i64:$rs1, i64:$rs2, i64:$swap),
+def : Pat<(atomic_cmp_swap_i64 i64:$rs1, i64:$rs2, i64:$swap),
(CASXArr $rs1, $rs2, $swap, 0x80)>;
} // Predicates = [Is64Bit]
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.td b/llvm/lib/Target/Sparc/SparcInstrInfo.td
index cac96a139872..7b074231ec62 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.td
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.td
@@ -744,11 +744,11 @@ let Constraints = "$val = $rd" in {
def SWAPrr : F3_1<3, 0b001111,
(outs IntRegs:$rd), (ins (MEMrr $rs1, $rs2):$addr, IntRegs:$val),
"swap [$addr], $rd",
- [(set i32:$rd, (atomic_swap_32 ADDRrr:$addr, i32:$val))]>;
+ [(set i32:$rd, (atomic_swap_i32 ADDRrr:$addr, i32:$val))]>;
def SWAPri : F3_2<3, 0b001111,
(outs IntRegs:$rd), (ins (MEMri $rs1, $simm13):$addr, IntRegs:$val),
"swap [$addr], $rd",
- [(set i32:$rd, (atomic_swap_32 ADDRri:$addr, i32:$val))]>;
+ [(set i32:$rd, (atomic_swap_i32 ADDRri:$addr, i32:$val))]>;
def SWAPArr : F3_1_asi<3, 0b011111,
(outs IntRegs:$rd), (ins (MEMrr $rs1, $rs2):$addr, ASITag:$asi, IntRegs:$val),
"swapa [$addr] $asi, $rd",
@@ -1913,12 +1913,12 @@ def : Pat<(atomic_store_32 i32:$val, ADDRrr:$dst), (STrr ADDRrr:$dst, $val)>;
def : Pat<(atomic_store_32 i32:$val, ADDRri:$dst), (STri ADDRri:$dst, $val)>;
let Predicates = [HasV9] in
-def : Pat<(atomic_cmp_swap_32 iPTR:$rs1, i32:$rs2, i32:$swap),
+def : Pat<(atomic_cmp_swap_i32 iPTR:$rs1, i32:$rs2, i32:$swap),
(CASArr $rs1, $rs2, $swap, 0x80)>;
// Same pattern as CASArr above, but with a different ASI.
let Predicates = [HasLeonCASA] in
-def : Pat<(atomic_cmp_swap_32 iPTR:$rs1, i32:$rs2, i32:$swap),
+def : Pat<(atomic_cmp_swap_i32 iPTR:$rs1, i32:$rs2, i32:$swap),
(CASArr $rs1, $rs2, $swap, 0x0A)>;
// A register pair with zero upper half.
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
index 7f3a143aad97..7c6ab3f9b1ab 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -1733,16 +1733,16 @@ let hasSideEffects = 1 in
def Serialize : Alias<2, (outs), (ins), []>;
let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in {
- def LAA : LoadAndOpRSY<"laa", 0xEBF8, atomic_load_add_32, GR32>;
- def LAAG : LoadAndOpRSY<"laag", 0xEBE8, atomic_load_add_64, GR64>;
+ def LAA : LoadAndOpRSY<"laa", 0xEBF8, atomic_load_add_i32, GR32>;
+ def LAAG : LoadAndOpRSY<"laag", 0xEBE8, atomic_load_add_i64, GR64>;
def LAAL : LoadAndOpRSY<"laal", 0xEBFA, null_frag, GR32>;
def LAALG : LoadAndOpRSY<"laalg", 0xEBEA, null_frag, GR64>;
- def LAN : LoadAndOpRSY<"lan", 0xEBF4, atomic_load_and_32, GR32>;
- def LANG : LoadAndOpRSY<"lang", 0xEBE4, atomic_load_and_64, GR64>;
- def LAO : LoadAndOpRSY<"lao", 0xEBF6, atomic_load_or_32, GR32>;
- def LAOG : LoadAndOpRSY<"laog", 0xEBE6, atomic_load_or_64, GR64>;
- def LAX : LoadAndOpRSY<"lax", 0xEBF7, atomic_load_xor_32, GR32>;
- def LAXG : LoadAndOpRSY<"laxg", 0xEBE7, atomic_load_xor_64, GR64>;
+ def LAN : LoadAndOpRSY<"lan", 0xEBF4, atomic_load_and_i32, GR32>;
+ def LANG : LoadAndOpRSY<"lang", 0xEBE4, atomic_load_and_i64, GR64>;
+ def LAO : LoadAndOpRSY<"lao", 0xEBF6, atomic_load_or_i32, GR32>;
+ def LAOG : LoadAndOpRSY<"laog", 0xEBE6, atomic_load_or_i64, GR64>;
+ def LAX : LoadAndOpRSY<"lax", 0xEBF7, atomic_load_xor_i32, GR32>;
+ def LAXG : LoadAndOpRSY<"laxg", 0xEBE7, atomic_load_xor_i64, GR64>;
}
def ATOMIC_SWAPW : AtomicLoadWBinaryReg<z_atomic_swapw>;
diff --git a/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp b/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp
index 8073ed0e2a3c..bf8d109ff71f 100644
--- a/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp
@@ -58,7 +58,7 @@ FunctionPass *llvm::createSystemZLDCleanupPass(SystemZTargetMachine &TM) {
void SystemZLDCleanup::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -75,7 +75,8 @@ bool SystemZLDCleanup::runOnMachineFunction(MachineFunction &F) {
return false;
}
- MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+ MachineDominatorTree *DT =
+ &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
return VisitNode(DT->getRootNode(), 0);
}
diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td
index cbad5a0eafb2..75ef3b7336db 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.td
+++ b/llvm/lib/Target/VE/VEInstrInfo.td
@@ -1158,9 +1158,9 @@ defm ATMAM : RRCASm<"atmam", 0x53, I64, i64, uimm0to2>;
// Section 8.2.20 - CAS (Compare and Swap)
let DecoderMethod = "DecodeCASI64" in
-defm CASL : RRCASm<"cas.l", 0x62, I64, i64, simm7, atomic_cmp_swap_64>;
+defm CASL : RRCASm<"cas.l", 0x62, I64, i64, simm7, atomic_cmp_swap_i64>;
let DecoderMethod = "DecodeCASI32", cx = 1 in
-defm CASW : RRCASm<"cas.w", 0x62, I32, i32, simm7, atomic_cmp_swap_32>;
+defm CASW : RRCASm<"cas.w", 0x62, I32, i32, simm7, atomic_cmp_swap_i32>;
//-----------------------------------------------------------------------------
// Section 8.3 - Transfer Control Instructions
@@ -1896,9 +1896,9 @@ defm : TRATMSTm<atomic_store_32, STLrri, STLrii, STLzri, STLzii>;
// Atomic swaps
def : Pat<(i32 (ts1am i64:$src, i32:$flag, i32:$new)),
(TS1AMWrir $src, 0, $flag, $new)>;
-def : Pat<(i32 (atomic_swap_32 ADDRri:$src, i32:$new)),
+def : Pat<(i32 (atomic_swap_i32 ADDRri:$src, i32:$new)),
(TS1AMWrii MEMriRRM:$src, 15, $new)>;
-def : Pat<(i64 (atomic_swap_64 ADDRri:$src, i64:$new)),
+def : Pat<(i64 (atomic_swap_i64 ADDRri:$src, i64:$new)),
(TS1AMLrir MEMriRRM:$src, (LEAzii 0, 0, 255), i64:$new)>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
index f5bc584ac4e1..3cc2cc0e830f 100644
--- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
+++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -757,7 +757,7 @@ public:
bool CheckDataSection() {
if (CurrentState != DataSection) {
auto WS = cast<MCSectionWasm>(getStreamer().getCurrentSection().first);
- if (WS && WS->getKind().isText())
+ if (WS && WS->isText())
return error("data directive must occur in a data segment: ",
Lexer.getTok());
}
@@ -1074,7 +1074,7 @@ public:
void doBeforeLabelEmit(MCSymbol *Symbol, SMLoc IDLoc) override {
// Code below only applies to labels in text sections.
auto CWS = cast<MCSectionWasm>(getStreamer().getCurrentSection().first);
- if (!CWS || !CWS->getKind().isText())
+ if (!CWS || !CWS->isText())
return;
auto WasmSym = cast<MCSymbolWasm>(Symbol);
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
index 43c67b4b4749..b76179b1cf6e 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
@@ -122,7 +122,7 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType(
return wasm::R_WASM_MEMORY_ADDR_LEB64;
case FK_Data_4:
if (SymA.isFunction()) {
- if (FixupSection.getKind().isMetadata())
+ if (FixupSection.isMetadata())
return wasm::R_WASM_FUNCTION_OFFSET_I32;
assert(FixupSection.isWasmData());
return wasm::R_WASM_TABLE_INDEX_I32;
@@ -131,7 +131,7 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType(
return wasm::R_WASM_GLOBAL_INDEX_I32;
if (auto Section = static_cast<const MCSectionWasm *>(
getTargetSection(Fixup.getValue()))) {
- if (Section->getKind().isText())
+ if (Section->isText())
return wasm::R_WASM_FUNCTION_OFFSET_I32;
else if (!Section->isWasmData())
return wasm::R_WASM_SECTION_OFFSET_I32;
@@ -140,7 +140,7 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType(
: wasm::R_WASM_MEMORY_ADDR_I32;
case FK_Data_8:
if (SymA.isFunction()) {
- if (FixupSection.getKind().isMetadata())
+ if (FixupSection.isMetadata())
return wasm::R_WASM_FUNCTION_OFFSET_I64;
return wasm::R_WASM_TABLE_INDEX_I64;
}
@@ -148,7 +148,7 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType(
llvm_unreachable("unimplemented R_WASM_GLOBAL_INDEX_I64");
if (auto Section = static_cast<const MCSectionWasm *>(
getTargetSection(Fixup.getValue()))) {
- if (Section->getKind().isText())
+ if (Section->isText())
return wasm::R_WASM_FUNCTION_OFFSET_I64;
else if (!Section->isWasmData())
llvm_unreachable("unimplemented R_WASM_SECTION_OFFSET_I64");
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 0b7ec6e74cab..b0a97c725c87 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -319,8 +319,8 @@ void WebAssemblyAsmPrinter::emitDecls(const Module &M) {
// Emit .globaltype, .tagtype, or .tabletype declarations for extern
// declarations, i.e. those that have only been declared (but not defined)
// in the current module
- auto Sym = cast<MCSymbolWasm>(It.getValue());
- if (!Sym->isDefined())
+ auto Sym = cast_or_null<MCSymbolWasm>(It.getValue().Symbol);
+ if (Sym && !Sym->isDefined())
emitSymbolType(Sym);
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
index 06758e465197..f746bf4307a0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
@@ -53,8 +53,8 @@ class WebAssemblyCFGSort final : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.addRequired<MachineLoopInfo>();
AU.addPreserved<MachineLoopInfo>();
AU.addRequired<WebAssemblyExceptionInfo>();
@@ -387,7 +387,7 @@ bool WebAssemblyCFGSort::runOnMachineFunction(MachineFunction &MF) {
const auto &MLI = getAnalysis<MachineLoopInfo>();
const auto &WEI = getAnalysis<WebAssemblyExceptionInfo>();
- auto &MDT = getAnalysis<MachineDominatorTree>();
+ auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
// Liveness is not tracked for VALUE_STACK physreg.
MF.getRegInfo().invalidateLiveness();
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index d8cbddf74545..77e82a32545f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -48,7 +48,7 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass {
StringRef getPassName() const override { return "WebAssembly CFG Stackify"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
AU.addRequired<MachineLoopInfo>();
AU.addRequired<WebAssemblyExceptionInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
@@ -252,7 +252,7 @@ void WebAssemblyCFGStackify::unregisterScope(MachineInstr *Begin) {
void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
assert(!MBB.isEHPad());
MachineFunction &MF = *MBB.getParent();
- auto &MDT = getAnalysis<MachineDominatorTree>();
+ auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
const auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
@@ -465,7 +465,7 @@ void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) {
void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) {
assert(MBB.isEHPad());
MachineFunction &MF = *MBB.getParent();
- auto &MDT = getAnalysis<MachineDominatorTree>();
+ auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
const auto &MLI = getAnalysis<MachineLoopInfo>();
const auto &WEI = getAnalysis<WebAssemblyExceptionInfo>();
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp
index 8deac76b2bc3..b312ca7f5346 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp
@@ -31,7 +31,7 @@ char WebAssemblyExceptionInfo::ID = 0;
INITIALIZE_PASS_BEGIN(WebAssemblyExceptionInfo, DEBUG_TYPE,
"WebAssembly Exception Information", true, true)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
INITIALIZE_PASS_END(WebAssemblyExceptionInfo, DEBUG_TYPE,
"WebAssembly Exception Information", true, true)
@@ -45,7 +45,7 @@ bool WebAssemblyExceptionInfo::runOnMachineFunction(MachineFunction &MF) {
ExceptionHandling::Wasm ||
!MF.getFunction().hasPersonalityFn())
return false;
- auto &MDT = getAnalysis<MachineDominatorTree>();
+ auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
auto &MDF = getAnalysis<MachineDominanceFrontier>();
recalculate(MF, MDT, MDF);
LLVM_DEBUG(dump());
@@ -207,12 +207,12 @@ void WebAssemblyExceptionInfo::recalculate(
auto *SrcWE = P.first;
auto *DstWE = P.second;
- for (auto *MBB : SrcWE->getBlocksSet()) {
+ SrcWE->getBlocksSet().remove_if([&](MachineBasicBlock *MBB){
if (MBB->isEHPad()) {
assert(!isReachableAmongDominated(DstWE->getEHPad(), MBB,
SrcWE->getEHPad(), MDT) &&
"We already handled EH pads above");
- continue;
+ return false;
}
if (isReachableAmongDominated(DstWE->getEHPad(), MBB, SrcWE->getEHPad(),
MDT)) {
@@ -227,15 +227,16 @@ void WebAssemblyExceptionInfo::recalculate(
InnerWE->removeFromBlocksSet(MBB);
InnerWE = InnerWE->getParentException();
}
- SrcWE->removeFromBlocksSet(MBB);
LLVM_DEBUG(dbgs() << " removed from " << SrcWE->getEHPad()->getNumber()
<< "." << SrcWE->getEHPad()->getName()
<< "'s exception\n");
changeExceptionFor(MBB, SrcWE->getParentException());
if (SrcWE->getParentException())
SrcWE->getParentException()->addToBlocksSet(MBB);
+ return true;
}
- }
+ return false;
+ });
}
// Add BBs to exceptions' block vector
@@ -273,7 +274,7 @@ void WebAssemblyExceptionInfo::releaseMemory() {
void WebAssemblyExceptionInfo::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
AU.addRequired<MachineDominanceFrontier>();
MachineFunctionPass::getAnalysisUsage(AU);
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
index 4623ce9b5c38..46bd5e42a9d5 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
@@ -351,17 +351,17 @@ multiclass BinRMWPattern<PatFrag rmw_32, PatFrag rmw_64, string inst_32,
defm : BinRMWPat<i64, rmw_64, inst_64>;
}
-defm : BinRMWPattern<atomic_load_add_32, atomic_load_add_64,
+defm : BinRMWPattern<atomic_load_add_i32, atomic_load_add_i64,
"ATOMIC_RMW_ADD_I32", "ATOMIC_RMW_ADD_I64">;
-defm : BinRMWPattern<atomic_load_sub_32, atomic_load_sub_64,
+defm : BinRMWPattern<atomic_load_sub_i32, atomic_load_sub_i64,
"ATOMIC_RMW_SUB_I32", "ATOMIC_RMW_SUB_I64">;
-defm : BinRMWPattern<atomic_load_and_32, atomic_load_and_64,
+defm : BinRMWPattern<atomic_load_and_i32, atomic_load_and_i64,
"ATOMIC_RMW_AND_I32", "ATOMIC_RMW_AND_I64">;
-defm : BinRMWPattern<atomic_load_or_32, atomic_load_or_64,
+defm : BinRMWPattern<atomic_load_or_i32, atomic_load_or_i64,
"ATOMIC_RMW_OR_I32", "ATOMIC_RMW_OR_I64">;
-defm : BinRMWPattern<atomic_load_xor_32, atomic_load_xor_64,
+defm : BinRMWPattern<atomic_load_xor_i32, atomic_load_xor_i64,
"ATOMIC_RMW_XOR_I32", "ATOMIC_RMW_XOR_I64">;
-defm : BinRMWPattern<atomic_swap_32, atomic_swap_64,
+defm : BinRMWPattern<atomic_swap_i32, atomic_swap_i64,
"ATOMIC_RMW_XCHG_I32", "ATOMIC_RMW_XCHG_I64">;
// Truncating & zero-extending binary RMW patterns.
@@ -408,27 +408,27 @@ multiclass BinRMWTruncExtPattern<
}
defm : BinRMWTruncExtPattern<
- atomic_load_add_8, atomic_load_add_16, atomic_load_add_32,
+ atomic_load_add_i8, atomic_load_add_i16, atomic_load_add_i32,
"ATOMIC_RMW8_U_ADD_I32", "ATOMIC_RMW16_U_ADD_I32",
"ATOMIC_RMW8_U_ADD_I64", "ATOMIC_RMW16_U_ADD_I64", "ATOMIC_RMW32_U_ADD_I64">;
defm : BinRMWTruncExtPattern<
- atomic_load_sub_8, atomic_load_sub_16, atomic_load_sub_32,
+ atomic_load_sub_i8, atomic_load_sub_i16, atomic_load_sub_i32,
"ATOMIC_RMW8_U_SUB_I32", "ATOMIC_RMW16_U_SUB_I32",
"ATOMIC_RMW8_U_SUB_I64", "ATOMIC_RMW16_U_SUB_I64", "ATOMIC_RMW32_U_SUB_I64">;
defm : BinRMWTruncExtPattern<
- atomic_load_and_8, atomic_load_and_16, atomic_load_and_32,
+ atomic_load_and_i8, atomic_load_and_i16, atomic_load_and_i32,
"ATOMIC_RMW8_U_AND_I32", "ATOMIC_RMW16_U_AND_I32",
"ATOMIC_RMW8_U_AND_I64", "ATOMIC_RMW16_U_AND_I64", "ATOMIC_RMW32_U_AND_I64">;
defm : BinRMWTruncExtPattern<
- atomic_load_or_8, atomic_load_or_16, atomic_load_or_32,
+ atomic_load_or_i8, atomic_load_or_i16, atomic_load_or_i32,
"ATOMIC_RMW8_U_OR_I32", "ATOMIC_RMW16_U_OR_I32",
"ATOMIC_RMW8_U_OR_I64", "ATOMIC_RMW16_U_OR_I64", "ATOMIC_RMW32_U_OR_I64">;
defm : BinRMWTruncExtPattern<
- atomic_load_xor_8, atomic_load_xor_16, atomic_load_xor_32,
+ atomic_load_xor_i8, atomic_load_xor_i16, atomic_load_xor_i32,
"ATOMIC_RMW8_U_XOR_I32", "ATOMIC_RMW16_U_XOR_I32",
"ATOMIC_RMW8_U_XOR_I64", "ATOMIC_RMW16_U_XOR_I64", "ATOMIC_RMW32_U_XOR_I64">;
defm : BinRMWTruncExtPattern<
- atomic_swap_8, atomic_swap_16, atomic_swap_32,
+ atomic_swap_i8, atomic_swap_i16, atomic_swap_i32,
"ATOMIC_RMW8_U_XCHG_I32", "ATOMIC_RMW16_U_XCHG_I32",
"ATOMIC_RMW8_U_XCHG_I64", "ATOMIC_RMW16_U_XCHG_I64",
"ATOMIC_RMW32_U_XCHG_I64">;
@@ -485,8 +485,8 @@ multiclass TerRMWPat<ValueType ty, PatFrag kind, string inst> {
Requires<[HasAddr64, HasAtomics]>;
}
-defm : TerRMWPat<i32, atomic_cmp_swap_32, "ATOMIC_RMW_CMPXCHG_I32">;
-defm : TerRMWPat<i64, atomic_cmp_swap_64, "ATOMIC_RMW_CMPXCHG_I64">;
+defm : TerRMWPat<i32, atomic_cmp_swap_i32, "ATOMIC_RMW_CMPXCHG_I32">;
+defm : TerRMWPat<i64, atomic_cmp_swap_i64, "ATOMIC_RMW_CMPXCHG_I64">;
// Truncating & zero-extending ternary RMW patterns.
// DAG legalization & optimization before instruction selection may introduce
@@ -524,13 +524,13 @@ class sext_ter_rmw_8_64<PatFrag kind> :
class sext_ter_rmw_16_64<PatFrag kind> : sext_ter_rmw_8_64<kind>;
// 32->64 sext RMW gets selected as i32.atomic.rmw.***, i64.extend_i32_s
-defm : TerRMWPat<i32, zext_ter_rmw_8_32<atomic_cmp_swap_8>, "ATOMIC_RMW8_U_CMPXCHG_I32">;
-defm : TerRMWPat<i32, zext_ter_rmw_16_32<atomic_cmp_swap_16>, "ATOMIC_RMW16_U_CMPXCHG_I32">;
-defm : TerRMWPat<i64, zext_ter_rmw_8_64<atomic_cmp_swap_8>, "ATOMIC_RMW8_U_CMPXCHG_I64">;
-defm : TerRMWPat<i64, zext_ter_rmw_16_64<atomic_cmp_swap_16>, "ATOMIC_RMW16_U_CMPXCHG_I64">;
-defm : TerRMWPat<i64, zext_ter_rmw_32_64<atomic_cmp_swap_32>, "ATOMIC_RMW32_U_CMPXCHG_I64">;
+defm : TerRMWPat<i32, zext_ter_rmw_8_32<atomic_cmp_swap_i8>, "ATOMIC_RMW8_U_CMPXCHG_I32">;
+defm : TerRMWPat<i32, zext_ter_rmw_16_32<atomic_cmp_swap_i16>, "ATOMIC_RMW16_U_CMPXCHG_I32">;
+defm : TerRMWPat<i64, zext_ter_rmw_8_64<atomic_cmp_swap_i8>, "ATOMIC_RMW8_U_CMPXCHG_I64">;
+defm : TerRMWPat<i64, zext_ter_rmw_16_64<atomic_cmp_swap_i16>, "ATOMIC_RMW16_U_CMPXCHG_I64">;
+defm : TerRMWPat<i64, zext_ter_rmw_32_64<atomic_cmp_swap_i32>, "ATOMIC_RMW32_U_CMPXCHG_I64">;
-defm : TerRMWPat<i32, sext_ter_rmw_8_32<atomic_cmp_swap_8>, "ATOMIC_RMW8_U_CMPXCHG_I32">;
-defm : TerRMWPat<i32, sext_ter_rmw_16_32<atomic_cmp_swap_16>, "ATOMIC_RMW16_U_CMPXCHG_I32">;
-defm : TerRMWPat<i64, sext_ter_rmw_8_64<atomic_cmp_swap_8>, "ATOMIC_RMW8_U_CMPXCHG_I64">;
-defm : TerRMWPat<i64, sext_ter_rmw_16_64<atomic_cmp_swap_16>, "ATOMIC_RMW16_U_CMPXCHG_I64">;
+defm : TerRMWPat<i32, sext_ter_rmw_8_32<atomic_cmp_swap_i8>, "ATOMIC_RMW8_U_CMPXCHG_I32">;
+defm : TerRMWPat<i32, sext_ter_rmw_16_32<atomic_cmp_swap_i16>, "ATOMIC_RMW16_U_CMPXCHG_I32">;
+defm : TerRMWPat<i64, sext_ter_rmw_8_64<atomic_cmp_swap_i8>, "ATOMIC_RMW8_U_CMPXCHG_I64">;
+defm : TerRMWPat<i64, sext_ter_rmw_16_64<atomic_cmp_swap_i16>, "ATOMIC_RMW16_U_CMPXCHG_I64">;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 3c97befcea1a..2ee430c88169 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1320,16 +1320,23 @@ def : Pat<(v8f16 (int_wasm_pmax (v8f16 V128:$lhs), (v8f16 V128:$rhs))),
//===----------------------------------------------------------------------===//
multiclass SIMDConvert<Vec vec, Vec arg, SDPatternOperator op, string name,
- bits<32> simdop> {
+ bits<32> simdop, list<Predicate> reqs = []> {
defm op#_#vec :
SIMD_I<(outs V128:$dst), (ins V128:$vec), (outs), (ins),
[(set (vec.vt V128:$dst), (vec.vt (op (arg.vt V128:$vec))))],
- vec.prefix#"."#name#"\t$dst, $vec", vec.prefix#"."#name, simdop>;
+ vec.prefix#"."#name#"\t$dst, $vec", vec.prefix#"."#name, simdop, reqs>;
+}
+
+multiclass HalfPrecisionConvert<Vec vec, Vec arg, SDPatternOperator op,
+ string name, bits<32> simdop> {
+ defm "" : SIMDConvert<vec, arg, op, name, simdop, [HasHalfPrecision]>;
}
// Floating point to integer with saturation: trunc_sat
defm "" : SIMDConvert<I32x4, F32x4, fp_to_sint, "trunc_sat_f32x4_s", 248>;
defm "" : SIMDConvert<I32x4, F32x4, fp_to_uint, "trunc_sat_f32x4_u", 249>;
+defm "" : HalfPrecisionConvert<I16x8, F16x8, fp_to_sint, "trunc_sat_f16x8_s", 0x148>;
+defm "" : HalfPrecisionConvert<I16x8, F16x8, fp_to_uint, "trunc_sat_f16x8_u", 0x149>;
// Support the saturating variety as well.
def trunc_s_sat32 : PatFrag<(ops node:$x), (fp_to_sint_sat $x, i32)>;
@@ -1355,6 +1362,8 @@ defm "" : SIMDConvert<F32x4, I32x4, sint_to_fp, "convert_i32x4_s", 250>;
defm "" : SIMDConvert<F32x4, I32x4, uint_to_fp, "convert_i32x4_u", 251>;
defm "" : SIMDConvert<F64x2, I32x4, convert_low_s, "convert_low_i32x4_s", 0xfe>;
defm "" : SIMDConvert<F64x2, I32x4, convert_low_u, "convert_low_i32x4_u", 0xff>;
+defm "" : HalfPrecisionConvert<F16x8, I16x8, sint_to_fp, "convert_i16x8_s", 0x14a>;
+defm "" : HalfPrecisionConvert<F16x8, I16x8, uint_to_fp, "convert_i16x8_u", 0x14b>;
// Extending operations
// TODO: refactor this to be uniform for i64x2 if the numbering is not changed.
@@ -1480,23 +1489,24 @@ defm "" : RelaxedConvert<I32x4, F64x2, int_wasm_relaxed_trunc_unsigned_zero,
// Relaxed (Negative) Multiply-Add (madd/nmadd)
//===----------------------------------------------------------------------===//
-multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS> {
+multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate> reqs> {
defm MADD_#vec :
- RELAXED_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins),
- [(set (vec.vt V128:$dst), (int_wasm_relaxed_madd
- (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))],
- vec.prefix#".relaxed_madd\t$dst, $a, $b, $c",
- vec.prefix#".relaxed_madd", simdopA>;
+ SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins),
+ [(set (vec.vt V128:$dst), (int_wasm_relaxed_madd
+ (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))],
+ vec.prefix#".relaxed_madd\t$dst, $a, $b, $c",
+ vec.prefix#".relaxed_madd", simdopA, reqs>;
defm NMADD_#vec :
- RELAXED_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins),
- [(set (vec.vt V128:$dst), (int_wasm_relaxed_nmadd
- (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))],
- vec.prefix#".relaxed_nmadd\t$dst, $a, $b, $c",
- vec.prefix#".relaxed_nmadd", simdopS>;
+ SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins),
+ [(set (vec.vt V128:$dst), (int_wasm_relaxed_nmadd
+ (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))],
+ vec.prefix#".relaxed_nmadd\t$dst, $a, $b, $c",
+ vec.prefix#".relaxed_nmadd", simdopS, reqs>;
}
-defm "" : SIMDMADD<F32x4, 0x105, 0x106>;
-defm "" : SIMDMADD<F64x2, 0x107, 0x108>;
+defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>;
+defm "" : SIMDMADD<F64x2, 0x107, 0x108, [HasRelaxedSIMD]>;
+defm "" : SIMDMADD<F16x8, 0x146, 0x147, [HasHalfPrecision]>;
//===----------------------------------------------------------------------===//
// Laneselect
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
index 2180f57c106a..2ab5bcdd838d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
@@ -56,8 +56,8 @@ public:
AU.setPreservesCFG();
AU.addRequired<MachineBlockFrequencyInfo>();
AU.addPreserved<MachineBlockFrequencyInfo>();
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.addRequired<LiveIntervals>();
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveIntervals>();
@@ -180,7 +180,7 @@ bool WebAssemblyMemIntrinsicResults::runOnMachineFunction(MachineFunction &MF) {
});
MachineRegisterInfo &MRI = MF.getRegInfo();
- auto &MDT = getAnalysis<MachineDominatorTree>();
+ auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
const WebAssemblyTargetLowering &TLI =
*MF.getSubtarget<WebAssemblySubtarget>().getTargetLowering();
const auto &LibInfo =
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index d4edb6bf18d9..e38905c20b83 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -48,13 +48,13 @@ class WebAssemblyRegStackify final : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
AU.addRequired<LiveIntervals>();
AU.addPreserved<MachineBlockFrequencyInfo>();
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveIntervals>();
AU.addPreservedID(LiveVariablesID);
- AU.addPreserved<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -813,7 +813,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
const auto *TRI = MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
- auto &MDT = getAnalysis<MachineDominatorTree>();
+ auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
auto &LIS = getAnalysis<LiveIntervals>();
// Walk the instructions from the bottom up. Currently we don't look past
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index d9936557776b..20e50c8c9e1a 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -201,6 +201,9 @@ struct RuntimeLibcallSignatureTable {
Table[RTLIB::COS_F32] = f32_func_f32;
Table[RTLIB::COS_F64] = f64_func_f64;
Table[RTLIB::COS_F128] = i64_i64_func_i64_i64;
+ Table[RTLIB::TAN_F32] = f32_func_f32;
+ Table[RTLIB::TAN_F64] = f64_func_f64;
+ Table[RTLIB::TAN_F128] = i64_i64_func_i64_i64;
Table[RTLIB::SINCOS_F32] = func_f32_iPTR_iPTR;
Table[RTLIB::SINCOS_F64] = func_f64_iPTR_iPTR;
Table[RTLIB::SINCOS_F128] = func_i64_i64_iPTR_iPTR;
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 662310610931..dbea42d55b5f 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -88,15 +88,17 @@ class X86AsmParser : public MCTargetAsmParser {
bool Code16GCC;
unsigned ForcedDataPrefix = 0;
- enum VEXEncoding {
- VEXEncoding_Default,
- VEXEncoding_VEX,
- VEXEncoding_VEX2,
- VEXEncoding_VEX3,
- VEXEncoding_EVEX,
+ enum OpcodePrefix {
+ OpcodePrefix_Default,
+ OpcodePrefix_REX,
+ OpcodePrefix_REX2,
+ OpcodePrefix_VEX,
+ OpcodePrefix_VEX2,
+ OpcodePrefix_VEX3,
+ OpcodePrefix_EVEX,
};
- VEXEncoding ForcedVEXEncoding = VEXEncoding_Default;
+ OpcodePrefix ForcedOpcodePrefix = OpcodePrefix_Default;
enum DispEncoding {
DispEncoding_Default,
@@ -1197,12 +1199,11 @@ private:
bool ErrorMissingFeature(SMLoc IDLoc, const FeatureBitset &MissingFeatures,
bool MatchingInlineAsm);
- bool MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
+ bool matchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, MCInst &Inst,
OperandVector &Operands, MCStreamer &Out,
- uint64_t &ErrorInfo,
- bool MatchingInlineAsm);
+ uint64_t &ErrorInfo, bool MatchingInlineAsm);
- bool MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
+ bool matchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, MCInst &Inst,
OperandVector &Operands, MCStreamer &Out,
uint64_t &ErrorInfo,
bool MatchingInlineAsm);
@@ -2317,7 +2318,7 @@ bool X86AsmParser::parseCFlagsOp(OperandVector &Operands) {
return Error(Tok.getLoc(), "Expected { at this point");
Parser.Lex(); // Eat "{"
Tok = Parser.getTok();
- if (Tok.getIdentifier() != "dfv")
+ if (Tok.getIdentifier().lower() != "dfv")
return Error(Tok.getLoc(), "Expected dfv at this point");
Parser.Lex(); // Eat "dfv"
Tok = Parser.getTok();
@@ -2337,7 +2338,7 @@ bool X86AsmParser::parseCFlagsOp(OperandVector &Operands) {
unsigned CFlags = 0;
for (unsigned I = 0; I < 4; ++I) {
Tok = Parser.getTok();
- unsigned CFlag = StringSwitch<unsigned>(Tok.getIdentifier())
+ unsigned CFlag = StringSwitch<unsigned>(Tok.getIdentifier().lower())
.Case("of", 0x8)
.Case("sf", 0x4)
.Case("zf", 0x2)
@@ -3186,7 +3187,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
InstInfo = &Info;
// Reset the forced VEX encoding.
- ForcedVEXEncoding = VEXEncoding_Default;
+ ForcedOpcodePrefix = OpcodePrefix_Default;
ForcedDispEncoding = DispEncoding_Default;
UseApxExtendedReg = false;
ForcedNoFlag = false;
@@ -3202,14 +3203,18 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
return Error(Parser.getTok().getLoc(), "Expected '}'");
Parser.Lex(); // Eat curly.
- if (Prefix == "vex")
- ForcedVEXEncoding = VEXEncoding_VEX;
+ if (Prefix == "rex")
+ ForcedOpcodePrefix = OpcodePrefix_REX;
+ else if (Prefix == "rex2")
+ ForcedOpcodePrefix = OpcodePrefix_REX2;
+ else if (Prefix == "vex")
+ ForcedOpcodePrefix = OpcodePrefix_VEX;
else if (Prefix == "vex2")
- ForcedVEXEncoding = VEXEncoding_VEX2;
+ ForcedOpcodePrefix = OpcodePrefix_VEX2;
else if (Prefix == "vex3")
- ForcedVEXEncoding = VEXEncoding_VEX3;
+ ForcedOpcodePrefix = OpcodePrefix_VEX3;
else if (Prefix == "evex")
- ForcedVEXEncoding = VEXEncoding_EVEX;
+ ForcedOpcodePrefix = OpcodePrefix_EVEX;
else if (Prefix == "disp8")
ForcedDispEncoding = DispEncoding_Disp8;
else if (Prefix == "disp32")
@@ -3235,15 +3240,15 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// Parse MASM style pseudo prefixes.
if (isParsingMSInlineAsm()) {
if (Name.equals_insensitive("vex"))
- ForcedVEXEncoding = VEXEncoding_VEX;
+ ForcedOpcodePrefix = OpcodePrefix_VEX;
else if (Name.equals_insensitive("vex2"))
- ForcedVEXEncoding = VEXEncoding_VEX2;
+ ForcedOpcodePrefix = OpcodePrefix_VEX2;
else if (Name.equals_insensitive("vex3"))
- ForcedVEXEncoding = VEXEncoding_VEX3;
+ ForcedOpcodePrefix = OpcodePrefix_VEX3;
else if (Name.equals_insensitive("evex"))
- ForcedVEXEncoding = VEXEncoding_EVEX;
+ ForcedOpcodePrefix = OpcodePrefix_EVEX;
- if (ForcedVEXEncoding != VEXEncoding_Default) {
+ if (ForcedOpcodePrefix != OpcodePrefix_Default) {
if (getLexer().isNot(AsmToken::Identifier))
return Error(Parser.getTok().getLoc(), "Expected identifier");
// FIXME: The mnemonic won't match correctly if its not in lower case.
@@ -3741,7 +3746,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
}
bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
- if (ForcedVEXEncoding != VEXEncoding_VEX3 &&
+ if (ForcedOpcodePrefix != OpcodePrefix_VEX3 &&
X86::optimizeInstFromVEX3ToVEX2(Inst, MII.get(Inst.getOpcode())))
return true;
@@ -4002,15 +4007,59 @@ void X86AsmParser::emitInstruction(MCInst &Inst, OperandVector &Operands,
applyLVILoadHardeningMitigation(Inst, Out);
}
+static unsigned getPrefixes(OperandVector &Operands) {
+ unsigned Result = 0;
+ X86Operand &Prefix = static_cast<X86Operand &>(*Operands.back());
+ if (Prefix.isPrefix()) {
+ Result = Prefix.getPrefix();
+ Operands.pop_back();
+ }
+ return Result;
+}
+
bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out, uint64_t &ErrorInfo,
bool MatchingInlineAsm) {
- if (isParsingIntelSyntax())
- return MatchAndEmitIntelInstruction(IDLoc, Opcode, Operands, Out, ErrorInfo,
- MatchingInlineAsm);
- return MatchAndEmitATTInstruction(IDLoc, Opcode, Operands, Out, ErrorInfo,
- MatchingInlineAsm);
+ assert(!Operands.empty() && "Unexpect empty operand list!");
+ assert((*Operands[0]).isToken() && "Leading operand should always be a mnemonic!");
+
+ // First, handle aliases that expand to multiple instructions.
+ MatchFPUWaitAlias(IDLoc, static_cast<X86Operand &>(*Operands[0]), Operands,
+ Out, MatchingInlineAsm);
+ unsigned Prefixes = getPrefixes(Operands);
+
+ MCInst Inst;
+
+ // If REX/REX2/VEX/EVEX encoding is forced, we need to pass the USE_* flag to
+ // the encoder and printer.
+ if (ForcedOpcodePrefix == OpcodePrefix_REX)
+ Prefixes |= X86::IP_USE_REX;
+ else if (ForcedOpcodePrefix == OpcodePrefix_REX2)
+ Prefixes |= X86::IP_USE_REX2;
+ else if (ForcedOpcodePrefix == OpcodePrefix_VEX)
+ Prefixes |= X86::IP_USE_VEX;
+ else if (ForcedOpcodePrefix == OpcodePrefix_VEX2)
+ Prefixes |= X86::IP_USE_VEX2;
+ else if (ForcedOpcodePrefix == OpcodePrefix_VEX3)
+ Prefixes |= X86::IP_USE_VEX3;
+ else if (ForcedOpcodePrefix == OpcodePrefix_EVEX)
+ Prefixes |= X86::IP_USE_EVEX;
+
+ // Set encoded flags for {disp8} and {disp32}.
+ if (ForcedDispEncoding == DispEncoding_Disp8)
+ Prefixes |= X86::IP_USE_DISP8;
+ else if (ForcedDispEncoding == DispEncoding_Disp32)
+ Prefixes |= X86::IP_USE_DISP32;
+
+ if (Prefixes)
+ Inst.setFlags(Prefixes);
+
+ return isParsingIntelSyntax()
+ ? matchAndEmitIntelInstruction(IDLoc, Opcode, Inst, Operands, Out,
+ ErrorInfo, MatchingInlineAsm)
+ : matchAndEmitATTInstruction(IDLoc, Opcode, Inst, Operands, Out,
+ ErrorInfo, MatchingInlineAsm);
}
void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op,
@@ -4053,82 +4102,50 @@ bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc,
return Error(IDLoc, OS.str(), SMRange(), MatchingInlineAsm);
}
-static unsigned getPrefixes(OperandVector &Operands) {
- unsigned Result = 0;
- X86Operand &Prefix = static_cast<X86Operand &>(*Operands.back());
- if (Prefix.isPrefix()) {
- Result = Prefix.getPrefix();
- Operands.pop_back();
- }
- return Result;
-}
-
unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) {
unsigned Opc = Inst.getOpcode();
const MCInstrDesc &MCID = MII.get(Opc);
+ uint64_t TSFlags = MCID.TSFlags;
if (UseApxExtendedReg && !X86II::canUseApxExtendedReg(MCID))
return Match_Unsupported;
- if (ForcedNoFlag == !(MCID.TSFlags & X86II::EVEX_NF) && !X86::isCFCMOVCC(Opc))
+ if (ForcedNoFlag == !(TSFlags & X86II::EVEX_NF) && !X86::isCFCMOVCC(Opc))
return Match_Unsupported;
- if (ForcedVEXEncoding == VEXEncoding_EVEX &&
- (MCID.TSFlags & X86II::EncodingMask) != X86II::EVEX)
- return Match_Unsupported;
-
- if ((ForcedVEXEncoding == VEXEncoding_VEX ||
- ForcedVEXEncoding == VEXEncoding_VEX2 ||
- ForcedVEXEncoding == VEXEncoding_VEX3) &&
- (MCID.TSFlags & X86II::EncodingMask) != X86II::VEX)
- return Match_Unsupported;
+ switch (ForcedOpcodePrefix) {
+ case OpcodePrefix_Default:
+ break;
+ case OpcodePrefix_REX:
+ case OpcodePrefix_REX2:
+ if (TSFlags & X86II::EncodingMask)
+ return Match_Unsupported;
+ break;
+ case OpcodePrefix_VEX:
+ case OpcodePrefix_VEX2:
+ case OpcodePrefix_VEX3:
+ if ((TSFlags & X86II::EncodingMask) != X86II::VEX)
+ return Match_Unsupported;
+ break;
+ case OpcodePrefix_EVEX:
+ if ((TSFlags & X86II::EncodingMask) != X86II::EVEX)
+ return Match_Unsupported;
+ break;
+ }
- if ((MCID.TSFlags & X86II::ExplicitOpPrefixMask) ==
- X86II::ExplicitVEXPrefix &&
- (ForcedVEXEncoding != VEXEncoding_VEX &&
- ForcedVEXEncoding != VEXEncoding_VEX2 &&
- ForcedVEXEncoding != VEXEncoding_VEX3))
+ if ((TSFlags & X86II::ExplicitOpPrefixMask) == X86II::ExplicitVEXPrefix &&
+ (ForcedOpcodePrefix != OpcodePrefix_VEX &&
+ ForcedOpcodePrefix != OpcodePrefix_VEX2 &&
+ ForcedOpcodePrefix != OpcodePrefix_VEX3))
return Match_Unsupported;
return Match_Success;
}
-bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
- OperandVector &Operands,
- MCStreamer &Out,
- uint64_t &ErrorInfo,
- bool MatchingInlineAsm) {
- assert(!Operands.empty() && "Unexpect empty operand list!");
- assert((*Operands[0]).isToken() && "Leading operand should always be a mnemonic!");
- SMRange EmptyRange = std::nullopt;
-
- // First, handle aliases that expand to multiple instructions.
- MatchFPUWaitAlias(IDLoc, static_cast<X86Operand &>(*Operands[0]), Operands,
- Out, MatchingInlineAsm);
+bool X86AsmParser::matchAndEmitATTInstruction(
+ SMLoc IDLoc, unsigned &Opcode, MCInst &Inst, OperandVector &Operands,
+ MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) {
X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
- unsigned Prefixes = getPrefixes(Operands);
-
- MCInst Inst;
-
- // If VEX/EVEX encoding is forced, we need to pass the USE_* flag to the
- // encoder and printer.
- if (ForcedVEXEncoding == VEXEncoding_VEX)
- Prefixes |= X86::IP_USE_VEX;
- else if (ForcedVEXEncoding == VEXEncoding_VEX2)
- Prefixes |= X86::IP_USE_VEX2;
- else if (ForcedVEXEncoding == VEXEncoding_VEX3)
- Prefixes |= X86::IP_USE_VEX3;
- else if (ForcedVEXEncoding == VEXEncoding_EVEX)
- Prefixes |= X86::IP_USE_EVEX;
-
- // Set encoded flags for {disp8} and {disp32}.
- if (ForcedDispEncoding == DispEncoding_Disp8)
- Prefixes |= X86::IP_USE_DISP8;
- else if (ForcedDispEncoding == DispEncoding_Disp32)
- Prefixes |= X86::IP_USE_DISP32;
-
- if (Prefixes)
- Inst.setFlags(Prefixes);
-
+ SMRange EmptyRange = std::nullopt;
// In 16-bit mode, if data32 is specified, temporarily switch to 32-bit mode
// when matching the instruction.
if (ForcedDataPrefix == X86::Is32Bit)
@@ -4350,44 +4367,11 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
return true;
}
-bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
- OperandVector &Operands,
- MCStreamer &Out,
- uint64_t &ErrorInfo,
- bool MatchingInlineAsm) {
- assert(!Operands.empty() && "Unexpect empty operand list!");
- assert((*Operands[0]).isToken() && "Leading operand should always be a mnemonic!");
- StringRef Mnemonic = (static_cast<X86Operand &>(*Operands[0])).getToken();
- SMRange EmptyRange = std::nullopt;
- StringRef Base = (static_cast<X86Operand &>(*Operands[0])).getToken();
- unsigned Prefixes = getPrefixes(Operands);
-
- // First, handle aliases that expand to multiple instructions.
- MatchFPUWaitAlias(IDLoc, static_cast<X86Operand &>(*Operands[0]), Operands, Out, MatchingInlineAsm);
+bool X86AsmParser::matchAndEmitIntelInstruction(
+ SMLoc IDLoc, unsigned &Opcode, MCInst &Inst, OperandVector &Operands,
+ MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) {
X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
-
- MCInst Inst;
-
- // If VEX/EVEX encoding is forced, we need to pass the USE_* flag to the
- // encoder and printer.
- if (ForcedVEXEncoding == VEXEncoding_VEX)
- Prefixes |= X86::IP_USE_VEX;
- else if (ForcedVEXEncoding == VEXEncoding_VEX2)
- Prefixes |= X86::IP_USE_VEX2;
- else if (ForcedVEXEncoding == VEXEncoding_VEX3)
- Prefixes |= X86::IP_USE_VEX3;
- else if (ForcedVEXEncoding == VEXEncoding_EVEX)
- Prefixes |= X86::IP_USE_EVEX;
-
- // Set encoded flags for {disp8} and {disp32}.
- if (ForcedDispEncoding == DispEncoding_Disp8)
- Prefixes |= X86::IP_USE_DISP8;
- else if (ForcedDispEncoding == DispEncoding_Disp32)
- Prefixes |= X86::IP_USE_DISP32;
-
- if (Prefixes)
- Inst.setFlags(Prefixes);
-
+ SMRange EmptyRange = std::nullopt;
// Find one unsized memory operand, if present.
X86Operand *UnsizedMemOp = nullptr;
for (const auto &Op : Operands) {
@@ -4402,6 +4386,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
// Allow some instructions to have implicitly pointer-sized operands. This is
// compatible with gas.
+ StringRef Mnemonic = (static_cast<X86Operand &>(*Operands[0])).getToken();
if (UnsizedMemOp) {
static const char *const PtrSizedInstrs[] = {"call", "jmp", "push"};
for (const char *Instr : PtrSizedInstrs) {
@@ -4415,6 +4400,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
SmallVector<unsigned, 8> Match;
FeatureBitset ErrorInfoMissingFeatures;
FeatureBitset MissingFeatures;
+ StringRef Base = (static_cast<X86Operand &>(*Operands[0])).getToken();
// If unsized push has immediate operand we should default the default pointer
// size for the size.
diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 0ff440bdbe0d..6272e2d270f2 100644
--- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -284,7 +284,10 @@ static int readPrefixes(struct InternalInstruction *insn) {
// it's not mandatory prefix
// 3. if (nextByte >= 0x40 && nextByte <= 0x4f) it's REX and we need
// 0x0f exactly after it to be mandatory prefix
- if (isREX(insn, nextByte) || nextByte == 0x0f || nextByte == 0x66)
+ // 4. if (nextByte == 0xd5) it's REX2 and we need
+ // 0x0f exactly after it to be mandatory prefix
+ if (isREX(insn, nextByte) || isREX2(insn, nextByte) || nextByte == 0x0f ||
+ nextByte == 0x66)
// The last of 0xf2 /0xf3 is mandatory prefix
insn->mandatoryPrefix = byte;
insn->repeatPrefix = byte;
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 472f34a4efdb..c2188d206b5f 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -125,9 +125,10 @@ class X86AsmBackend : public MCAsmBackend {
unsigned TargetPrefixMax = 0;
MCInst PrevInst;
+ unsigned PrevInstOpcode = 0;
MCBoundaryAlignFragment *PendingBA = nullptr;
std::pair<MCFragment *, size_t> PrevInstPosition;
- bool CanPadInst = false;
+ bool IsRightAfterData = false;
uint8_t determinePaddingPrefix(const MCInst &Inst) const;
bool isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const;
@@ -267,8 +268,8 @@ static bool isRIPRelative(const MCInst &MI, const MCInstrInfo &MCII) {
}
/// Check if the instruction is a prefix.
-static bool isPrefix(const MCInst &MI, const MCInstrInfo &MCII) {
- return X86II::isPrefix(MCII.get(MI.getOpcode()).TSFlags);
+static bool isPrefix(unsigned Opcode, const MCInstrInfo &MCII) {
+ return X86II::isPrefix(MCII.get(Opcode).TSFlags);
}
/// Check if the instruction is valid as the first instruction in macro fusion.
@@ -382,9 +383,9 @@ bool X86AsmBackend::allowEnhancedRelaxation() const {
/// X86 has certain instructions which enable interrupts exactly one
/// instruction *after* the instruction which stores to SS. Return true if the
-/// given instruction has such an interrupt delay slot.
-static bool hasInterruptDelaySlot(const MCInst &Inst) {
- switch (Inst.getOpcode()) {
+/// given instruction may have such an interrupt delay slot.
+static bool mayHaveInterruptDelaySlot(unsigned InstOpcode) {
+ switch (InstOpcode) {
case X86::POPSS16:
case X86::POPSS32:
case X86::STI:
@@ -394,9 +395,9 @@ static bool hasInterruptDelaySlot(const MCInst &Inst) {
case X86::MOV32sr:
case X86::MOV64sr:
case X86::MOV16sm:
- if (Inst.getOperand(0).getReg() == X86::SS)
- return true;
- break;
+ // In fact, this is only the case if the first operand is SS. However, as
+ // segment moves occur extremely rarely, this is just a minor pessimization.
+ return true;
}
return false;
}
@@ -406,16 +407,10 @@ static bool
isRightAfterData(MCFragment *CurrentFragment,
const std::pair<MCFragment *, size_t> &PrevInstPosition) {
MCFragment *F = CurrentFragment;
- // Empty data fragments may be created to prevent further data being
- // added into the previous fragment, we need to skip them since they
- // have no contents.
- for (; isa_and_nonnull<MCDataFragment>(F); F = F->getPrevNode())
- if (cast<MCDataFragment>(F)->getContents().size() != 0)
- break;
-
// Since data is always emitted into a DataFragment, our check strategy is
// simple here.
// - If the fragment is a DataFragment
+ // - If it's empty (section start or data after align), return false.
// - If it's not the fragment where the previous instruction is,
// returns true.
// - If it's the fragment holding the previous instruction but its
@@ -424,8 +419,9 @@ isRightAfterData(MCFragment *CurrentFragment,
// - Otherwise returns false.
// - If the fragment is not a DataFragment, returns false.
if (auto *DF = dyn_cast_or_null<MCDataFragment>(F))
- return DF != PrevInstPosition.first ||
- DF->getContents().size() != PrevInstPosition.second;
+ return DF->getContents().size() &&
+ (DF != PrevInstPosition.first ||
+ DF->getContents().size() != PrevInstPosition.second);
return false;
}
@@ -455,22 +451,22 @@ bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const {
// TLSCALL).
return false;
- if (hasInterruptDelaySlot(PrevInst))
+ if (mayHaveInterruptDelaySlot(PrevInstOpcode))
// If this instruction follows an interrupt enabling instruction with a one
// instruction delay, inserting a nop would change behavior.
return false;
- if (isPrefix(PrevInst, *MCII))
+ if (isPrefix(PrevInstOpcode, *MCII))
// If this instruction follows a prefix, inserting a nop/prefix would change
// semantic.
return false;
- if (isPrefix(Inst, *MCII))
+ if (isPrefix(Inst.getOpcode(), *MCII))
// If this instruction is a prefix, inserting a prefix would change
// semantic.
return false;
- if (isRightAfterData(OS.getCurrentFragment(), PrevInstPosition))
+ if (IsRightAfterData)
// If this instruction follows any data, there is no clear
// instruction boundary, inserting a nop/prefix would change semantic.
return false;
@@ -484,7 +480,7 @@ bool X86AsmBackend::canPadBranches(MCObjectStreamer &OS) const {
assert(allowAutoPadding() && "incorrect initialization!");
// We only pad in text section.
- if (!OS.getCurrentSectionOnly()->getKind().isText())
+ if (!OS.getCurrentSectionOnly()->isText())
return false;
// To be Done: Currently don't deal with Bundle cases.
@@ -514,19 +510,27 @@ bool X86AsmBackend::needAlign(const MCInst &Inst) const {
/// Insert BoundaryAlignFragment before instructions to align branches.
void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
const MCInst &Inst, const MCSubtargetInfo &STI) {
- CanPadInst = canPadInst(Inst, OS);
+ // Used by canPadInst. Done here, because in emitInstructionEnd, the current
+ // fragment will have changed.
+ IsRightAfterData =
+ isRightAfterData(OS.getCurrentFragment(), PrevInstPosition);
if (!canPadBranches(OS))
return;
+ // NB: PrevInst only valid if canPadBranches is true.
if (!isMacroFused(PrevInst, Inst))
// Macro fusion doesn't happen indeed, clear the pending.
PendingBA = nullptr;
- if (!CanPadInst)
+ // When branch padding is enabled (basically the skx102 erratum => unlikely),
+ // we call canPadInst (not cheap) twice. However, in the common case, we can
+ // avoid unnecessary calls to that, as this is otherwise only used for
+ // relaxable fragments.
+ if (!canPadInst(Inst, OS))
return;
- if (PendingBA && OS.getCurrentFragment()->getPrevNode() == PendingBA) {
+ if (PendingBA && PendingBA->getNext() == OS.getCurrentFragment()) {
// Macro fusion actually happens and there is no other fragment inserted
// after the previous instruction.
//
@@ -552,21 +556,29 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
isFirstMacroFusibleInst(Inst, *MCII))) {
// If we meet a unfused branch or the first instuction in a fusiable pair,
// insert a BoundaryAlign fragment.
- OS.insert(PendingBA = new MCBoundaryAlignFragment(AlignBoundary, STI));
+ PendingBA = OS.getContext().allocFragment<MCBoundaryAlignFragment>(
+ AlignBoundary, STI);
+ OS.insert(PendingBA);
}
}
/// Set the last fragment to be aligned for the BoundaryAlignFragment.
-void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst) {
- PrevInst = Inst;
+void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS,
+ const MCInst &Inst) {
MCFragment *CF = OS.getCurrentFragment();
- PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF));
if (auto *F = dyn_cast_or_null<MCRelaxableFragment>(CF))
- F->setAllowAutoPadding(CanPadInst);
+ F->setAllowAutoPadding(canPadInst(Inst, OS));
+
+ // Update PrevInstOpcode here, canPadInst() reads that.
+ PrevInstOpcode = Inst.getOpcode();
+ PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF));
if (!canPadBranches(OS))
return;
+ // PrevInst is only needed if canPadBranches. Copying an MCInst isn't cheap.
+ PrevInst = Inst;
+
if (!needAlign(Inst) || !PendingBA)
return;
@@ -579,7 +591,7 @@ void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst)
// MCAssembler::relaxBoundaryAlign. The easiest way is to insert a new empty
// DataFragment.
if (isa_and_nonnull<MCDataFragment>(CF))
- OS.insert(new MCDataFragment());
+ OS.insert(OS.getContext().allocFragment<MCDataFragment>());
// Update the maximum alignment on the current section if necessary.
MCSection *Sec = OS.getCurrentSectionOnly();
@@ -866,7 +878,7 @@ void X86AsmBackend::finishLayout(MCAssembler const &Asm,
LabeledFragments.insert(S.getFragment(false));
for (MCSection &Sec : Asm) {
- if (!Sec.getKind().isText())
+ if (!Sec.isText())
continue;
SmallVector<MCRelaxableFragment *, 4> Relaxable;
@@ -968,8 +980,8 @@ void X86AsmBackend::finishLayout(MCAssembler const &Asm,
// The layout is done. Mark every fragment as valid.
for (unsigned int i = 0, n = Layout.getSectionOrder().size(); i != n; ++i) {
MCSection &Section = *Layout.getSectionOrder()[i];
- Layout.getFragmentOffset(&*Section.getFragmentList().rbegin());
- Asm.computeFragmentSize(Layout, *Section.getFragmentList().rbegin());
+ Layout.getFragmentOffset(&*Section.curFragList()->Tail);
+ Asm.computeFragmentSize(Layout, *Section.curFragList()->Tail);
}
}
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 8e4015783641..a89408bb79b0 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -56,12 +56,14 @@ enum IPREFIXES {
IP_HAS_REPEAT = 1U << 3,
IP_HAS_LOCK = 1U << 4,
IP_HAS_NOTRACK = 1U << 5,
- IP_USE_VEX = 1U << 6,
- IP_USE_VEX2 = 1U << 7,
- IP_USE_VEX3 = 1U << 8,
- IP_USE_EVEX = 1U << 9,
- IP_USE_DISP8 = 1U << 10,
- IP_USE_DISP32 = 1U << 11,
+ IP_USE_REX = 1U << 6,
+ IP_USE_REX2 = 1U << 7,
+ IP_USE_VEX = 1U << 8,
+ IP_USE_VEX2 = 1U << 9,
+ IP_USE_VEX3 = 1U << 10,
+ IP_USE_EVEX = 1U << 11,
+ IP_USE_DISP8 = 1U << 12,
+ IP_USE_DISP32 = 1U << 13,
};
enum OperandType : unsigned {
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index b4633b91bee3..72219c136c7e 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -1365,7 +1365,10 @@ PrefixKind X86MCCodeEmitter::emitREXPrefix(int MemOperand, const MCInst &MI,
}
}
}
- if ((TSFlags & X86II::ExplicitOpPrefixMask) == X86II::ExplicitREX2Prefix)
+ if (MI.getFlags() & X86::IP_USE_REX)
+ Prefix.setLowerBound(REX);
+ if ((TSFlags & X86II::ExplicitOpPrefixMask) == X86II::ExplicitREX2Prefix ||
+ MI.getFlags() & X86::IP_USE_REX2)
Prefix.setLowerBound(REX2);
switch (TSFlags & X86II::FormMask) {
default:
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 628ff560017e..68b78c7c4477 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -346,6 +346,8 @@ def FeatureNF : SubtargetFeature<"nf", "HasNF", "true",
"Support status flags update suppression">;
def FeatureCF : SubtargetFeature<"cf", "HasCF", "true",
"Support conditional faulting">;
+def FeatureZU : SubtargetFeature<"zu", "HasZU", "true",
+ "Support zero-upper SETcc/IMUL">;
def FeatureUseGPR32InInlineAsm
: SubtargetFeature<"inline-asm-use-gpr32", "UseInlineAsmGPR32", "true",
"Enable use of GPR32 in inline assembly for APX">;
diff --git a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
index 9819bfd12985..d979517e12af 100644
--- a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
+++ b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
@@ -50,8 +50,7 @@ Error X86CodeGenPassBuilder::addInstSelector(AddMachinePass &addPass) const {
} // namespace
-void X86TargetMachine::registerPassBuilderCallbacks(
- PassBuilder &PB, bool PopulateClassToPassNames) {
+void X86TargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
#define GET_PASS_REGISTRY "X86PassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"
}
diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index 11b2155e3f98..7343af1bdc9a 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -174,29 +174,6 @@ static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
return true;
}
-static bool isRedundantNewDataDest(MachineInstr &MI, const X86Subtarget &ST) {
- // $rbx = ADD64rr_ND $rbx, $rax / $rbx = ADD64rr_ND $rax, $rbx
- // ->
- // $rbx = ADD64rr $rbx, $rax
- const MCInstrDesc &Desc = MI.getDesc();
- Register Reg0 = MI.getOperand(0).getReg();
- const MachineOperand &Op1 = MI.getOperand(1);
- if (!Op1.isReg() || X86::getFirstAddrOperandIdx(MI) == 1 ||
- X86::isCFCMOVCC(MI.getOpcode()))
- return false;
- Register Reg1 = Op1.getReg();
- if (Reg1 == Reg0)
- return true;
-
- // Op1 and Op2 may be commutable for ND instructions.
- if (!Desc.isCommutable() || Desc.getNumOperands() < 3 ||
- !MI.getOperand(2).isReg() || MI.getOperand(2).getReg() != Reg0)
- return false;
- // Opcode may change after commute, e.g. SHRD -> SHLD
- ST.getInstrInfo()->commuteInstruction(MI, false, 1, 2);
- return true;
-}
-
static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) {
uint64_t TSFlags = MI.getDesc().TSFlags;
@@ -208,6 +185,30 @@ static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) {
if (TSFlags & (X86II::EVEX_K | X86II::EVEX_L2))
return false;
+ auto IsRedundantNewDataDest = [&](unsigned &Opc) {
+ // $rbx = ADD64rr_ND $rbx, $rax / $rbx = ADD64rr_ND $rax, $rbx
+ // ->
+ // $rbx = ADD64rr $rbx, $rax
+ const MCInstrDesc &Desc = MI.getDesc();
+ Register Reg0 = MI.getOperand(0).getReg();
+ const MachineOperand &Op1 = MI.getOperand(1);
+ if (!Op1.isReg() || X86::getFirstAddrOperandIdx(MI) == 1 ||
+ X86::isCFCMOVCC(MI.getOpcode()))
+ return false;
+ Register Reg1 = Op1.getReg();
+ if (Reg1 == Reg0)
+ return true;
+
+ // Op1 and Op2 may be commutable for ND instructions.
+ if (!Desc.isCommutable() || Desc.getNumOperands() < 3 ||
+ !MI.getOperand(2).isReg() || MI.getOperand(2).getReg() != Reg0)
+ return false;
+ // Opcode may change after commute, e.g. SHRD -> SHLD
+ ST.getInstrInfo()->commuteInstruction(MI, false, 1, 2);
+ Opc = MI.getOpcode();
+ return true;
+ };
+
// EVEX_B has several meanings.
// AVX512:
// register form: rounding control or SAE
@@ -218,40 +219,36 @@ static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) {
//
// For AVX512 cases, EVEX prefix is needed in order to carry this information
// thus preventing the transformation to VEX encoding.
- unsigned Opc = MI.getOpcode();
bool IsND = X86II::hasNewDataDest(TSFlags);
if (TSFlags & X86II::EVEX_B && !IsND)
return false;
+ unsigned Opc = MI.getOpcode();
// MOVBE*rr is special because it has semantic of NDD but not set EVEX_B.
bool IsNDLike = IsND || Opc == X86::MOVBE32rr || Opc == X86::MOVBE64rr;
- bool IsRedundantNDD = IsNDLike ? isRedundantNewDataDest(MI, ST) : false;
- // NonNF -> NF only if it's not a compressible NDD instruction and eflags is
- // dead.
- unsigned NFOpc = (ST.hasNF() && !IsRedundantNDD &&
- MI.registerDefIsDead(X86::EFLAGS, /*TRI=*/nullptr))
- ? X86::getNFVariant(Opc)
- : 0U;
- if (IsNDLike && !IsRedundantNDD && !NFOpc)
- return false;
+ bool IsRedundantNDD = IsNDLike ? IsRedundantNewDataDest(Opc) : false;
- unsigned NewOpc = NFOpc;
- if (!NewOpc) {
+ auto GetCompressedOpc = [&](unsigned Opc) -> unsigned {
ArrayRef<X86TableEntry> Table = ArrayRef(X86CompressEVEXTable);
-
- Opc = MI.getOpcode();
const auto I = llvm::lower_bound(Table, Opc);
- if (I == Table.end() || I->OldOpc != Opc) {
- assert(!IsNDLike && "Missing entry for ND-like instruction");
- return false;
- }
-
- if (!IsNDLike) {
- if (usesExtendedRegister(MI) || !checkPredicate(I->NewOpc, &ST) ||
- !performCustomAdjustments(MI, I->NewOpc))
- return false;
- }
- NewOpc = I->NewOpc;
- }
+ if (I == Table.end() || I->OldOpc != Opc)
+ return 0;
+
+ if (usesExtendedRegister(MI) || !checkPredicate(I->NewOpc, &ST) ||
+ !performCustomAdjustments(MI, I->NewOpc))
+ return 0;
+ return I->NewOpc;
+ };
+ // NonNF -> NF only if it's not a compressible NDD instruction and eflags is
+ // dead.
+ unsigned NewOpc = IsRedundantNDD
+ ? X86::getNonNDVariant(Opc)
+ : ((IsNDLike && ST.hasNF() &&
+ MI.registerDefIsDead(X86::EFLAGS, /*TRI=*/nullptr))
+ ? X86::getNFVariant(Opc)
+ : GetCompressedOpc(Opc));
+
+ if (!NewOpc)
+ return false;
const MCInstrDesc &NewDesc = ST.getInstrInfo()->get(NewOpc);
MI.setDesc(NewDesc);
diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
index 7708089074c0..d50a4d3b23ae 100644
--- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
@@ -653,28 +653,20 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
}
bool X86FastPreTileConfig::runOnMachineFunction(MachineFunction &MFunc) {
+ X86FI = MFunc.getInfo<X86MachineFunctionInfo>();
+ // Early exit in the common case of non-AMX code.
+ if (X86FI->getAMXProgModel() != AMXProgModelEnum::ManagedRA)
+ return false;
+
MF = &MFunc;
MRI = &MFunc.getRegInfo();
ST = &MFunc.getSubtarget<X86Subtarget>();
TII = ST->getInstrInfo();
- X86FI = MFunc.getInfo<X86MachineFunctionInfo>();
MFI = &MFunc.getFrameInfo();
TRI = ST->getRegisterInfo();
CfgSS = -1;
unsigned NumVirtRegs = MRI->getNumVirtRegs();
- // Abandon early if there is no tile register to config.
- bool HasVirtTileReg = false;
- for (unsigned I = 0, E = NumVirtRegs; I != E; ++I) {
- Register VirtReg = Register::index2VirtReg(I);
- if (!MRI->reg_nodbg_empty(VirtReg) &&
- MRI->getRegClass(VirtReg)->getID() == X86::TILERegClassID) {
- HasVirtTileReg = true;
- break;
- }
- }
- if (!HasVirtTileReg)
- return false;
StackSlotForVirtReg.resize(NumVirtRegs);
MayLiveAcrossBlocks.clear();
diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp
index 2a20cd13791d..70bc11228be6 100644
--- a/llvm/lib/Target/X86/X86FastTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp
@@ -161,19 +161,20 @@ bool X86FastTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
}
}
- if (Change)
- X86FI->setHasVirtualTileReg(true);
-
return Change;
}
bool X86FastTileConfig::runOnMachineFunction(MachineFunction &MFunc) {
+ X86FI = MFunc.getInfo<X86MachineFunctionInfo>();
+ // Early exit in the common case of non-AMX code.
+ if (X86FI->getAMXProgModel() != AMXProgModelEnum::ManagedRA)
+ return false;
+
MF = &MFunc;
MRI = &MFunc.getRegInfo();
const TargetSubtargetInfo *ST = &MFunc.getSubtarget<X86Subtarget>();
TRI = ST->getRegisterInfo();
TII = MFunc.getSubtarget().getInstrInfo();
- X86FI = MFunc.getInfo<X86MachineFunctionInfo>();
bool Change = false;
// Loop over all of the basic blocks, eliminating virtual register references
diff --git a/llvm/lib/Target/X86/X86FixupSetCC.cpp b/llvm/lib/Target/X86/X86FixupSetCC.cpp
index 5c7105988070..2de89947c451 100644
--- a/llvm/lib/Target/X86/X86FixupSetCC.cpp
+++ b/llvm/lib/Target/X86/X86FixupSetCC.cpp
@@ -1,4 +1,4 @@
-//===---- X86FixupSetCC.cpp - optimize usage of LEA instructions ----------===//
+//===- X86FixupSetCC.cpp - fix zero-extension of setcc patterns -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -17,6 +17,11 @@
// performed by the setcc. Instead, we can use:
// xor %eax, %eax; seta %al
// This both avoids the stall, and encodes shorter.
+//
+// Furthurmore, we can use:
+// setzua %al
+// if feature zero-upper is available. It's faster than the xor+setcc sequence.
+// When r16-r31 is used, it even encodes shorter.
//===----------------------------------------------------------------------===//
#include "X86.h"
@@ -46,6 +51,7 @@ public:
private:
MachineRegisterInfo *MRI = nullptr;
+ const X86Subtarget *ST = nullptr;
const X86InstrInfo *TII = nullptr;
enum { SearchBound = 16 };
@@ -61,7 +67,8 @@ FunctionPass *llvm::createX86FixupSetCC() { return new X86FixupSetCCPass(); }
bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
MRI = &MF.getRegInfo();
- TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+ ST = &MF.getSubtarget<X86Subtarget>();
+ TII = ST->getInstrInfo();
SmallVector<MachineInstr*, 4> ToErase;
@@ -79,7 +86,8 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
continue;
MachineInstr *ZExt = nullptr;
- for (auto &Use : MRI->use_instructions(MI.getOperand(0).getReg()))
+ Register Reg0 = MI.getOperand(0).getReg();
+ for (auto &Use : MRI->use_instructions(Reg0))
if (Use.getOpcode() == X86::MOVZX32rr8)
ZExt = &Use;
@@ -98,9 +106,8 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
continue;
// On 32-bit, we need to be careful to force an ABCD register.
- const TargetRegisterClass *RC = MF.getSubtarget<X86Subtarget>().is64Bit()
- ? &X86::GR32RegClass
- : &X86::GR32_ABCDRegClass;
+ const TargetRegisterClass *RC =
+ ST->is64Bit() ? &X86::GR32RegClass : &X86::GR32_ABCDRegClass;
if (!MRI->constrainRegClass(ZExt->getOperand(0).getReg(), RC)) {
// If we cannot constrain the register, we would need an additional copy
// and are better off keeping the MOVZX32rr8 we have now.
@@ -110,17 +117,24 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
++NumSubstZexts;
Changed = true;
- // Initialize a register with 0. This must go before the eflags def
+ // X86 setcc/setzucc only takes an output GR8, so fake a GR32 input by
+ // inserting the setcc/setzucc result into the low byte of the zeroed
+ // register.
Register ZeroReg = MRI->createVirtualRegister(RC);
- BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0),
- ZeroReg);
+ if (ST->hasZU()) {
+ MI.setDesc(TII->get(X86::SETZUCCr));
+ BuildMI(*ZExt->getParent(), ZExt, ZExt->getDebugLoc(),
+ TII->get(TargetOpcode::IMPLICIT_DEF), ZeroReg);
+ } else {
+ // Initialize a register with 0. This must go before the eflags def
+ BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0),
+ ZeroReg);
+ }
- // X86 setcc only takes an output GR8, so fake a GR32 input by inserting
- // the setcc result into the low byte of the zeroed register.
BuildMI(*ZExt->getParent(), ZExt, ZExt->getDebugLoc(),
TII->get(X86::INSERT_SUBREG), ZExt->getOperand(0).getReg())
.addReg(ZeroReg)
- .addReg(MI.getOperand(0).getReg())
+ .addReg(Reg0)
.addImm(X86::sub_8bit);
ToErase.push_back(ZExt);
}
diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
index d6d077363f6f..394947bc65c8 100644
--- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -24,6 +24,7 @@
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
+#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
@@ -127,7 +128,7 @@ FunctionPass *llvm::createX86FlagsCopyLoweringPass() {
char X86FlagsCopyLoweringPass::ID = 0;
void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -257,7 +258,7 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
TII = Subtarget->getInstrInfo();
TRI = Subtarget->getRegisterInfo();
- MDT = &getAnalysis<MachineDominatorTree>();
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
PromoteRC = &X86::GR8RegClass;
if (MF.empty())
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 4521401d8741..76bcf875f00e 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -2594,7 +2594,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
// Emit tilerelease for AMX kernel.
- if (X86FI->hasVirtualTileReg())
+ if (X86FI->getAMXProgModel() == AMXProgModelEnum::ManagedRA)
BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2aec14e93d08..3bbf009a1def 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2500,6 +2500,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(Op, MVT::f32, Promote);
// clang-format on
+ // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
+ // it, but it's just a wrapper around ldexp.
+ if (Subtarget.isOSWindows()) {
+ for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
+ if (isOperationExpand(Op, MVT::f32))
+ setOperationAction(Op, MVT::f32, Promote);
+ }
+
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
ISD::SCALAR_TO_VECTOR,
@@ -2516,6 +2524,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
ISD::SRL,
ISD::OR,
ISD::AND,
+ ISD::AVGCEILS,
+ ISD::AVGCEILU,
+ ISD::AVGFLOORS,
+ ISD::AVGFLOORU,
ISD::BITREVERSE,
ISD::ADD,
ISD::FADD,
@@ -4179,8 +4191,7 @@ static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
const SDLoc &dl) {
// Make sure we only try to split 256/512-bit types to avoid creating
// narrow vectors.
- EVT VT = Op.getValueType();
- (void)VT;
+ [[maybe_unused]] EVT VT = Op.getValueType();
assert((Op.getOperand(0).getValueType().is256BitVector() ||
Op.getOperand(0).getValueType().is512BitVector()) &&
(VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
@@ -4195,8 +4206,7 @@ static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
const SDLoc &dl) {
// Assert that all the types match.
- EVT VT = Op.getValueType();
- (void)VT;
+ [[maybe_unused]] EVT VT = Op.getValueType();
assert(Op.getOperand(0).getValueType() == VT &&
Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
@@ -7326,9 +7336,8 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
unsigned SeqLen = Sequence.size();
bool UpperZeroOrUndef =
SeqLen == 1 ||
- llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) {
- return !V || V.isUndef() || isNullConstant(V);
- });
+ llvm::all_of(ArrayRef(Sequence).drop_front(),
+ [](SDValue V) { return !V || isNullConstantOrUndef(V); });
SDValue Op0 = Sequence[0];
if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
(Op0.getOpcode() == ISD::ZERO_EXTEND &&
@@ -13350,11 +13359,11 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
SmallVector<int, 4> LoInputs;
copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
array_pod_sort(LoInputs.begin(), LoInputs.end());
- LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
+ LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
SmallVector<int, 4> HiInputs;
copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
array_pod_sort(HiInputs.begin(), HiInputs.end());
- HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
+ HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
int NumHToL = LoInputs.size() - NumLToL;
int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
@@ -14242,13 +14251,11 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
copy_if(Mask, std::back_inserter(LoInputs),
[](int M) { return M >= 0 && M < 8; });
array_pod_sort(LoInputs.begin(), LoInputs.end());
- LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
- LoInputs.end());
+ LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
SmallVector<int, 4> HiInputs;
copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
array_pod_sort(HiInputs.begin(), HiInputs.end());
- HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
- HiInputs.end());
+ HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
bool TargetLo = LoInputs.size() >= HiInputs.size();
ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
@@ -23410,8 +23417,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
assert(!IsStrict && "Strict SETCC only handles FP operands.");
- MVT VTOp0 = Op0.getSimpleValueType();
- (void)VTOp0;
+ [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType();
assert(VTOp0 == Op1.getSimpleValueType() &&
"Expected operands with same type!");
assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
@@ -23816,6 +23822,20 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
}
}
+ // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for
+ // overflow.
+ if (isMinSignedConstant(Op1)) {
+ EVT VT = Op0.getValueType();
+ if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
+ SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32);
+ X86::CondCode CondCode = CC == ISD::SETEQ ? X86::COND_O : X86::COND_NO;
+ X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
+ SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs,
+ DAG.getConstant(0, dl, VT), Op0);
+ return SDValue(Neg.getNode(), 1);
+ }
+ }
+
// Try to use the carry flag from the add in place of an separate CMP for:
// (seteq (add X, -1), -1). Similar for setne.
if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
@@ -28476,6 +28496,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
// vector pairs, multiply and truncate.
if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumEltsPerLane = NumElts / NumLanes;
if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
(VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
@@ -28489,6 +28511,33 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+ // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
+ // Don't do this if we only need to unpack one half.
+ if (Subtarget.hasSSSE3()) {
+ bool BIsBuildVector = isa<BuildVectorSDNode>(B);
+ bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
+ bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
+ if (BIsBuildVector) {
+ for (auto [Idx, Val] : enumerate(B->ops())) {
+ if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
+ IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
+ else
+ IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
+ }
+ }
+ if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
+ SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
+ SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
+ SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
+ SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
+ SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
+ RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
+ RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
+ DAG.getTargetConstant(8, dl, MVT::i8));
+ return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi));
+ }
+ }
+
// Extract the lo/hi parts to any extend to i16.
// We're going to mask off the low byte of each result element of the
// pmullw, so it doesn't matter what's in the high byte of each 16-bit
@@ -30508,7 +30557,7 @@ static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
bool Not = false;
// Check if we have a NOT
Value *PeekI;
- if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) ||
+ if (match(I, m_Not(m_Value(PeekI))) ||
match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
Not = true;
I = dyn_cast<Instruction>(PeekI);
@@ -32266,6 +32315,54 @@ bool X86TargetLowering::isInlineAsmTargetBranch(
return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
}
+static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Mask) {
+ EVT Ty = MVT::i8;
+ auto V = DAG.getBitcast(MVT::i1, Mask);
+ auto VE = DAG.getZExtOrTrunc(V, DL, Ty);
+ auto Zero = DAG.getConstant(0, DL, Ty);
+ SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32);
+ auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE);
+ return SDValue(CmpZero.getNode(), 1);
+}
+
+SDValue X86TargetLowering::visitMaskedLoad(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
+ SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
+ // @llvm.masked.load.v1*(ptr, alignment, mask, passthru)
+ // ->
+ // _, flags = SUB 0, mask
+ // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags
+ // bit_cast_to_vector<res>
+ EVT VTy = PassThru.getValueType();
+ EVT Ty = VTy.getVectorElementType();
+ SDVTList Tys = DAG.getVTList(Ty, MVT::Other);
+ auto ScalarPassThru = PassThru.isUndef() ? DAG.getConstant(0, DL, Ty)
+ : DAG.getBitcast(Ty, PassThru);
+ auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
+ auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
+ SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags};
+ NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO);
+ return DAG.getBitcast(VTy, NewLoad);
+}
+
+SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Chain,
+ MachineMemOperand *MMO, SDValue Ptr,
+ SDValue Val, SDValue Mask) const {
+ // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask)
+ // ->
+ // _, flags = SUB 0, mask
+ // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags
+ EVT Ty = Val.getValueType().getVectorElementType();
+ SDVTList Tys = DAG.getVTList(MVT::Other);
+ auto ScalarVal = DAG.getBitcast(Ty, Val);
+ auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
+ auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
+ SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags};
+ return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO);
+}
+
/// Provide custom lowering hooks for some operations.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -33982,6 +34079,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(STRICT_FP80_ADD)
NODE_NAME_CASE(CCMP)
NODE_NAME_CASE(CTEST)
+ NODE_NAME_CASE(CLOAD)
+ NODE_NAME_CASE(CSTORE)
}
return nullptr;
#undef NODE_NAME_CASE
@@ -37040,6 +37139,52 @@ static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
Known = Known.zext(64);
}
+static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) {
+ unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
+
+ // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs.
+ APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
+ APInt DemandedLoElts =
+ DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
+ APInt DemandedHiElts =
+ DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
+ KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
+ KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
+ KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
+ KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
+ KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32));
+ KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32));
+ Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/false,
+ /*NUW=*/false, Lo, Hi);
+}
+
+static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) {
+ unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
+
+ // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi
+ // pairs.
+ APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
+ APInt DemandedLoElts =
+ DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
+ APInt DemandedHiElts =
+ DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
+ KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
+ KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
+ KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
+ KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
+ KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16));
+ KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16));
+ Known = KnownBits::sadd_sat(Lo, Hi);
+}
+
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
@@ -37215,6 +37360,26 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
}
break;
}
+ case X86ISD::VPMADDWD: {
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ assert(VT.getVectorElementType() == MVT::i32 &&
+ LHS.getValueType() == RHS.getValueType() &&
+ LHS.getValueType().getVectorElementType() == MVT::i16 &&
+ "Unexpected PMADDWD types");
+ computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
+ break;
+ }
+ case X86ISD::VPMADDUBSW: {
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ assert(VT.getVectorElementType() == MVT::i16 &&
+ LHS.getValueType() == RHS.getValueType() &&
+ LHS.getValueType().getVectorElementType() == MVT::i8 &&
+ "Unexpected PMADDUBSW types");
+ computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
+ break;
+ }
case X86ISD::PMULUDQ: {
KnownBits Known2;
Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
@@ -37351,6 +37516,30 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
}
case ISD::INTRINSIC_WO_CHAIN: {
switch (Op->getConstantOperandVal(0)) {
+ case Intrinsic::x86_sse2_pmadd_wd:
+ case Intrinsic::x86_avx2_pmadd_wd:
+ case Intrinsic::x86_avx512_pmaddw_d_512: {
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ assert(VT.getScalarType() == MVT::i32 &&
+ LHS.getValueType() == RHS.getValueType() &&
+ LHS.getValueType().getScalarType() == MVT::i16 &&
+ "Unexpected PMADDWD types");
+ computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
+ break;
+ }
+ case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
+ case Intrinsic::x86_avx2_pmadd_ub_sw:
+ case Intrinsic::x86_avx512_pmaddubs_w_512: {
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ assert(VT.getScalarType() == MVT::i16 &&
+ LHS.getValueType() == RHS.getValueType() &&
+ LHS.getValueType().getScalarType() == MVT::i8 &&
+ "Unexpected PMADDUBSW types");
+ computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
+ break;
+ }
case Intrinsic::x86_sse2_psad_bw:
case Intrinsic::x86_avx2_psad_bw:
case Intrinsic::x86_avx512_psad_bw_512: {
@@ -41635,6 +41824,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
KnownZero = LHSZero | RHSZero;
break;
}
+ case X86ISD::VPMADDUBSW:
case X86ISD::VPMADDWD: {
APInt LHSUndef, LHSZero;
APInt RHSUndef, RHSZero;
@@ -41917,7 +42107,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
case X86ISD::CVTPH2PS:
case X86ISD::CVTPS2PH: {
SDValue Src = Op.getOperand(0);
- MVT SrcVT = Src.getSimpleValueType();
+ EVT SrcVT = Src.getValueType();
APInt SrcUndef, SrcZero;
APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
@@ -42793,6 +42983,19 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
AssumeSingleUse);
}
+ case X86ISD::CMOV: {
+ KnownBits Known2;
+ if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
+ OriginalDemandedElts, Known2, TLO, Depth + 1))
+ return true;
+ if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits,
+ OriginalDemandedElts, Known, TLO, Depth + 1))
+ return true;
+
+ // Only known if known in both the LHS and RHS.
+ Known = Known.intersectWith(Known2);
+ break;
+ }
case X86ISD::BEXTR:
case X86ISD::BEXTRI: {
SDValue Op0 = Op.getOperand(0);
@@ -43710,7 +43913,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
SDValue Op = N0.getOperand(i);
LowUndef &= Op.isUndef() || (i >= e/2);
- AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
+ AllUndefOrZero &= isNullConstantOrUndef(Op);
}
if (AllUndefOrZero) {
SDValue N00 = N0.getOperand(0);
@@ -50184,12 +50387,12 @@ static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
/// If this is an add or subtract where one operand is produced by a cmp+setcc,
/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
/// with CMP+{ADC, SBB}.
-static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL,
+ SelectionDAG &DAG) {
bool IsSub = N->getOpcode() == ISD::SUB;
SDValue X = N->getOperand(0);
SDValue Y = N->getOperand(1);
EVT VT = N->getValueType(0);
- SDLoc DL(N);
if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
return ADCOrSBB;
@@ -50708,157 +50911,6 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
return SDValue();
}
-/// This function detects the AVG pattern between vectors of unsigned i8/i16,
-/// which is c = (a + b + 1) / 2, and replace this operation with the efficient
-/// ISD::AVGCEILU (AVG) instruction.
-static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- const SDLoc &DL) {
- if (!VT.isVector())
- return SDValue();
- EVT InVT = In.getValueType();
- unsigned NumElems = VT.getVectorNumElements();
-
- EVT ScalarVT = VT.getVectorElementType();
- if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
- return SDValue();
-
- // InScalarVT is the intermediate type in AVG pattern and it should be greater
- // than the original input type (i8/i16).
- EVT InScalarVT = InVT.getVectorElementType();
- if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
- return SDValue();
-
- if (!Subtarget.hasSSE2())
- return SDValue();
-
- // Detect the following pattern:
- //
- // %1 = zext <N x i8> %a to <N x i32>
- // %2 = zext <N x i8> %b to <N x i32>
- // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
- // %4 = add nuw nsw <N x i32> %3, %2
- // %5 = lshr <N x i32> %N, <i32 1 x N>
- // %6 = trunc <N x i32> %5 to <N x i8>
- //
- // In AVX512, the last instruction can also be a trunc store.
- if (In.getOpcode() != ISD::SRL)
- return SDValue();
-
- // A lambda checking the given SDValue is a constant vector and each element
- // is in the range [Min, Max].
- auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
- return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
- return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
- });
- };
-
- auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
- unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
- return MaxActiveBits <= ScalarVT.getSizeInBits();
- };
-
- // Check if each element of the vector is right-shifted by one.
- SDValue LHS = In.getOperand(0);
- SDValue RHS = In.getOperand(1);
- if (!IsConstVectorInRange(RHS, 1, 1))
- return SDValue();
- if (LHS.getOpcode() != ISD::ADD)
- return SDValue();
-
- // Detect a pattern of a + b + 1 where the order doesn't matter.
- SDValue Operands[3];
- Operands[0] = LHS.getOperand(0);
- Operands[1] = LHS.getOperand(1);
-
- auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
- ArrayRef<SDValue> Ops) {
- return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
- };
-
- auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
- for (SDValue &Op : Ops)
- if (Op.getValueType() != VT)
- Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
- // Pad to a power-of-2 vector, split+apply and extract the original vector.
- unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
- EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
- if (NumElemsPow2 != NumElems) {
- for (SDValue &Op : Ops) {
- SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
- for (unsigned i = 0; i != NumElems; ++i) {
- SDValue Idx = DAG.getIntPtrConstant(i, DL);
- EltsOfOp[i] =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
- }
- Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
- }
- }
- SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
- if (NumElemsPow2 == NumElems)
- return Res;
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
- DAG.getIntPtrConstant(0, DL));
- };
-
- // Take care of the case when one of the operands is a constant vector whose
- // element is in the range [1, 256].
- if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
- IsZExtLike(Operands[0])) {
- // The pattern is detected. Subtract one from the constant vector, then
- // demote it and emit X86ISD::AVG instruction.
- SDValue VecOnes = DAG.getConstant(1, DL, InVT);
- Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
- return AVGSplitter({Operands[0], Operands[1]});
- }
-
- // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
- // Match the or case only if its 'add-like' - can be replaced by an add.
- auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
- if (ISD::ADD == V.getOpcode()) {
- Op0 = V.getOperand(0);
- Op1 = V.getOperand(1);
- return true;
- }
- if (ISD::ZERO_EXTEND != V.getOpcode())
- return false;
- V = V.getOperand(0);
- if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
- !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
- return false;
- Op0 = V.getOperand(0);
- Op1 = V.getOperand(1);
- return true;
- };
-
- SDValue Op0, Op1;
- if (FindAddLike(Operands[0], Op0, Op1))
- std::swap(Operands[0], Operands[1]);
- else if (!FindAddLike(Operands[1], Op0, Op1))
- return SDValue();
- Operands[2] = Op0;
- Operands[1] = Op1;
-
- // Now we have three operands of two additions. Check that one of them is a
- // constant vector with ones, and the other two can be promoted from i8/i16.
- for (SDValue &Op : Operands) {
- if (!IsConstVectorInRange(Op, 1, 1))
- continue;
- std::swap(Op, Operands[2]);
-
- // Check if Operands[0] and Operands[1] are results of type promotion.
- for (int j = 0; j < 2; ++j)
- if (Operands[j].getValueType() != VT)
- if (!IsZExtLike(Operands[j]))
- return SDValue();
-
- // The pattern is detected, emit X86ISD::AVG instruction(s).
- return AVGSplitter({Operands[0], Operands[1]});
- }
-
- return SDValue();
-}
-
static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -50875,6 +50927,10 @@ static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
if (!(RegVT.is128BitVector() || RegVT.is256BitVector()))
return SDValue();
+ const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
+ if (!LdC)
+ return SDValue();
+
auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
@@ -50899,12 +50955,11 @@ static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
RegVT.getFixedSizeInBits()) {
EVT UserVT = User->getValueType(0);
SDValue UserPtr = UserLd->getBasePtr();
- const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
// See if we are loading a constant that matches in the lower
// bits of a longer constant (but from a different constant pool ptr).
- if (LdC && UserC && UserPtr != Ptr) {
+ if (UserC && UserPtr != Ptr) {
unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
@@ -51498,16 +51553,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
// First, pack all of the elements in one place. Next, store to memory
// in fewer chunks.
if (St->isTruncatingStore() && VT.isVector()) {
- // Check if we can detect an AVG pattern from the truncation. If yes,
- // replace the trunc store by a normal store with the result of X86ISD::AVG
- // instruction.
- if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
- if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
- Subtarget, dl))
- return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
- St->getPointerInfo(), St->getOriginalAlign(),
- St->getMemOperand()->getFlags());
-
if (TLI.isTruncStoreLegal(VT, StVT)) {
if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
@@ -52363,10 +52408,6 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
return V;
- // Try to detect AVG pattern first.
- if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
- return Avg;
-
// Try to detect PMADD
if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
return PMAdd;
@@ -52718,7 +52759,7 @@ static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
}
-static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG,
+static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
"Invalid opcode for combing with CTLZ");
@@ -52758,7 +52799,6 @@ static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG,
if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
return SDValue();
- SDLoc DL(N);
EVT OpVT = VT;
SDValue Op = OpCTLZ.getOperand(0);
if (VT == MVT::i8) {
@@ -52781,11 +52821,12 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
+ SDLoc DL(N);
// If this is SSE1 only convert to FXOR to avoid scalarization.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
return DAG.getBitcast(MVT::v4i32,
- DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
+ DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32,
DAG.getBitcast(MVT::v4f32, N0),
DAG.getBitcast(MVT::v4f32, N1)));
}
@@ -52805,7 +52846,7 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
return FPLogic;
- if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget))
+ if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget))
return R;
if (DCI.isBeforeLegalizeOps())
@@ -52826,8 +52867,8 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
N0.getOperand(0).getValueType().isVector() &&
N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
- return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
- N0.getOperand(0).getValueType()));
+ return DAG.getBitcast(
+ VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType()));
}
// Handle AVX512 mask widening.
@@ -52837,8 +52878,8 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
return DAG.getNode(
- ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
- DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
+ ISD::INSERT_SUBVECTOR, DL, VT, N0.getOperand(0),
+ DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()),
N0.getOperand(2));
}
@@ -52851,7 +52892,6 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
auto *N1C = dyn_cast<ConstantSDNode>(N1);
auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
- SDLoc DL(N);
SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
return DAG.getNode(ISD::XOR, DL, VT, LHS,
@@ -52892,6 +52932,31 @@ static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Various combines to try to convert to avgceilu.
+static SDValue combineAVG(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ unsigned Opcode = N->getOpcode();
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ EVT SVT = VT.getScalarType();
+ SDLoc DL(N);
+
+ // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
+ // Only useful on vXi8 which doesn't have good SRA handling.
+ if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) {
+ APInt SignBit = APInt::getSignMask(VT.getScalarSizeInBits());
+ SDValue SignMask = DAG.getConstant(SignBit, DL, VT);
+ N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask);
+ N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask);
+ return DAG.getNode(ISD::XOR, DL, VT,
+ DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask);
+ }
+
+ return SDValue();
+}
+
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -55419,7 +55484,8 @@ static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
/// Try to fold those constants into an 'add' instruction to reduce instruction
/// count. We do this with CMOV rather the generic 'select' because there are
/// earlier folds that may be used to turn select-of-constants into logic hacks.
-static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
+static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL,
+ SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// If an operand is zero, add-of-0 gets simplified away, so that's clearly
// better because we eliminate 1-2 instructions. This transform is still
@@ -55451,7 +55517,6 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
return SDValue();
EVT VT = N->getValueType(0);
- SDLoc DL(N);
SDValue FalseOp = Cmov.getOperand(0);
SDValue TrueOp = Cmov.getOperand(1);
@@ -55492,7 +55557,7 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
SDValue Op1 = N->getOperand(1);
SDLoc DL(N);
- if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
+ if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
return Select;
if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
@@ -55550,7 +55615,7 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
Op0.getOperand(0), Op0.getOperand(2));
}
- return combineAddOrSubToADCOrSBB(N, DAG);
+ return combineAddOrSubToADCOrSBB(N, DL, DAG);
}
// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
@@ -55621,11 +55686,38 @@ static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) {
+ // res, flags2 = sub 0, (setcc cc, flag)
+ // cload/cstore ..., cond_ne, flag2
+ // ->
+ // cload/cstore cc, flag
+ if (N->getConstantOperandVal(3) != X86::COND_NE)
+ return SDValue();
+
+ SDValue Sub = N->getOperand(4);
+ if (Sub.getOpcode() != X86ISD::SUB)
+ return SDValue();
+
+ SDValue SetCC = Sub.getOperand(1);
+
+ if (!X86::isZeroNode(Sub.getOperand(0)) || SetCC.getOpcode() != X86ISD::SETCC)
+ return SDValue();
+
+ SmallVector<SDValue, 5> Ops(N->op_values());
+ Ops[3] = SetCC.getOperand(0);
+ Ops[4] = SetCC.getOperand(1);
+
+ return DAG.getMemIntrinsicNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops,
+ cast<MemSDNode>(N)->getMemoryVT(),
+ cast<MemSDNode>(N)->getMemOperand());
+}
+
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
+ SDLoc DL(N);
// TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
auto IsNonOpaqueConstant = [&](SDValue Op) {
@@ -55645,7 +55737,6 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
!isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
Op1->hasOneUse()) {
- SDLoc DL(N);
EVT VT = Op0.getValueType();
SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
@@ -55676,14 +55767,14 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
Op1.getOperand(1), Op1.getOperand(2));
- return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
+ return DAG.getNode(ISD::SUB, DL, Op0.getValueType(), ADC.getValue(0),
Op1.getOperand(0));
}
- if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget))
+ if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget))
return V;
- if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG))
+ if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG))
return V;
return combineSubSetcc(N, DAG);
@@ -55917,6 +56008,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
break;
case X86ISD::PSHUFB:
case X86ISD::PSADBW:
+ case X86ISD::VPMADDUBSW:
+ case X86ISD::VPMADDWD:
if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
(VT.is512BitVector() && Subtarget.useBWIRegs()))) {
MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
@@ -57328,6 +57421,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
case X86ISD::ADD:
case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
+ case X86ISD::CLOAD:
+ case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
case X86ISD::SBB: return combineSBB(N, DAG);
case X86ISD::ADC: return combineADC(N, DAG, DCI);
case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
@@ -57338,6 +57433,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
+ case ISD::AVGCEILS:
+ case ISD::AVGCEILU:
+ case ISD::AVGFLOORS:
+ case ISD::AVGFLOORU: return combineAVG(N, DAG, DCI, Subtarget);
case X86ISD::BEXTR:
case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
@@ -57529,16 +57628,20 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
+ case ISD::MUL:
+ return false;
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
case ISD::SUB:
case ISD::ADD:
- case ISD::MUL:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
- return false;
+ // NDD instruction never has "partial register write" issue b/c it has
+ // destination register's upper bits [63:OSIZE]) zeroed even when
+ // OSIZE=8/16.
+ return Subtarget.hasNDD();
}
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 3c5c903bc0d9..362daa98e1f8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -903,6 +903,10 @@ namespace llvm {
// is needed so that this can be expanded with control flow.
VASTART_SAVE_XMM_REGS,
+ // Conditional load/store instructions
+ CLOAD,
+ CSTORE,
+
// WARNING: Do not add anything in the end unless you want the node to
// have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
// opcodes will be thought as target memory ops!
@@ -1556,6 +1560,14 @@ namespace llvm {
bool isInlineAsmTargetBranch(const SmallVectorImpl<StringRef> &AsmStrs,
unsigned OpNo) const override;
+ SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+ MachineMemOperand *MMO, SDValue &NewLoad,
+ SDValue Ptr, SDValue PassThru,
+ SDValue Mask) const override;
+ SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+ MachineMemOperand *MMO, SDValue Ptr, SDValue Val,
+ SDValue Mask) const override;
+
/// Lower interleaved load(s) into target specific
/// instructions/intrinsics.
bool lowerInterleavedLoad(LoadInst *LI,
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index b107d56f8cf9..8be64b6fff3f 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -417,7 +417,8 @@ unsigned X86TargetLowering::getJumpTableEncoding() const {
if (isPositionIndependent() && Subtarget.isPICStyleGOT())
return MachineJumpTableInfo::EK_Custom32;
if (isPositionIndependent() &&
- getTargetMachine().getCodeModel() == CodeModel::Large)
+ getTargetMachine().getCodeModel() == CodeModel::Large &&
+ !Subtarget.isTargetCOFF())
return MachineJumpTableInfo::EK_LabelDifference64;
// Otherwise, use the normal jump table encoding heuristics.
diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index 8e75e185f0f6..8cf502d820e9 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -2701,14 +2701,14 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
BoolVec->getType()->isVectorTy() &&
BoolVec->getType()->getScalarSizeInBits() == 1) {
- assert(Mask->getType()->getPrimitiveSizeInBits() ==
- II.getType()->getPrimitiveSizeInBits() &&
+ auto *MaskTy = cast<FixedVectorType>(Mask->getType());
+ auto *OpTy = cast<FixedVectorType>(II.getType());
+ assert(MaskTy->getPrimitiveSizeInBits() ==
+ OpTy->getPrimitiveSizeInBits() &&
"Not expecting mask and operands with different sizes");
+ unsigned NumMaskElts = MaskTy->getNumElements();
+ unsigned NumOperandElts = OpTy->getNumElements();
- unsigned NumMaskElts =
- cast<FixedVectorType>(Mask->getType())->getNumElements();
- unsigned NumOperandElts =
- cast<FixedVectorType>(II.getType())->getNumElements();
if (NumMaskElts == NumOperandElts) {
return SelectInst::Create(BoolVec, Op1, Op0);
}
@@ -2716,8 +2716,8 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
// If the mask has less elements than the operands, each mask bit maps to
// multiple elements of the operands. Bitcast back and forth.
if (NumMaskElts < NumOperandElts) {
- Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
- Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
+ Value *CastOp0 = IC.Builder.CreateBitCast(Op0, MaskTy);
+ Value *CastOp1 = IC.Builder.CreateBitCast(Op1, MaskTy);
Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
return new BitCastInst(Sel, II.getType());
}
diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td
index c45ec8981ab1..ffa8a105e2d1 100644
--- a/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -14,34 +14,34 @@
//===----------------------------------------------------------------------===//
// LEA - Load Effective Address
let SchedRW = [WriteLEA] in {
-let hasSideEffects = 0 in
-def LEA16r : I<0x8D, MRMSrcMem,
- (outs GR16:$dst), (ins anymem:$src),
- "lea{w}\t{$src|$dst}, {$dst|$src}", []>, OpSize16;
-let isReMaterializable = 1 in
-def LEA32r : I<0x8D, MRMSrcMem,
- (outs GR32:$dst), (ins anymem:$src),
- "lea{l}\t{$src|$dst}, {$dst|$src}",
- [(set GR32:$dst, lea32addr:$src)]>,
- OpSize32, Requires<[Not64BitMode]>;
-
-def LEA64_32r : I<0x8D, MRMSrcMem,
- (outs GR32:$dst), (ins lea64_32mem:$src),
- "lea{l}\t{$src|$dst}, {$dst|$src}",
- [(set GR32:$dst, lea64_32addr:$src)]>,
- OpSize32, Requires<[In64BitMode]>;
-
-let isReMaterializable = 1 in
-def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src),
- "lea{q}\t{$src|$dst}, {$dst|$src}",
- [(set GR64:$dst, lea64addr:$src)]>;
+ let hasSideEffects = 0 in
+ def LEA16r : I<0x8D, MRMSrcMem,
+ (outs GR16:$dst), (ins anymem:$src),
+ "lea{w}\t{$src|$dst}, {$dst|$src}", []>, OpSize16;
+ let isReMaterializable = 1 in
+ def LEA32r : I<0x8D, MRMSrcMem,
+ (outs GR32:$dst), (ins anymem:$src),
+ "lea{l}\t{$src|$dst}, {$dst|$src}",
+ [(set GR32:$dst, lea32addr:$src)]>,
+ OpSize32, Requires<[Not64BitMode]>;
+
+ def LEA64_32r : I<0x8D, MRMSrcMem,
+ (outs GR32:$dst), (ins lea64_32mem:$src),
+ "lea{l}\t{$src|$dst}, {$dst|$src}",
+ [(set GR32:$dst, lea64_32addr:$src)]>,
+ OpSize32, Requires<[In64BitMode]>;
+
+ let isReMaterializable = 1 in
+ def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src),
+ "lea{q}\t{$src|$dst}, {$dst|$src}",
+ [(set GR64:$dst, lea64addr:$src)]>;
} // SchedRW
// Pseudo instruction for lea that prevent optimizer from eliminating
// the instruction.
let SchedRW = [WriteLEA], isPseudo = true, hasSideEffects = 1 in {
-def PLEA32r : PseudoI<(outs GR32:$dst), (ins anymem:$src), []>;
-def PLEA64r : PseudoI<(outs GR64:$dst), (ins anymem:$src), []>;
+ def PLEA32r : PseudoI<(outs GR32:$dst), (ins anymem:$src), []>;
+ def PLEA64r : PseudoI<(outs GR64:$dst), (ins anymem:$src), []>;
}
//===----------------------------------------------------------------------===//
@@ -655,111 +655,112 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
}
}
- def 8rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>;
- def 16rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16;
- def 32rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32;
- def 64rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>;
- let Predicates = [In64BitMode] in {
- def 8rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>, PL;
- def 16rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, PL, PD;
- def 32rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, PL;
- def 64rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>, PL;
- def 8rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8, 1>;
- def 16rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16, 1>, PD;
- def 32rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32, 1>;
- def 64rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64, 1>;
- def 8rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8>, NF;
- def 16rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16>, NF, PD;
- def 32rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32>, NF;
- def 64rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi64>, NF;
- def 8rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF;
- def 16rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD;
- def 32rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF;
- def 64rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF;
- }
+ def 8rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>;
+ def 16rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16;
+ def 32rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32;
+ def 64rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>;
+ let Predicates = [In64BitMode] in {
+ def 8rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>, PL;
+ def 16rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, PL, PD;
+ def 32rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, PL;
+ def 64rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>, PL;
+ def 8rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8, 1>;
+ def 16rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16, 1>, PD;
+ def 32rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32, 1>;
+ def 64rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64, 1>;
+ def 8rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8>, NF;
+ def 16rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16>, NF, PD;
+ def 32rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32>, NF;
+ def 64rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi64>, NF;
+ def 8rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF;
+ def 16rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD;
+ def 32rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF;
+ def 64rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF;
+ }
+
+ let Predicates = [NoNDD] in {
+ def 8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>;
+ def 16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>, OpSize16;
+ def 32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>, OpSize32;
+ def 64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>;
+ }
+ let Predicates = [HasNDD, In64BitMode] in {
+ def 8rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag, 1>;
+ def 16rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag, 1>, PD;
+ def 32rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag, 1>;
+ def 64rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag, 1>;
+ def 8rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF;
+ def 16rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD;
+ def 32rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF;
+ def 64rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF;
+ }
+ let Predicates = [In64BitMode] in {
+ def 8rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi8>, NF;
+ def 16rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi16>, NF, PD;
+ def 32rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi32>, NF;
+ def 64rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi64>, NF;
+ def 8rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , null_frag>, PL;
+ def 16rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, null_frag>, PL, PD;
+ def 32rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, null_frag>, PL;
+ def 64rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, null_frag>, PL;
+ }
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
let Predicates = [NoNDD] in {
- def 8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>;
- def 16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>, OpSize16;
- def 32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>, OpSize32;
- def 64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>;
+ // NOTE: These are order specific, we want the ri8 forms to be listed
+ // first so that they are slightly preferred to the ri forms.
+ def 16ri8 : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16;
+ def 32ri8 : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32;
+ def 64ri8 : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>;
+ def 8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
+ def 16ri : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM>, OpSize16;
+ def 32ri : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM>, OpSize32;
+ def 64ri32: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM>;
}
let Predicates = [HasNDD, In64BitMode] in {
- def 8rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag, 1>;
- def 16rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag, 1>, PD;
- def 32rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag, 1>;
- def 64rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag, 1>;
- def 8rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF;
- def 16rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD;
- def 32rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF;
- def 64rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF;
+ def 16ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD;
+ def 32ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM, 1>;
+ def 64ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM, 1>;
+ def 8ri_ND : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM, 1>;
+ def 16ri_ND : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM, 1>, PD;
+ def 32ri_ND : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM, 1>;
+ def 64ri32_ND: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM, 1>;
+ def 16ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD;
+ def 32ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM, 1>, EVEX_NF;
+ def 64ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM, 1>, EVEX_NF;
+ def 8ri_NF_ND : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM, 1>, EVEX_NF;
+ def 16ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD;
+ def 32ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM, 1>, EVEX_NF;
+ def 64ri32_NF_ND : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM, 1>, EVEX_NF;
}
let Predicates = [In64BitMode] in {
- def 8rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi8>, NF;
- def 16rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi16>, NF, PD;
- def 32rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi32>, NF;
- def 64rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi64>, NF;
- def 8rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , null_frag>, PL;
- def 16rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, null_frag>, PL, PD;
- def 32rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, null_frag>, PL;
- def 64rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, null_frag>, PL;
- }
-
- let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
- let Predicates = [NoNDD] in {
- // NOTE: These are order specific, we want the ri8 forms to be listed
- // first so that they are slightly preferred to the ri forms.
- def 16ri8 : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16;
- def 32ri8 : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32;
- def 64ri8 : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>;
- def 8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
- def 16ri : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM>, OpSize16;
- def 32ri : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM>, OpSize32;
- def 64ri32: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM>;
- }
- let Predicates = [HasNDD, In64BitMode] in {
- def 16ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD;
- def 32ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM, 1>;
- def 64ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM, 1>;
- def 8ri_ND : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM, 1>;
- def 16ri_ND : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM, 1>, PD;
- def 32ri_ND : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM, 1>;
- def 64ri32_ND: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM, 1>;
- def 16ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD;
- def 32ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM, 1>, EVEX_NF;
- def 64ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM, 1>, EVEX_NF;
- def 8ri_NF_ND : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM, 1>, EVEX_NF;
- def 16ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD;
- def 32ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM, 1>, EVEX_NF;
- def 64ri32_NF_ND : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM, 1>, EVEX_NF;
- }
- let Predicates = [In64BitMode] in {
- def 16ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM>, NF, PD;
- def 32ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM>, NF;
- def 64ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM>, NF;
- def 8ri_NF : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM>, NF;
- def 16ri_NF : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM>, NF, PD;
- def 32ri_NF : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM>, NF;
- def 64ri32_NF : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM>, NF;
- def 16ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD;
- def 32ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, PL;
- def 64ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>, PL;
- def 8ri_EVEX : BinOpRI_RF<0x80, mnemonic, Xi8 , null_frag, RegMRM>, PL;
- def 16ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi16, null_frag, RegMRM>, PL, PD;
- def 32ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi32, null_frag, RegMRM>, PL;
- def 64ri32_EVEX: BinOpRI_RF<0x81, mnemonic, Xi64, null_frag, RegMRM>, PL;
- }
+ def 16ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM>, NF, PD;
+ def 32ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM>, NF;
+ def 64ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM>, NF;
+ def 8ri_NF : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM>, NF;
+ def 16ri_NF : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM>, NF, PD;
+ def 32ri_NF : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM>, NF;
+ def 64ri32_NF : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM>, NF;
+ def 16ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD;
+ def 32ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, PL;
+ def 64ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>, PL;
+ def 8ri_EVEX : BinOpRI_RF<0x80, mnemonic, Xi8 , null_frag, RegMRM>, PL;
+ def 16ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi16, null_frag, RegMRM>, PL, PD;
+ def 32ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi32, null_frag, RegMRM>, PL;
+ def 64ri32_EVEX: BinOpRI_RF<0x81, mnemonic, Xi64, null_frag, RegMRM>, PL;
}
+ }
- def 8mr : BinOpMR_MF<BaseOpc, mnemonic, Xi8 , opnode>;
- def 16mr : BinOpMR_MF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16;
- def 32mr : BinOpMR_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32;
- def 64mr : BinOpMR_MF<BaseOpc, mnemonic, Xi64, opnode>;
- let Predicates = [HasNDD, In64BitMode] in {
- def 8mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi8 , opnode>;
- def 16mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi16, opnode>, PD;
- def 32mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi32, opnode>;
- def 64mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi64, opnode>;
+ def 8mr : BinOpMR_MF<BaseOpc, mnemonic, Xi8 , opnode>;
+ def 16mr : BinOpMR_MF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16;
+ def 32mr : BinOpMR_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32;
+ def 64mr : BinOpMR_MF<BaseOpc, mnemonic, Xi64, opnode>;
+ let Predicates = [HasNDD, In64BitMode] in {
+ defvar node = !if(!eq(CommutableRR, 0), opnode, null_frag);
+ def 8mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi8 , node>;
+ def 16mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi16, node>, PD;
+ def 32mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi32, node>;
+ def 64mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi64, node>;
def 8mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi8>, EVEX_NF;
def 16mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi16>, EVEX_NF, PD;
def 32mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi32>, EVEX_NF;
@@ -823,18 +824,14 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
// These are for the disassembler since 0x82 opcode behaves like 0x80, but
// not in 64-bit mode.
let Predicates = [Not64BitMode] in {
- def 8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly;
- def 8mi8 : BinOpMI8_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly;
+ def 8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly;
+ def 8mi8 : BinOpMI8_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly;
}
- def 8i8 : BinOpAI_AF<BaseOpc4, mnemonic, Xi8 , AL,
- "{$src, %al|al, $src}">;
- def 16i16 : BinOpAI_AF<BaseOpc4, mnemonic, Xi16, AX,
- "{$src, %ax|ax, $src}">, OpSize16;
- def 32i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi32, EAX,
- "{$src, %eax|eax, $src}">, OpSize32;
- def 64i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi64, RAX,
- "{$src, %rax|rax, $src}">;
+ def 8i8 : BinOpAI_AF<BaseOpc4, mnemonic, Xi8 , AL, "{$src, %al|al, $src}">;
+ def 16i16 : BinOpAI_AF<BaseOpc4, mnemonic, Xi16, AX, "{$src, %ax|ax, $src}">, OpSize16;
+ def 32i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi32, EAX, "{$src, %eax|eax, $src}">, OpSize32;
+ def 64i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi64, RAX, "{$src, %rax|rax, $src}">;
}
/// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is
@@ -948,10 +945,11 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
def 32mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32;
def 64mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi64, opnode>;
let Predicates = [HasNDD, In64BitMode] in {
- def 8mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi8 , opnode>;
- def 16mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi16, opnode>, PD;
- def 32mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi32, opnode>;
- def 64mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi64, opnode>;
+ defvar node = !if(!eq(CommutableRR, 0), opnode, null_frag);
+ def 8mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi8 , node>;
+ def 16mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi16, node>, PD;
+ def 32mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi32, node>;
+ def 64mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi64, node>;
}
let Predicates = [In64BitMode] in {
def 8mr_EVEX : BinOpMRF_MF<BaseOpc, mnemonic, Xi8 , null_frag>, PL;
@@ -995,17 +993,13 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
// not in 64-bit mode.
let Predicates = [Not64BitMode] in {
def 8ri8 : BinOpRI8F_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly;
- def 8mi8 : BinOpMI8F_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly;
+ def 8mi8 : BinOpMI8F_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly;
}
- def 8i8 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi8 , AL,
- "{$src, %al|al, $src}">;
- def 16i16 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi16, AX,
- "{$src, %ax|ax, $src}">, OpSize16;
- def 32i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi32, EAX,
- "{$src, %eax|eax, $src}">, OpSize32;
- def 64i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi64, RAX,
- "{$src, %rax|rax, $src}">;
+ def 8i8 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi8 , AL, "{$src, %al|al, $src}">;
+ def 16i16 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi16, AX, "{$src, %ax|ax, $src}">, OpSize16;
+ def 32i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi32, EAX, "{$src, %eax|eax, $src}">, OpSize32;
+ def 64i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi64, RAX, "{$src, %rax|rax, $src}">;
}
/// ArithBinOp_F - This is an arithmetic binary operator where the pattern is
@@ -1017,11 +1011,11 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
SDNode opnode, bit CommutableRR,
bit ConvertibleToThreeAddress> {
let isCommutable = CommutableRR in {
- def 8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+ def 8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>;
let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
- def 16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>, OpSize16;
- def 32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>, OpSize32;
- def 64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>;
+ def 16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>, OpSize16;
+ def 32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>, OpSize32;
+ def 64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>;
} // isConvertibleToThreeAddress
} // isCommutable
@@ -1038,15 +1032,15 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
def 8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>;
let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
- // NOTE: These are order specific, we want the ri8 forms to be listed
- // first so that they are slightly preferred to the ri forms.
- def 16ri8 : BinOpRI8_F<0x83, mnemonic, Xi16, RegMRM>, OpSize16;
- def 32ri8 : BinOpRI8_F<0x83, mnemonic, Xi32, RegMRM>, OpSize32;
- def 64ri8 : BinOpRI8_F<0x83, mnemonic, Xi64, RegMRM>;
-
- def 16ri : BinOpRI_F<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16;
- def 32ri : BinOpRI_F<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32;
- def 64ri32: BinOpRI_F<0x81, mnemonic, Xi64, opnode, RegMRM>;
+ // NOTE: These are order specific, we want the ri8 forms to be listed
+ // first so that they are slightly preferred to the ri forms.
+ def 16ri8 : BinOpRI8_F<0x83, mnemonic, Xi16, RegMRM>, OpSize16;
+ def 32ri8 : BinOpRI8_F<0x83, mnemonic, Xi32, RegMRM>, OpSize32;
+ def 64ri8 : BinOpRI8_F<0x83, mnemonic, Xi64, RegMRM>;
+
+ def 16ri : BinOpRI_F<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16;
+ def 32ri : BinOpRI_F<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32;
+ def 64ri32: BinOpRI_F<0x81, mnemonic, Xi64, opnode, RegMRM>;
}
def 8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>;
@@ -1065,24 +1059,20 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
def 16mi : BinOpMI_F<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16;
def 32mi : BinOpMI_F<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32;
let Predicates = [In64BitMode] in
- def 64mi32 : BinOpMI_F<0x81, mnemonic, Xi64, opnode, MemMRM>;
+ def 64mi32 : BinOpMI_F<0x81, mnemonic, Xi64, opnode, MemMRM>;
// These are for the disassembler since 0x82 opcode behaves like 0x80, but
// not in 64-bit mode.
let Predicates = [Not64BitMode] in {
- def 8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly;
+ def 8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly;
let mayLoad = 1 in
- def 8mi8 : BinOpMI8_F<mnemonic, Xi8, MemMRM>;
+ def 8mi8 : BinOpMI8_F<mnemonic, Xi8, MemMRM>;
}
- def 8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL,
- "{$src, %al|al, $src}">;
- def 16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX,
- "{$src, %ax|ax, $src}">, OpSize16;
- def 32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX,
- "{$src, %eax|eax, $src}">, OpSize32;
- def 64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX,
- "{$src, %rax|rax, $src}">;
+ def 8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL, "{$src, %al|al, $src}">;
+ def 16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX, "{$src, %ax|ax, $src}">, OpSize16;
+ def 32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX, "{$src, %eax|eax, $src}">, OpSize32;
+ def 64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX, "{$src, %rax|rax, $src}">;
}
@@ -1095,18 +1085,18 @@ defm XOR : ArithBinOp_RF<0x31, 0x33, 0x35, "xor", MRM6r, MRM6m,
defm ADD : ArithBinOp_RF<0x01, 0x03, 0x05, "add", MRM0r, MRM0m,
X86add_flag, add, 1, 1, 1>;
let isCompare = 1 in {
-defm SUB : ArithBinOp_RF<0x29, 0x2B, 0x2D, "sub", MRM5r, MRM5m,
- X86sub_flag, sub, 0, 1, 0>;
+ defm SUB : ArithBinOp_RF<0x29, 0x2B, 0x2D, "sub", MRM5r, MRM5m,
+ X86sub_flag, sub, 0, 1, 0>;
}
// Version of XOR8rr_NOREX that use GR8_NOREX. This is used by the handling of
// __builtin_parity where the last step xors an h-register with an l-register.
let isCodeGenOnly = 1, hasSideEffects = 0, Constraints = "$src1 = $dst",
Defs = [EFLAGS], isCommutable = 1 in
-def XOR8rr_NOREX : I<0x30, MRMDestReg, (outs GR8_NOREX:$dst),
- (ins GR8_NOREX:$src1, GR8_NOREX:$src2),
- "xor{b}\t{$src2, $dst|$dst, $src2}", []>,
- Sched<[WriteALU]>;
+ def XOR8rr_NOREX : I<0x30, MRMDestReg, (outs GR8_NOREX:$dst),
+ (ins GR8_NOREX:$src1, GR8_NOREX:$src2),
+ "xor{b}\t{$src2, $dst|$dst, $src2}", []>,
+ Sched<[WriteALU]>;
// Arithmetic.
defm ADC : ArithBinOp_RFF<0x11, 0x13, 0x15, "adc", MRM2r, MRM2m, X86adc_flag,
@@ -1115,19 +1105,31 @@ defm SBB : ArithBinOp_RFF<0x19, 0x1B, 0x1D, "sbb", MRM3r, MRM3m, X86sbb_flag,
0, 0>;
let isCompare = 1 in {
-defm CMP : ArithBinOp_F<0x39, 0x3B, 0x3D, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
+ defm CMP : ArithBinOp_F<0x39, 0x3B, 0x3D, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
}
// Patterns to recognize loads on the LHS of an ADC. We can't make X86adc_flag
// commutable since it has EFLAGs as an input.
-def : Pat<(X86adc_flag (loadi8 addr:$src2), GR8:$src1, EFLAGS),
- (ADC8rm GR8:$src1, addr:$src2)>;
-def : Pat<(X86adc_flag (loadi16 addr:$src2), GR16:$src1, EFLAGS),
- (ADC16rm GR16:$src1, addr:$src2)>;
-def : Pat<(X86adc_flag (loadi32 addr:$src2), GR32:$src1, EFLAGS),
- (ADC32rm GR32:$src1, addr:$src2)>;
-def : Pat<(X86adc_flag (loadi64 addr:$src2), GR64:$src1, EFLAGS),
- (ADC64rm GR64:$src1, addr:$src2)>;
+let Predicates = [NoNDD] in {
+ def : Pat<(X86adc_flag (loadi8 addr:$src2), GR8:$src1, EFLAGS),
+ (ADC8rm GR8:$src1, addr:$src2)>;
+ def : Pat<(X86adc_flag (loadi16 addr:$src2), GR16:$src1, EFLAGS),
+ (ADC16rm GR16:$src1, addr:$src2)>;
+ def : Pat<(X86adc_flag (loadi32 addr:$src2), GR32:$src1, EFLAGS),
+ (ADC32rm GR32:$src1, addr:$src2)>;
+ def : Pat<(X86adc_flag (loadi64 addr:$src2), GR64:$src1, EFLAGS),
+ (ADC64rm GR64:$src1, addr:$src2)>;
+}
+let Predicates = [HasNDD] in {
+ def : Pat<(X86adc_flag (loadi8 addr:$src2), GR8:$src1, EFLAGS),
+ (ADC8rm_ND GR8:$src1, addr:$src2)>;
+ def : Pat<(X86adc_flag (loadi16 addr:$src2), GR16:$src1, EFLAGS),
+ (ADC16rm_ND GR16:$src1, addr:$src2)>;
+ def : Pat<(X86adc_flag (loadi32 addr:$src2), GR32:$src1, EFLAGS),
+ (ADC32rm_ND GR32:$src1, addr:$src2)>;
+ def : Pat<(X86adc_flag (loadi64 addr:$src2), GR64:$src1, EFLAGS),
+ (ADC64rm_ND GR64:$src1, addr:$src2)>;
+}
// Patterns to recognize RMW ADC with loads in operand 1.
def : Pat<(store (X86adc_flag GR8:$src, (loadi8 addr:$dst), EFLAGS),
@@ -1299,33 +1301,33 @@ let isCompare = 1 in {
// Avoid selecting these and instead use a test+and. Post processing will
// combine them. This gives bunch of other patterns that start with
// and a chance to match.
- def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , null_frag>;
- def TEST16rr : BinOpRR_F<0x85, "test", Xi16, null_frag>, OpSize16;
- def TEST32rr : BinOpRR_F<0x85, "test", Xi32, null_frag>, OpSize32;
- def TEST64rr : BinOpRR_F<0x85, "test", Xi64, null_frag>;
+ def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , null_frag>;
+ def TEST16rr : BinOpRR_F<0x85, "test", Xi16, null_frag>, OpSize16;
+ def TEST32rr : BinOpRR_F<0x85, "test", Xi32, null_frag>, OpSize32;
+ def TEST64rr : BinOpRR_F<0x85, "test", Xi64, null_frag>;
} // isCommutable
-def TEST8mr : BinOpMR_F<0x84, "test", Xi8 , null_frag>;
-def TEST16mr : BinOpMR_F<0x85, "test", Xi16, null_frag>, OpSize16;
-def TEST32mr : BinOpMR_F<0x85, "test", Xi32, null_frag>, OpSize32;
-def TEST64mr : BinOpMR_F<0x85, "test", Xi64, null_frag>;
+ def TEST8mr : BinOpMR_F<0x84, "test", Xi8 , null_frag>;
+ def TEST16mr : BinOpMR_F<0x85, "test", Xi16, null_frag>, OpSize16;
+ def TEST32mr : BinOpMR_F<0x85, "test", Xi32, null_frag>, OpSize32;
+ def TEST64mr : BinOpMR_F<0x85, "test", Xi64, null_frag>;
-def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>;
-def TEST16ri : BinOpRI_F<0xF7, "test", Xi16, X86testpat, MRM0r>, OpSize16;
-def TEST32ri : BinOpRI_F<0xF7, "test", Xi32, X86testpat, MRM0r>, OpSize32;
-def TEST64ri32 : BinOpRI_F<0xF7, "test", Xi64, X86testpat, MRM0r>;
+ def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>;
+ def TEST16ri : BinOpRI_F<0xF7, "test", Xi16, X86testpat, MRM0r>, OpSize16;
+ def TEST32ri : BinOpRI_F<0xF7, "test", Xi32, X86testpat, MRM0r>, OpSize32;
+ def TEST64ri32 : BinOpRI_F<0xF7, "test", Xi64, X86testpat, MRM0r>;
-def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>;
-def TEST16mi : BinOpMI_F<0xF7, "test", Xi16, X86testpat, MRM0m>, OpSize16;
-def TEST32mi : BinOpMI_F<0xF7, "test", Xi32, X86testpat, MRM0m>, OpSize32;
+ def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>;
+ def TEST16mi : BinOpMI_F<0xF7, "test", Xi16, X86testpat, MRM0m>, OpSize16;
+ def TEST32mi : BinOpMI_F<0xF7, "test", Xi32, X86testpat, MRM0m>, OpSize32;
let Predicates = [In64BitMode] in
- def TEST64mi32 : BinOpMI_F<0xF7, "test", Xi64, X86testpat, MRM0m>;
+ def TEST64mi32 : BinOpMI_F<0xF7, "test", Xi64, X86testpat, MRM0m>;
-def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL, "{$src, %al|al, $src}">;
-def TEST16i16 : BinOpAI_F<0xA9, "test", Xi16, AX, "{$src, %ax|ax, $src}">, OpSize16;
-def TEST32i32 : BinOpAI_F<0xA9, "test", Xi32, EAX, "{$src, %eax|eax, $src}">, OpSize32;
-def TEST64i32 : BinOpAI_F<0xA9, "test", Xi64, RAX, "{$src, %rax|rax, $src}">;
+ def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL, "{$src, %al|al, $src}">;
+ def TEST16i16 : BinOpAI_F<0xA9, "test", Xi16, AX, "{$src, %ax|ax, $src}">, OpSize16;
+ def TEST32i32 : BinOpAI_F<0xA9, "test", Xi32, EAX, "{$src, %eax|eax, $src}">, OpSize32;
+ def TEST64i32 : BinOpAI_F<0xA9, "test", Xi64, RAX, "{$src, %rax|rax, $src}">;
} // isCompare
// Patterns to match a relocImm into the immediate field.
@@ -1409,20 +1411,20 @@ multiclass MulX<X86TypeInfo t, X86FoldableSchedWrite sched> {
(ins t.RegClass:$src), "mulx", mulx_args, []>, T8, XD, VEX,
VVVV, Sched<[WriteIMulH, sched]>;
let mayLoad = 1 in
- def rm : ITy<0xF6, MRMSrcMem, t, (outs t.RegClass:$dst1, t.RegClass:$dst2),
- (ins t.MemOperand:$src), "mulx", mulx_args, []>, T8, XD, VEX,
- VVVV, Sched<mulx_rm_sched>;
+ def rm : ITy<0xF6, MRMSrcMem, t, (outs t.RegClass:$dst1, t.RegClass:$dst2),
+ (ins t.MemOperand:$src), "mulx", mulx_args, []>, T8, XD, VEX,
+ VVVV, Sched<mulx_rm_sched>;
let Predicates = [In64BitMode] in {
- def rr_EVEX : ITy<0xF6, MRMSrcReg, t,
- (outs t.RegClass:$dst1, t.RegClass:$dst2),
- (ins t.RegClass:$src), "mulx", mulx_args, []>, T8, XD,
- EVEX, VVVV, Sched<[WriteIMulH, sched]>;
- let mayLoad = 1 in
- def rm_EVEX : ITy<0xF6, MRMSrcMem, t,
+ def rr_EVEX : ITy<0xF6, MRMSrcReg, t,
(outs t.RegClass:$dst1, t.RegClass:$dst2),
- (ins t.MemOperand:$src), "mulx", mulx_args, []>, T8, XD,
- EVEX, VVVV, Sched<mulx_rm_sched>;
+ (ins t.RegClass:$src), "mulx", mulx_args, []>, T8, XD,
+ EVEX, VVVV, Sched<[WriteIMulH, sched]>;
+ let mayLoad = 1 in
+ def rm_EVEX : ITy<0xF6, MRMSrcMem, t,
+ (outs t.RegClass:$dst1, t.RegClass:$dst2),
+ (ins t.MemOperand:$src), "mulx", mulx_args, []>, T8, XD,
+ EVEX, VVVV, Sched<mulx_rm_sched>;
}
// Pseudo instructions to be used when the low result isn't used. The
// instruction is defined to keep the high if both destinations are the same.
@@ -1434,10 +1436,10 @@ multiclass MulX<X86TypeInfo t, X86FoldableSchedWrite sched> {
}
let Uses = [EDX] in
-defm MULX32 : MulX<Xi32, WriteMULX32>;
+ defm MULX32 : MulX<Xi32, WriteMULX32>;
let Uses = [RDX] in
-defm MULX64 : MulX<Xi64, WriteMULX64>, REX_W;
+ defm MULX64 : MulX<Xi64, WriteMULX64>, REX_W;
//===----------------------------------------------------------------------===//
// ADCX and ADOX Instructions
diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
index e27aa4115990..7d5d7cf4a83a 100644
--- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -113,6 +113,27 @@ let Predicates = [HasCMOV, HasCF] in {
(CFCMOV32rr GR32:$src1, (inv_cond_XFORM timm:$cond))>;
def : Pat<(X86cmov GR64:$src1, 0, timm:$cond, EFLAGS),
(CFCMOV64rr GR64:$src1, (inv_cond_XFORM timm:$cond))>;
+
+ def : Pat<(X86cload addr:$src1, 0, timm:$cond, EFLAGS),
+ (CFCMOV16rm addr:$src1, timm:$cond)>;
+ def : Pat<(X86cload addr:$src1, 0, timm:$cond, EFLAGS),
+ (CFCMOV32rm addr:$src1, timm:$cond)>;
+ def : Pat<(X86cload addr:$src1, 0, timm:$cond, EFLAGS),
+ (CFCMOV64rm addr:$src1, timm:$cond)>;
+
+ def : Pat<(X86cload addr:$src2, GR16:$src1, timm:$cond, EFLAGS),
+ (CFCMOV16rm_ND GR16:$src1, addr:$src2, timm:$cond)>;
+ def : Pat<(X86cload addr:$src2, GR32:$src1, timm:$cond, EFLAGS),
+ (CFCMOV32rm_ND GR32:$src1, addr:$src2, timm:$cond)>;
+ def : Pat<(X86cload addr:$src2, GR64:$src1, timm:$cond, EFLAGS),
+ (CFCMOV64rm_ND GR64:$src1, addr:$src2, timm:$cond)>;
+
+ def : Pat<(X86cstore GR16:$src2, addr:$src1, timm:$cond, EFLAGS),
+ (CFCMOV16mr addr:$src1, GR16:$src2, timm:$cond)>;
+ def : Pat<(X86cstore GR32:$src2, addr:$src1, timm:$cond, EFLAGS),
+ (CFCMOV32mr addr:$src1, GR32:$src2, timm:$cond)>;
+ def : Pat<(X86cstore GR64:$src2, addr:$src1, timm:$cond, EFLAGS),
+ (CFCMOV64mr addr:$src1, GR64:$src2, timm:$cond)>;
}
// SetCC instructions.
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 6fb6e1633b0c..5a8177e2b360 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1035,27 +1035,27 @@ multiclass ATOMIC_RMW_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
(ins GR8:$val, i8mem:$ptr),
!strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
[(set GR8:$dst,
- (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))]>;
+ (!cast<PatFrag>(frag # "_i8") addr:$ptr, GR8:$val))]>;
def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst),
(ins GR16:$val, i16mem:$ptr),
!strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
[(set
GR16:$dst,
- (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))]>,
+ (!cast<PatFrag>(frag # "_i16") addr:$ptr, GR16:$val))]>,
OpSize16;
def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$val, i32mem:$ptr),
!strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
[(set
GR32:$dst,
- (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))]>,
+ (!cast<PatFrag>(frag # "_i32") addr:$ptr, GR32:$val))]>,
OpSize32;
def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
(ins GR64:$val, i64mem:$ptr),
!strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
[(set
GR64:$dst,
- (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))]>;
+ (!cast<PatFrag>(frag # "_i64") addr:$ptr, GR64:$val))]>;
}
}
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index 162e322712a6..038100b8264d 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -15,6 +15,15 @@ def SDTX86FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisFP<1>,
def SDTX86Ccmp : SDTypeProfile<1, 5,
[SDTCisVT<3, i8>, SDTCisVT<4, i8>, SDTCisVT<5, i32>]>;
+// RES = op PTR, PASSTHRU, COND, EFLAGS
+def SDTX86Cload : SDTypeProfile<1, 4,
+ [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisSameAs<0, 2>,
+ SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+// op VAL, PTR, COND, EFLAGS
+def SDTX86Cstore : SDTypeProfile<0, 4,
+ [SDTCisInt<0>, SDTCisPtrTy<1>,
+ SDTCisVT<2, i8>, SDTCisVT<3, i32>]>;
+
def SDTX86Cmov : SDTypeProfile<1, 4,
[SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
@@ -144,6 +153,9 @@ def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>;
def X86ccmp : SDNode<"X86ISD::CCMP", SDTX86Ccmp>;
def X86ctest : SDNode<"X86ISD::CTEST", SDTX86Ccmp>;
+def X86cload : SDNode<"X86ISD::CLOAD", SDTX86Cload, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86cstore : SDNode<"X86ISD::CSTORE", SDTX86Cstore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>;
def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond,
[SDNPHasChain]>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 1f93d293bc2a..069a1ec9a598 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3224,18 +3224,18 @@ int X86::getCCMPCondFlagsFromCondCode(X86::CondCode CC) {
#define GET_X86_NF_TRANSFORM_TABLE
#define GET_X86_ND2NONND_TABLE
#include "X86GenInstrMapping.inc"
-unsigned X86::getNFVariant(unsigned Opc) {
- ArrayRef<X86TableEntry> Table = ArrayRef(X86NFTransformTable);
+
+static unsigned getNewOpcFromTable(ArrayRef<X86TableEntry> Table,
+ unsigned Opc) {
const auto I = llvm::lower_bound(Table, Opc);
return (I == Table.end() || I->OldOpc != Opc) ? 0U : I->NewOpc;
}
+unsigned X86::getNFVariant(unsigned Opc) {
+ return getNewOpcFromTable(X86NFTransformTable, Opc);
+}
-static unsigned getNonNDVariant(unsigned Opc, const X86Subtarget &STI) {
- if (!STI.hasNDD())
- return 0U;
- ArrayRef<X86TableEntry> Table = ArrayRef(X86ND2NonNDTable);
- const auto I = llvm::lower_bound(Table, Opc);
- return (I == Table.end() || I->OldOpc != Opc) ? 0U : I->NewOpc;
+unsigned X86::getNonNDVariant(unsigned Opc) {
+ return getNewOpcFromTable(X86ND2NonNDTable, Opc);
}
/// Return the inverse of the specified condition,
@@ -7202,7 +7202,7 @@ static MachineInstr *fuseInst(MachineFunction &MF, unsigned Opcode,
return MIB;
}
-static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
+static MachineInstr *makeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
ArrayRef<MachineOperand> MOs,
MachineBasicBlock::iterator InsertPt,
MachineInstr &MI) {
@@ -7282,6 +7282,12 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
}
}
break;
+ case X86::MOV32r0:
+ if (auto *NewMI =
+ makeM0Inst(*this, (Size == 4) ? X86::MOV32mi : X86::MOV64mi32, MOs,
+ InsertPt, MI))
+ return NewMI;
+ break;
}
return nullptr;
@@ -7382,16 +7388,12 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
Size, Alignment))
return CustomMI;
- if (Opc == X86::MOV32r0)
- if (auto *NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI))
- return NewMI;
-
// Folding a memory location into the two-address part of a two-address
// instruction is different than folding it other places. It requires
// replacing the *two* registers with the memory location.
//
// Utilize the mapping NonNDD -> RMW for the NDD variant.
- unsigned NonNDOpc = getNonNDVariant(Opc, Subtarget);
+ unsigned NonNDOpc = Subtarget.hasNDD() ? X86::getNonNDVariant(Opc) : 0U;
const X86FoldTableEntry *I =
IsTwoAddr ? lookupTwoAddrFoldTable(NonNDOpc ? NonNDOpc : Opc)
: lookupFoldTable(Opc, OpNum);
@@ -7483,6 +7485,10 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
for (auto Op : Ops) {
MachineOperand &MO = MI.getOperand(Op);
auto SubReg = MO.getSubReg();
+ // MOV32r0 is special b/c it's used to clear a 64-bit register too.
+ // (See patterns for MOV32r0 in TD files).
+ if (MI.getOpcode() == X86::MOV32r0 && SubReg == X86::sub_32bit)
+ continue;
if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi))
return nullptr;
}
@@ -7508,7 +7514,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
switch (Opc) {
default:
// NDD can be folded into RMW though its Op0 and Op1 are not tied.
- return getNonNDVariant(Opc, Subtarget) ? Impl() : nullptr;
+ return (Subtarget.hasNDD() ? X86::getNonNDVariant(Opc) : 0U) ? Impl()
+ : nullptr;
case X86::TEST8rr:
NewOpc = X86::CMP8ri;
RCSize = 1;
@@ -8825,6 +8832,11 @@ bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
Opcode == X86::PLDTILECFGV)
return true;
+ // Frame setup and destory can't be scheduled around.
+ if (MI.getFlag(MachineInstr::FrameSetup) ||
+ MI.getFlag(MachineInstr::FrameDestroy))
+ return true;
+
return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
}
@@ -10324,7 +10336,8 @@ struct LDTLSCleanup : public MachineFunctionPass {
return false;
}
- MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+ MachineDominatorTree *DT =
+ &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
return VisitNode(DT->getRootNode(), 0);
}
@@ -10411,7 +10424,7 @@ struct LDTLSCleanup : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 9eb2bd56b2ab..eaa3dd089394 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -80,6 +80,9 @@ int getCCMPCondFlagsFromCondCode(CondCode CC);
// Get the opcode of corresponding NF variant.
unsigned getNFVariant(unsigned Opc);
+// Get the opcode of corresponding NonND variant.
+unsigned getNonNDVariant(unsigned Opc);
+
/// GetOppositeBranchCondition - Return the inverse of the specified cond,
/// e.g. turning COND_E to COND_NE.
CondCode GetOppositeBranchCondition(CondCode CC);
diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td
index c4da0e50a1dd..c9ff8abb02ef 100644
--- a/llvm/lib/Target/X86/X86InstrMisc.td
+++ b/llvm/lib/Target/X86/X86InstrMisc.td
@@ -823,27 +823,27 @@ multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag>
!strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
[(set
GR8:$dst,
- (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))]>;
+ (!cast<PatFrag>(frag # "_i8") addr:$ptr, GR8:$val))]>;
def NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst),
(ins GR16:$val, i16mem:$ptr),
!strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
[(set
GR16:$dst,
- (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))]>,
+ (!cast<PatFrag>(frag # "_i16") addr:$ptr, GR16:$val))]>,
OpSize16;
def NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$val, i32mem:$ptr),
!strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
[(set
GR32:$dst,
- (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))]>,
+ (!cast<PatFrag>(frag # "_i32") addr:$ptr, GR32:$val))]>,
OpSize32;
def NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst),
(ins GR64:$val, i64mem:$ptr),
!strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
[(set
GR64:$dst,
- (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))]>;
+ (!cast<PatFrag>(frag # "_i64") addr:$ptr, GR64:$val))]>;
}
}
diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
index 4dfe7556df00..d5c23295ee9a 100644
--- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
+++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
@@ -237,7 +237,7 @@ void X86LoadValueInjectionLoadHardeningPass::getAnalysisUsage(
AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
AU.addRequired<MachineLoopInfo>();
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
AU.addRequired<MachineDominanceFrontier>();
AU.setPreservesCFG();
}
@@ -270,7 +270,7 @@ bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction(
TRI = STI->getRegisterInfo();
LLVM_DEBUG(dbgs() << "Building gadget graph...\n");
const auto &MLI = getAnalysis<MachineLoopInfo>();
- const auto &MDT = getAnalysis<MachineDominatorTree>();
+ const auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
const auto &MDF = getAnalysis<MachineDominanceFrontier>();
std::unique_ptr<MachineGadgetGraph> Graph = getGadgetGraph(MF, MLI, MDT, MDF);
LLVM_DEBUG(dbgs() << "Building gadget graph... Done\n");
@@ -439,9 +439,8 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph(
// Remove duplicate transmitters
llvm::sort(DefTransmitters);
- DefTransmitters.erase(
- std::unique(DefTransmitters.begin(), DefTransmitters.end()),
- DefTransmitters.end());
+ DefTransmitters.erase(llvm::unique(DefTransmitters),
+ DefTransmitters.end());
};
// Find all of the transmitters
@@ -801,7 +800,7 @@ bool X86LoadValueInjectionLoadHardeningPass::instrUsesRegToBranch(
INITIALIZE_PASS_BEGIN(X86LoadValueInjectionLoadHardeningPass, PASS_KEY,
"X86 LVI load hardening", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
INITIALIZE_PASS_END(X86LoadValueInjectionLoadHardeningPass, PASS_KEY,
"X86 LVI load hardening", false, false)
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 8f6fba8ac22c..00f58f9432e4 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -67,8 +67,8 @@ class X86MCInstLower {
public:
X86MCInstLower(const MachineFunction &MF, X86AsmPrinter &asmprinter);
- std::optional<MCOperand> LowerMachineOperand(const MachineInstr *MI,
- const MachineOperand &MO) const;
+ MCOperand LowerMachineOperand(const MachineInstr *MI,
+ const MachineOperand &MO) const;
void Lower(const MachineInstr *MI, MCInst &OutMI) const;
MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const;
@@ -326,9 +326,8 @@ static unsigned getRetOpcode(const X86Subtarget &Subtarget) {
return Subtarget.is64Bit() ? X86::RET64 : X86::RET32;
}
-std::optional<MCOperand>
-X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
- const MachineOperand &MO) const {
+MCOperand X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
+ const MachineOperand &MO) const {
switch (MO.getType()) {
default:
MI->print(errs());
@@ -336,7 +335,7 @@ X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
case MachineOperand::MO_Register:
// Ignore all implicit register operands.
if (MO.isImplicit())
- return std::nullopt;
+ return MCOperand();
return MCOperand::createReg(MO.getReg());
case MachineOperand::MO_Immediate:
return MCOperand::createImm(MO.getImm());
@@ -355,7 +354,7 @@ X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
MO, AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress()));
case MachineOperand::MO_RegisterMask:
// Ignore call clobbers.
- return std::nullopt;
+ return MCOperand();
}
}
@@ -398,8 +397,8 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
OutMI.setOpcode(MI->getOpcode());
for (const MachineOperand &MO : MI->operands())
- if (auto MaybeMCOp = LowerMachineOperand(MI, MO))
- OutMI.addOperand(*MaybeMCOp);
+ if (auto Op = LowerMachineOperand(MI, MO); Op.isValid())
+ OutMI.addOperand(Op);
bool In64BitMode = AsmPrinter.getSubtarget().is64Bit();
if (X86::optimizeInstFromVEX3ToVEX2(OutMI, MI->getDesc()) ||
@@ -867,8 +866,8 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
for (const MachineOperand &MO :
llvm::drop_begin(FaultingMI.operands(), OperandsBeginIdx))
- if (auto MaybeOperand = MCIL.LowerMachineOperand(&FaultingMI, MO))
- MI.addOperand(*MaybeOperand);
+ if (auto Op = MCIL.LowerMachineOperand(&FaultingMI, MO); Op.isValid())
+ MI.addOperand(Op);
OutStreamer->AddComment("on-fault: " + HandlerLabel->getName());
OutStreamer->emitInstruction(MI, getSubtargetInfo());
@@ -1139,9 +1138,10 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
// emit nops appropriately sized to keep the sled the same size in every
// situation.
for (unsigned I = 0; I < MI.getNumOperands(); ++I)
- if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) {
- assert(Op->isReg() && "Only support arguments in registers");
- SrcRegs[I] = getX86SubSuperRegister(Op->getReg(), 64);
+ if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I));
+ Op.isValid()) {
+ assert(Op.isReg() && "Only support arguments in registers");
+ SrcRegs[I] = getX86SubSuperRegister(Op.getReg(), 64);
assert(SrcRegs[I].isValid() && "Invalid operand");
if (SrcRegs[I] != DestRegs[I]) {
UsedMask[I] = true;
@@ -1237,10 +1237,11 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
// In case the arguments are already in the correct register, we emit nops
// appropriately sized to keep the sled the same size in every situation.
for (unsigned I = 0; I < MI.getNumOperands(); ++I)
- if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) {
+ if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I));
+ Op.isValid()) {
// TODO: Is register only support adequate?
- assert(Op->isReg() && "Only supports arguments in registers");
- SrcRegs[I] = getX86SubSuperRegister(Op->getReg(), 64);
+ assert(Op.isReg() && "Only supports arguments in registers");
+ SrcRegs[I] = getX86SubSuperRegister(Op.getReg(), 64);
assert(SrcRegs[I].isValid() && "Invalid operand");
if (SrcRegs[I] != DestRegs[I]) {
UsedMask[I] = true;
@@ -1354,8 +1355,8 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
MCInst Ret;
Ret.setOpcode(OpCode);
for (auto &MO : drop_begin(MI.operands()))
- if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
- Ret.addOperand(*MaybeOperand);
+ if (auto Op = MCIL.LowerMachineOperand(&MI, MO); Op.isValid())
+ Ret.addOperand(Op);
OutStreamer->emitInstruction(Ret, getSubtargetInfo());
emitX86Nops(*OutStreamer, 10, Subtarget);
recordSled(CurSled, MI, SledKind::FUNCTION_EXIT, 2);
@@ -1417,8 +1418,8 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
// indeed a tail call.
OutStreamer->AddComment("TAILCALL");
for (auto &MO : TCOperands)
- if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
- TC.addOperand(*MaybeOperand);
+ if (auto Op = MCIL.LowerMachineOperand(&MI, MO); Op.isValid())
+ TC.addOperand(Op);
OutStreamer->emitInstruction(TC, getSubtargetInfo());
if (IsConditional)
@@ -1898,6 +1899,62 @@ static void addConstantComments(const MachineInstr *MI,
break;
}
+#define INSTR_CASE(Prefix, Instr, Suffix, Postfix) \
+ case X86::Prefix##Instr##Suffix##rm##Postfix:
+
+#define CASE_ARITH_RM(Instr) \
+ INSTR_CASE(, Instr, , ) /* SSE */ \
+ INSTR_CASE(V, Instr, , ) /* AVX-128 */ \
+ INSTR_CASE(V, Instr, Y, ) /* AVX-256 */ \
+ INSTR_CASE(V, Instr, Z128, ) \
+ INSTR_CASE(V, Instr, Z128, k) \
+ INSTR_CASE(V, Instr, Z128, kz) \
+ INSTR_CASE(V, Instr, Z256, ) \
+ INSTR_CASE(V, Instr, Z256, k) \
+ INSTR_CASE(V, Instr, Z256, kz) \
+ INSTR_CASE(V, Instr, Z, ) \
+ INSTR_CASE(V, Instr, Z, k) \
+ INSTR_CASE(V, Instr, Z, kz)
+
+ // TODO: Add additional instructions when useful.
+ CASE_ARITH_RM(PMADDUBSW) {
+ unsigned SrcIdx = getSrcIdx(MI, 1);
+ if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) {
+ if (C->getType()->getScalarSizeInBits() == 8) {
+ std::string Comment;
+ raw_string_ostream CS(Comment);
+ unsigned VectorWidth =
+ X86::getVectorRegisterWidth(MI->getDesc().operands()[0]);
+ CS << "[";
+ printConstant(C, VectorWidth, CS);
+ CS << "]";
+ OutStreamer.AddComment(CS.str());
+ }
+ }
+ break;
+ }
+
+ CASE_ARITH_RM(PMADDWD)
+ CASE_ARITH_RM(PMULLW)
+ CASE_ARITH_RM(PMULHW)
+ CASE_ARITH_RM(PMULHUW)
+ CASE_ARITH_RM(PMULHRSW) {
+ unsigned SrcIdx = getSrcIdx(MI, 1);
+ if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) {
+ if (C->getType()->getScalarSizeInBits() == 16) {
+ std::string Comment;
+ raw_string_ostream CS(Comment);
+ unsigned VectorWidth =
+ X86::getVectorRegisterWidth(MI->getDesc().operands()[0]);
+ CS << "[";
+ printConstant(C, VectorWidth, CS);
+ CS << "]";
+ OutStreamer.AddComment(CS.str());
+ }
+ }
+ break;
+ }
+
#define MASK_AVX512_CASE(Instr) \
case Instr: \
case Instr##k: \
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
index 2e88e01ce7fd..7b57f7c23bf4 100644
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
@@ -13,6 +13,14 @@
using namespace llvm;
+yaml::X86MachineFunctionInfo::X86MachineFunctionInfo(
+ const llvm::X86MachineFunctionInfo &MFI)
+ : AMXProgModel(MFI.getAMXProgModel()) {}
+
+void yaml::X86MachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
+ MappingTraits<X86MachineFunctionInfo>::mapping(YamlIO, *this);
+}
+
MachineFunctionInfo *X86MachineFunctionInfo::clone(
BumpPtrAllocator &Allocator, MachineFunction &DestMF,
const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
@@ -20,6 +28,11 @@ MachineFunctionInfo *X86MachineFunctionInfo::clone(
return DestMF.cloneInfo<X86MachineFunctionInfo>(*this);
}
+void X86MachineFunctionInfo::initializeBaseYamlFields(
+ const yaml::X86MachineFunctionInfo &YamlMFI) {
+ AMXProgModel = YamlMFI.AMXProgModel;
+}
+
void X86MachineFunctionInfo::anchor() { }
void X86MachineFunctionInfo::setRestoreBasePointer(const MachineFunction *MF) {
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index 8aaa49945f9d..315aeef65d28 100644
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -16,13 +16,43 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Support/YAMLTraits.h"
#include <set>
namespace llvm {
enum AMXProgModelEnum { None = 0, DirectReg = 1, ManagedRA = 2 };
+class X86MachineFunctionInfo;
+
+namespace yaml {
+template <> struct ScalarEnumerationTraits<AMXProgModelEnum> {
+ static void enumeration(IO &YamlIO, AMXProgModelEnum &Value) {
+ YamlIO.enumCase(Value, "None", AMXProgModelEnum::None);
+ YamlIO.enumCase(Value, "DirectReg", AMXProgModelEnum::DirectReg);
+ YamlIO.enumCase(Value, "ManagedRA", AMXProgModelEnum::ManagedRA);
+ }
+};
+
+struct X86MachineFunctionInfo final : public yaml::MachineFunctionInfo {
+ AMXProgModelEnum AMXProgModel;
+
+ X86MachineFunctionInfo() = default;
+ X86MachineFunctionInfo(const llvm::X86MachineFunctionInfo &MFI);
+
+ void mappingImpl(yaml::IO &YamlIO) override;
+ ~X86MachineFunctionInfo() = default;
+};
+
+template <> struct MappingTraits<X86MachineFunctionInfo> {
+ static void mapping(IO &YamlIO, X86MachineFunctionInfo &MFI) {
+ YamlIO.mapOptional("amxProgModel", MFI.AMXProgModel);
+ }
+};
+} // end namespace yaml
+
/// X86MachineFunctionInfo - This class is derived from MachineFunction and
/// contains private X86 target-specific information for each MachineFunction.
class X86MachineFunctionInfo : public MachineFunctionInfo {
@@ -119,10 +149,6 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// other tools to detect the extended record.
bool HasSwiftAsyncContext = false;
- /// True if this function has tile virtual register. This is used to
- /// determine if we should insert tilerelease in frame lowering.
- bool HasVirtualTileReg = false;
-
/// Ajust stack for push2/pop2
bool PadForPush2Pop2 = false;
@@ -160,6 +186,8 @@ public:
const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
const override;
+ void initializeBaseYamlFields(const yaml::X86MachineFunctionInfo &YamlMFI);
+
bool getForceFramePointer() const { return ForceFramePointer;}
void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
@@ -250,9 +278,6 @@ public:
bool hasSwiftAsyncContext() const { return HasSwiftAsyncContext; }
void setHasSwiftAsyncContext(bool v) { HasSwiftAsyncContext = v; }
- bool hasVirtualTileReg() const { return HasVirtualTileReg; }
- void setHasVirtualTileReg(bool v) { HasVirtualTileReg = v; }
-
bool padForPush2Pop2() const { return PadForPush2Pop2; }
void setPadForPush2Pop2(bool V) { PadForPush2Pop2 = V; }
diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp
index 75ad58e5cdcb..ecf923c0e2a2 100644
--- a/llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -237,11 +237,15 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) {
}
bool X86PreTileConfig::runOnMachineFunction(MachineFunction &MF) {
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ // Early exit in the common case of non-AMX code.
+ if (X86FI->getAMXProgModel() != AMXProgModelEnum::ManagedRA)
+ return false;
+
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
const TargetInstrInfo *TII = ST.getInstrInfo();
const TargetRegisterInfo *TRI = ST.getRegisterInfo();
const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);
- X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
BitVector AMXRegs(TRI->getNumRegs());
for (unsigned I = 0; I < RC->getNumRegs(); I++)
@@ -301,7 +305,6 @@ bool X86PreTileConfig::runOnMachineFunction(MachineFunction &MF) {
// There's no AMX instruction if we didn't find a tile config live in point.
if (CfgNeedInsert.empty())
return false;
- X86FI->setHasVirtualTileReg(true);
// Avoid to insert ldtilecfg before any shape defs.
SmallVector<MachineBasicBlock *, 8> WorkList;
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 7107dbc63e27..420c42928c1c 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -1631,17 +1631,25 @@ def : InstRW<[Zn4WriteFCmp64], (instregex
)>;
// MOV Instructions
-def Zn4MOVS: SchedWriteRes<[Zn4FPFMisc12]> {
+def Zn4MOVDUPZ: SchedWriteRes<[Zn4FPFMisc12]> {
let Latency = 2;
let ReleaseAtCycles = [2];
let NumMicroOps = 1;
}
+def : InstRW<[Zn4MOVDUPZ], (instregex
+ "(V?)VMOVDDUP(Z|Z128|Z256)(rr|rrk|rrkz)"
+ )>;
+
+def Zn4MOVS: SchedWriteRes<[Zn4FPFMisc12]> {
+ let Latency = 2;
+ let ReleaseAtCycles = [1];
+ let NumMicroOps = 1;
+}
def : InstRW<[Zn4MOVS], (instregex
"(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z128?|Z256?)(rr|rrk|rrkz)",
"(V?)PMOV(SX|QD|UZ|ZX)(BD|BQ|BW?)(Y|Z128?)(rr|rrk|rrkz)",
"(V?)PMOV(SX|US|ZX)(DQ|WD|QW|WQ?)(Y|Z128?)(rr|rrk|rrkz)",
- "(V?)VMOVDDUP(Z|Z128|Z256)(rr|rrk|rrkz)",
- "VPMOV(DB|DW|QB|QD|QW|SDB|SDW|SQB|SQD|SQW|SWB|USDB|USDW|USQB|USQD|USWB|WB)(Z128?)(rr|rrk|rrkz)"
+ "VPMOV(DB|DW|QB|QD|QW|SDB|SDW|SQB|SQD|SQW|SWB|USDB|USDW|USQB|USQD|USWB|WB)(Z128?|Z256?)(rr|rrk|rrkz)"
)>;
def Zn4MOVSZ: SchedWriteRes<[Zn4FPFMisc12]> {
diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index 489c8f492524..46317cb33776 100644
--- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -822,8 +822,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
// Sort and unique the codes to minimize them.
llvm::sort(UncondCodeSeq);
- UncondCodeSeq.erase(std::unique(UncondCodeSeq.begin(), UncondCodeSeq.end()),
- UncondCodeSeq.end());
+ UncondCodeSeq.erase(llvm::unique(UncondCodeSeq), UncondCodeSeq.end());
// Build a checking version of the successor.
BuildCheckingBlockForSuccAndConds(MBB, *UncondSucc, /*SuccCount*/ 1,
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 27542e54829b..d4e642c7df9c 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -31,6 +31,8 @@
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/CodeGen/MIRParser/MIParser.h"
+#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
@@ -344,6 +346,24 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
return I.get();
}
+yaml::MachineFunctionInfo *X86TargetMachine::createDefaultFuncInfoYAML() const {
+ return new yaml::X86MachineFunctionInfo();
+}
+
+yaml::MachineFunctionInfo *
+X86TargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
+ const auto *MFI = MF.getInfo<X86MachineFunctionInfo>();
+ return new yaml::X86MachineFunctionInfo(*MFI);
+}
+
+bool X86TargetMachine::parseMachineFunctionInfo(
+ const yaml::MachineFunctionInfo &MFI, PerFunctionMIParsingState &PFS,
+ SMDiagnostic &Error, SMRange &SourceRange) const {
+ const auto &YamlMFI = static_cast<const yaml::X86MachineFunctionInfo &>(MFI);
+ PFS.MF.getInfo<X86MachineFunctionInfo>()->initializeBaseYamlFields(YamlMFI);
+ return false;
+}
+
bool X86TargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
assert(SrcAS != DestAS && "Expected different address spaces!");
diff --git a/llvm/lib/Target/X86/X86TargetMachine.h b/llvm/lib/Target/X86/X86TargetMachine.h
index 4a5f20fcc017..ec4a93e9c9d4 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.h
+++ b/llvm/lib/Target/X86/X86TargetMachine.h
@@ -58,8 +58,15 @@ public:
createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F,
const TargetSubtargetInfo *STI) const override;
- void registerPassBuilderCallbacks(PassBuilder &PB,
- bool PopulateClassToPassNames) override;
+ yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override;
+ yaml::MachineFunctionInfo *
+ convertFuncInfoToYAML(const MachineFunction &MF) const override;
+ bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &,
+ PerFunctionMIParsingState &PFS,
+ SMDiagnostic &Error,
+ SMRange &SourceRange) const override;
+
+ void registerPassBuilderCallbacks(PassBuilder &PB) override;
Error buildCodeGenPipeline(ModulePassManager &, raw_pwrite_stream &,
raw_pwrite_stream *, CodeGenFileType,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 0a23bf251676..bb0270c018c9 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -176,6 +176,27 @@ unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
return 8;
}
+bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty) const {
+ if (!ST->hasCF())
+ return false;
+ if (!Ty)
+ return true;
+ // Conditional faulting is supported by CFCMOV, which only accepts
+ // 16/32/64-bit operands.
+ // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
+ // profitable.
+ if (!Ty->isIntegerTy())
+ return false;
+ switch (cast<IntegerType>(Ty)->getBitWidth()) {
+ default:
+ return false;
+ case 16:
+ case 32:
+ case 64:
+ return true;
+ }
+}
+
TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
unsigned PreferVectorWidth = ST->getPreferVectorWidth();
@@ -851,7 +872,9 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
{ ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
{ ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw
- { ISD::MUL, MVT::v64i8, { 5, 10,10,11 } },
+ { ISD::MUL, MVT::v16i8, { 4, 12, 4, 5 } }, // extend/pmullw/trunc
+ { ISD::MUL, MVT::v32i8, { 3, 10, 7,10 } }, // pmaddubsw
+ { ISD::MUL, MVT::v64i8, { 3, 11, 7,10 } }, // pmaddubsw
{ ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw
{ ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
@@ -1117,7 +1140,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
{ ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq
{ ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
- { ISD::MUL, MVT::v32i8, { 6, 11,10,19 } }, // unpack/pmullw
+ { ISD::MUL, MVT::v32i8, { 4, 8, 8,16 } }, // pmaddubsw
{ ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
{ ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
{ ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
@@ -1168,7 +1191,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
// We don't have to scalarize unsupported ops. We can issue two half-sized
// operations and we only need to extract the upper YMM half.
// Two ops + 1 extract + 1 insert = 4.
- { ISD::MUL, MVT::v32i8, { 12, 13, 22, 23 } }, // unpack/pmullw + split
+ { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split
+ { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
{ ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
{ ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
{ ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
@@ -1308,7 +1332,6 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
{ ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
- { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*unpack/2*pmullw/2*and/pack
{ ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
};
@@ -1317,6 +1340,15 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
+ static const CostKindTblEntry SSSE3CostTable[] = {
+ { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
+ };
+
+ if (ST->hasSSSE3())
+ if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
+ if (auto KindCost = Entry->Cost[CostKind])
+ return LT.first * *KindCost;
+
static const CostKindTblEntry SSE2CostTable[] = {
// We don't correctly identify costs of casts because they are marked as
// custom.
@@ -1353,7 +1385,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
{ ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
{ ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
- { ISD::MUL, MVT::v16i8, { 5, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
+ { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
{ ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
{ ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
{ ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
@@ -4061,7 +4093,7 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
};
static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
- { ISD::ABS, MVT::i64, { 1, 2, 3, 4 } }, // SUB+CMOV
+ { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
{ ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
{ ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
{ ISD::CTLZ, MVT::i64, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
@@ -4082,9 +4114,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::UMULO, MVT::i64, { 2 } }, // mulq + seto
};
static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
- { ISD::ABS, MVT::i32, { 1, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV
- { ISD::ABS, MVT::i16, { 2, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV
- { ISD::ABS, MVT::i8, { 2, 4, 4, 4 } }, // SUB+XOR+SRA
+ { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
+ { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
+ { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA
{ ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
{ ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
{ ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
@@ -4259,6 +4291,37 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
if (ISD != ISD::DELETED_NODE) {
+ auto adjustTableCost = [&](int ISD, unsigned Cost,
+ std::pair<InstructionCost, MVT> LT,
+ FastMathFlags FMF) -> InstructionCost {
+ InstructionCost LegalizationCost = LT.first;
+ MVT MTy = LT.second;
+
+ // If there are no NANs to deal with, then these are reduced to a
+ // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
+ // assume is used in the non-fast case.
+ if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
+ if (FMF.noNaNs())
+ return LegalizationCost * 1;
+ }
+
+ // For cases where some ops can be folded into a load/store, assume free.
+ if (MTy.isScalarInteger()) {
+ if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
+ if (const Instruction *II = ICA.getInst()) {
+ if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
+ return TTI::TCC_Free;
+ if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
+ if (LI->hasOneUse())
+ return TTI::TCC_Free;
+ }
+ }
+ }
+ }
+
+ return LegalizationCost * (int)Cost;
+ };
+
// Legalize the type.
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
MVT MTy = LT.second;
@@ -4277,180 +4340,132 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
return LT.first;
- auto adjustTableCost = [](int ISD, unsigned Cost,
- InstructionCost LegalizationCost,
- FastMathFlags FMF) {
- // If there are no NANs to deal with, then these are reduced to a
- // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
- // assume is used in the non-fast case.
- if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
- if (FMF.noNaNs())
- return LegalizationCost * 1;
- }
- return LegalizationCost * (int)Cost;
- };
-
if (ST->useGLMDivSqrtCosts())
if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
if (ST->useSLMArithCosts())
if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
if (ST->hasVBMI2())
if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
if (ST->hasBITALG())
if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
if (ST->hasVPOPCNTDQ())
if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
if (ST->hasGFNI())
if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
if (ST->hasCDI())
if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
if (ST->hasBWI())
if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
if (ST->hasAVX512())
if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
if (ST->hasXOP())
if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
if (ST->hasAVX2())
if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
if (ST->hasSSE42())
if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
if (ST->hasSSE41())
if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
if (ST->hasSSSE3())
if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
if (ST->hasSSE2())
if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
if (ST->hasSSE1())
if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
if (ST->hasBMI()) {
if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
}
if (ST->hasLZCNT()) {
if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
}
if (ST->hasPOPCNT()) {
if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
- }
-
- if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
- if (const Instruction *II = ICA.getInst()) {
- if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
- return TTI::TCC_Free;
- if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
- if (LI->hasOneUse())
- return TTI::TCC_Free;
- }
- }
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
}
if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first,
- ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
- return adjustTableCost(Entry->ISD, *KindCost, LT.first, ICA.getFlags());
+ return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
}
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
@@ -5076,7 +5091,12 @@ X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
auto VT = TLI->getValueType(DL, SrcVTy);
InstructionCost Cost = 0;
- if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
+ MVT Ty = LT.second;
+ if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
+ // APX masked load/store for scalar is cheap.
+ return Cost + LT.first;
+
+ if (VT.isSimple() && Ty != VT.getSimpleVT() &&
LT.second.getVectorNumElements() == NumElem)
// Promotion requires extend/truncate for data and a shuffle for mask.
Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt,
@@ -5084,9 +5104,9 @@ X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt,
CostKind, 0, nullptr);
- else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
+ else if (LT.first * Ty.getVectorNumElements() > NumElem) {
auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
- LT.second.getVectorNumElements());
+ Ty.getVectorNumElements());
// Expanding requires fill mask with zeroes
Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt,
CostKind, 0, MaskTy);
@@ -5905,14 +5925,14 @@ bool X86TTIImpl::canMacroFuseCmp() {
}
bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
- if (!ST->hasAVX())
- return false;
+ Type *ScalarTy = DataTy->getScalarType();
- // The backend can't handle a single element vector.
- if (isa<VectorType>(DataTy) &&
- cast<FixedVectorType>(DataTy)->getNumElements() == 1)
+ // The backend can't handle a single element vector w/o CFCMOV.
+ if (isa<VectorType>(DataTy) && cast<FixedVectorType>(DataTy)->getNumElements() == 1)
+ return ST->hasCF() && hasConditionalLoadStoreForType(ScalarTy);
+
+ if (!ST->hasAVX())
return false;
- Type *ScalarTy = DataTy->getScalarType();
if (ScalarTy->isPointerTy())
return true;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index e14dc9fc0905..e6bb4720071d 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -132,6 +132,7 @@ public:
/// @{
unsigned getNumberOfRegisters(unsigned ClassID) const;
+ bool hasConditionalLoadStoreForType(Type *Ty = nullptr) const;
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
unsigned getMaxInterleaveFactor(ElementCount VF);
diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp
index 5cada924e006..ebe48910225f 100644
--- a/llvm/lib/Target/X86/X86TileConfig.cpp
+++ b/llvm/lib/Target/X86/X86TileConfig.cpp
@@ -77,6 +77,11 @@ INITIALIZE_PASS_END(X86TileConfig, DEBUG_TYPE, "Tile Register Configure", false,
false)
bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ // Early exit in the common case of non-AMX code.
+ if (X86FI->getAMXProgModel() != AMXProgModelEnum::ManagedRA)
+ return false;
+
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
const TargetRegisterInfo *TRI = ST.getRegisterInfo();
const TargetInstrInfo *TII = ST.getInstrInfo();
diff --git a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
index 5e91cce1068b..7503bf1561cc 100644
--- a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
+++ b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -79,7 +79,7 @@ static bool replaceConstantExprOp(ConstantExpr *CE, Pass *P) {
do {
SmallVector<WeakTrackingVH, 8> WUsers(CE->users());
llvm::sort(WUsers);
- WUsers.erase(std::unique(WUsers.begin(), WUsers.end()), WUsers.end());
+ WUsers.erase(llvm::unique(WUsers), WUsers.end());
while (!WUsers.empty())
if (WeakTrackingVH WU = WUsers.pop_back_val()) {
if (PHINode *PN = dyn_cast<PHINode>(WU)) {
diff --git a/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp b/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp
index eaf046630299..b0ce624a495f 100644
--- a/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp
+++ b/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp
@@ -572,7 +572,7 @@ ParseStatus XtensaAsmParser::parseRegister(OperandVector &Operands,
case AsmToken::Integer:
if (!SR)
return ParseStatus::NoMatch;
- RegName = StringRef(std::to_string(getLexer().getTok().getIntVal()));
+ RegName = getLexer().getTok().getString();
RegNo = MatchRegisterName(RegName);
if (RegNo == 0)
RegNo = MatchRegisterAltName(RegName);
diff --git a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
index e222919b28dc..3f99387f759d 100644
--- a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
@@ -42,9 +43,20 @@ getModifierVariantKind(XtensaCP::XtensaCPModifier Modifier) {
}
void XtensaAsmPrinter::emitInstruction(const MachineInstr *MI) {
- MCInst LoweredMI;
- lowerToMCInst(MI, LoweredMI);
- EmitToStreamer(*OutStreamer, LoweredMI);
+ unsigned Opc = MI->getOpcode();
+
+ switch (Opc) {
+ case Xtensa::BR_JT:
+ EmitToStreamer(
+ *OutStreamer,
+ MCInstBuilder(Xtensa::JX).addReg(MI->getOperand(0).getReg()));
+ return;
+ default:
+ MCInst LoweredMI;
+ lowerToMCInst(MI, LoweredMI);
+ EmitToStreamer(*OutStreamer, LoweredMI);
+ return;
+ }
}
void XtensaAsmPrinter::emitMachineConstantPoolValue(
@@ -52,16 +64,27 @@ void XtensaAsmPrinter::emitMachineConstantPoolValue(
XtensaConstantPoolValue *ACPV = static_cast<XtensaConstantPoolValue *>(MCPV);
MCSymbol *MCSym;
- assert(ACPV->isExtSymbol() && "unrecognized constant pool value");
-
- XtensaConstantPoolSymbol *XtensaSym = cast<XtensaConstantPoolSymbol>(ACPV);
- const char *Sym = XtensaSym->getSymbol();
- std::string SymName(Sym);
-
- if (XtensaSym->isPrivateLinkage())
- SymName = ".L" + SymName;
+ if (ACPV->isBlockAddress()) {
+ const BlockAddress *BA =
+ cast<XtensaConstantPoolConstant>(ACPV)->getBlockAddress();
+ MCSym = GetBlockAddressSymbol(BA);
+ } else if (ACPV->isJumpTable()) {
+ unsigned Idx = cast<XtensaConstantPoolJumpTable>(ACPV)->getIndex();
+ MCSym = this->GetJTISymbol(Idx, false);
+ } else {
+ assert(ACPV->isExtSymbol() && "unrecognized constant pool value");
+ XtensaConstantPoolSymbol *XtensaSym = cast<XtensaConstantPoolSymbol>(ACPV);
+ const char *SymName = XtensaSym->getSymbol();
+
+ if (XtensaSym->isPrivateLinkage()) {
+ const DataLayout &DL = getDataLayout();
+ MCSym = OutContext.getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) +
+ SymName);
+ } else {
+ MCSym = OutContext.getOrCreateSymbol(SymName);
+ }
+ }
- MCSym = GetExternalSymbolSymbol(StringRef(SymName));
MCSymbol *LblSym = GetCPISymbol(ACPV->getLabelId());
auto *TS =
static_cast<XtensaTargetStreamer *>(OutStreamer->getTargetStreamer());
@@ -71,7 +94,7 @@ void XtensaAsmPrinter::emitMachineConstantPoolValue(
std::string SymName(MCSym->getName());
StringRef Modifier = ACPV->getModifierText();
SymName += Modifier;
- MCSym = GetExternalSymbolSymbol(StringRef(SymName));
+ MCSym = OutContext.getOrCreateSymbol(SymName);
}
const MCExpr *Expr = MCSymbolRefExpr::create(MCSym, VK, OutContext);
@@ -140,6 +163,10 @@ XtensaAsmPrinter::GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
return GetCPISymbol(MO.getIndex());
}
+MCSymbol *XtensaAsmPrinter::GetJumpTableSymbol(const MachineOperand &MO) const {
+ return GetJTISymbol(MO.getIndex());
+}
+
MCOperand
XtensaAsmPrinter::LowerSymbolOperand(const MachineOperand &MO,
MachineOperand::MachineOperandType MOTy,
@@ -152,6 +179,20 @@ XtensaAsmPrinter::LowerSymbolOperand(const MachineOperand &MO,
Symbol = getSymbol(MO.getGlobal());
Offset += MO.getOffset();
break;
+ case MachineOperand::MO_MachineBasicBlock:
+ Symbol = MO.getMBB()->getSymbol();
+ break;
+ case MachineOperand::MO_BlockAddress:
+ Symbol = GetBlockAddressSymbol(MO.getBlockAddress());
+ Offset += MO.getOffset();
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ Symbol = GetExternalSymbolSymbol(MO.getSymbolName());
+ Offset += MO.getOffset();
+ break;
+ case MachineOperand::MO_JumpTableIndex:
+ Symbol = GetJumpTableSymbol(MO);
+ break;
case MachineOperand::MO_ConstantPoolIndex:
Symbol = GetConstantPoolIndexSymbol(MO);
Offset += MO.getOffset();
@@ -191,6 +232,10 @@ MCOperand XtensaAsmPrinter::lowerOperand(const MachineOperand &MO,
case MachineOperand::MO_RegisterMask:
break;
case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_MachineBasicBlock:
+ case MachineOperand::MO_BlockAddress:
+ case MachineOperand::MO_ExternalSymbol:
+ case MachineOperand::MO_JumpTableIndex:
case MachineOperand::MO_ConstantPoolIndex:
return LowerSymbolOperand(MO, MOTy, Offset);
default:
diff --git a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.h b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.h
index f3fec19724aa..f9cf5ae8c9f6 100644
--- a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.h
+++ b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.h
@@ -44,6 +44,8 @@ public:
MCSymbol *GetConstantPoolIndexSymbol(const MachineOperand &MO) const;
+ MCSymbol *GetJumpTableSymbol(const MachineOperand &MO) const;
+
MCOperand LowerSymbolOperand(const MachineOperand &MO,
MachineOperand::MachineOperandType MOTy,
unsigned Offset) const;
diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
index 6c3258b4bb46..650979301250 100644
--- a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
@@ -77,6 +77,18 @@ XtensaTargetLowering::XtensaTargetLowering(const TargetMachine &TM,
}
setOperationAction(ISD::ConstantPool, PtrVT, Custom);
+ setOperationAction(ISD::GlobalAddress, PtrVT, Custom);
+ setOperationAction(ISD::BlockAddress, PtrVT, Custom);
+ setOperationAction(ISD::JumpTable, PtrVT, Custom);
+
+ // Expand jump table branches as address arithmetic followed by an
+ // indirect jump.
+ setOperationAction(ISD::BR_JT, MVT::Other, Custom);
+
+ setOperationPromotedToType(ISD::BR_CC, MVT::i1, MVT::i32);
+ setOperationAction(ISD::BR_CC, MVT::i32, Legal);
+ setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f32, Expand);
// Implement custom stack allocations
setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
@@ -88,6 +100,12 @@ XtensaTargetLowering::XtensaTargetLowering(const TargetMachine &TM,
computeRegisterProperties(STI.getRegisterInfo());
}
+bool XtensaTargetLowering::isOffsetFoldingLegal(
+ const GlobalAddressSDNode *GA) const {
+ // The Xtensa target isn't yet aware of offsets.
+ return false;
+}
+
//===----------------------------------------------------------------------===//
// Calling conventions
//===----------------------------------------------------------------------===//
@@ -519,6 +537,72 @@ SDValue XtensaTargetLowering::LowerImmediate(SDValue Op,
return Op;
}
+SDValue XtensaTargetLowering::LowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ const GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
+ SDLoc DL(Op);
+ auto PtrVT = Op.getValueType();
+ const GlobalValue *GV = G->getGlobal();
+
+ SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
+ SDValue CPWrap = getAddrPCRel(CPAddr, DAG);
+
+ return CPWrap;
+}
+
+SDValue XtensaTargetLowering::LowerBlockAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ BlockAddressSDNode *Node = cast<BlockAddressSDNode>(Op);
+ const BlockAddress *BA = Node->getBlockAddress();
+ EVT PtrVT = Op.getValueType();
+
+ XtensaConstantPoolValue *CPV =
+ XtensaConstantPoolConstant::Create(BA, 0, XtensaCP::CPBlockAddress);
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
+ SDValue CPWrap = getAddrPCRel(CPAddr, DAG);
+
+ return CPWrap;
+}
+
+SDValue XtensaTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Table = Op.getOperand(1);
+ SDValue Index = Op.getOperand(2);
+ SDLoc DL(Op);
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
+ MachineFunction &MF = DAG.getMachineFunction();
+ const MachineJumpTableInfo *MJTI = MF.getJumpTableInfo();
+ SDValue TargetJT = DAG.getTargetJumpTable(JT->getIndex(), MVT::i32);
+ const DataLayout &TD = DAG.getDataLayout();
+ EVT PtrVT = Table.getValueType();
+ unsigned EntrySize = MJTI->getEntrySize(TD);
+
+ Index = DAG.getNode(ISD::MUL, DL, Index.getValueType(), Index,
+ DAG.getConstant(EntrySize, DL, Index.getValueType()));
+ SDValue Addr = DAG.getNode(ISD::ADD, DL, Index.getValueType(), Index, Table);
+ SDValue LD =
+ DAG.getLoad(PtrVT, DL, Chain, Addr,
+ MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
+
+ return DAG.getNode(XtensaISD::BR_JT, DL, MVT::Other, LD.getValue(1), LD,
+ TargetJT);
+}
+
+SDValue XtensaTargetLowering::LowerJumpTable(SDValue Op,
+ SelectionDAG &DAG) const {
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+ EVT PtrVT = Op.getValueType();
+
+ // Create a constant pool entry for the callee address
+ XtensaConstantPoolValue *CPV =
+ XtensaConstantPoolJumpTable::Create(*DAG.getContext(), JT->getIndex());
+
+ // Get the address of the callee into a register
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
+
+ return getAddrPCRel(CPAddr, DAG);
+}
+
SDValue XtensaTargetLowering::getAddrPCRel(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
@@ -580,8 +664,16 @@ SDValue XtensaTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue XtensaTargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
+ case ISD::BR_JT:
+ return LowerBR_JT(Op, DAG);
case ISD::Constant:
return LowerImmediate(Op, DAG);
+ case ISD::GlobalAddress:
+ return LowerGlobalAddress(Op, DAG);
+ case ISD::BlockAddress:
+ return LowerBlockAddress(Op, DAG);
+ case ISD::JumpTable:
+ return LowerJumpTable(Op, DAG);
case ISD::ConstantPool:
return LowerConstantPool(cast<ConstantPoolSDNode>(Op), DAG);
case ISD::STACKSAVE:
@@ -597,6 +689,8 @@ SDValue XtensaTargetLowering::LowerOperation(SDValue Op,
const char *XtensaTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch (Opcode) {
+ case XtensaISD::BR_JT:
+ return "XtensaISD::BR_JT";
case XtensaISD::CALL:
return "XtensaISD::CALL";
case XtensaISD::PCREL_WRAPPER:
diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.h b/llvm/lib/Target/Xtensa/XtensaISelLowering.h
index 6f6ec391430a..23a0217daaa9 100644
--- a/llvm/lib/Target/Xtensa/XtensaISelLowering.h
+++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.h
@@ -23,6 +23,7 @@ namespace llvm {
namespace XtensaISD {
enum {
FIRST_NUMBER = ISD::BUILTIN_OP_END,
+ BR_JT,
// Calls a function. Operand 0 is the chain operand and operand 1
// is the target address. The arguments start at operand 2.
@@ -43,6 +44,8 @@ public:
explicit XtensaTargetLowering(const TargetMachine &TM,
const XtensaSubtarget &STI);
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
const char *getTargetNodeName(unsigned Opcode) const override;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
@@ -71,8 +74,16 @@ public:
private:
const XtensaSubtarget &Subtarget;
+ SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerImmediate(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerConstantPool(ConstantPoolSDNode *CP, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td
index 6e9e75257ccf..f68d20dcdd54 100644
--- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td
@@ -416,6 +416,15 @@ def BBSI : RRI8_Inst<0x07, (outs),
let imm8 = target;
}
+def : Pat<(brcc SETGT, AR:$s, AR:$t, bb:$target),
+ (BLT AR:$t, AR:$s, bb:$target)>;
+def : Pat<(brcc SETUGT, AR:$s, AR:$t, bb:$target),
+ (BLTU AR:$t, AR:$s, bb:$target)>;
+def : Pat<(brcc SETLE, AR:$s, AR:$t, bb:$target),
+ (BGE AR:$t, AR:$s, bb:$target)>;
+def : Pat<(brcc SETULE, AR:$s, AR:$t, bb:$target),
+ (BGEU AR:$t, AR:$s, bb:$target)>;
+
//===----------------------------------------------------------------------===//
// Call and jump instructions
//===----------------------------------------------------------------------===//
@@ -471,6 +480,12 @@ def : Pat<(Xtensa_call (i32 texternalsym:$dst)),
def : Pat<(Xtensa_call AR:$dst),
(CALLX0 AR:$dst)>;
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1, Size = 3 in {
+ def BR_JT: Pseudo<(outs), (ins AR:$s, i32imm:$jt),
+ "!br_jt_p, $s, $jt",
+ [(Xtensa_brjt AR:$s, tjumptable:$jt)]>;
+}
+
//===----------------------------------------------------------------------===//
// Mem barrier instructions
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/Xtensa/XtensaOperators.td b/llvm/lib/Target/Xtensa/XtensaOperators.td
index cd4d831c85b5..88d3c9dfe7fd 100644
--- a/llvm/lib/Target/Xtensa/XtensaOperators.td
+++ b/llvm/lib/Target/Xtensa/XtensaOperators.td
@@ -17,6 +17,8 @@ def SDT_XtensaWrapPtr : SDTypeProfile<1, 1,
[SDTCisSameAs<0, 1>,
SDTCisPtrTy<0>]>;
+def SDT_XtensaBrJT : SDTypeProfile<0, 2,
+ [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
//===----------------------------------------------------------------------===//
// Node definitions
//===----------------------------------------------------------------------===//
@@ -34,3 +36,5 @@ def Xtensa_callseq_start: SDNode<"ISD::CALLSEQ_START", SDT_XtensaCallSeqStart,
def Xtensa_callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_XtensaCallSeqEnd,
[SDNPHasChain, SDNPSideEffect, SDNPOptInGlue,
SDNPOutGlue]>;
+
+def Xtensa_brjt: SDNode<"XtensaISD::BR_JT", SDT_XtensaBrJT, [SDNPHasChain]>;