summaryrefslogtreecommitdiff
path: root/llvm/lib/Target
diff options
context:
space:
mode:
author: Mingming Liu <mingmingl@google.com> 2025-09-10 15:25:31 -0700
committer: GitHub <noreply@github.com> 2025-09-10 15:25:31 -0700
commit: 1417dafa1db9cb1b2b09438aa9f53ea5ab6e36e2 (patch)
tree: 57f4b1f313c8cf74eed8819870f39c36ea263c68 /llvm/lib/Target
parent: 898b813bc8a6d0276bf0f4769f5f2f64b34e632d (diff)
parent: b8cefcb601ddaa18482555c4ff363c01a270c2fe (diff)
Merge branch 'main' into users/mingmingl-llvm/samplefdo-profile-format (branch: users/mingmingl-llvm/samplefdo-profile-format)
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp137
-rw-r--r--llvm/lib/Target/AArch64/AArch64BranchTargets.cpp46
-rw-r--r--llvm/lib/Target/AArch64/AArch64Combine.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp1
-rw-r--r--llvm/lib/Target/AArch64/AArch64Features.td31
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.cpp926
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.h70
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp28
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp617
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h14
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrFormats.td111
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.cpp112
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.td174
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h18
-rw-r--r--llvm/lib/Target/AArch64/AArch64MacroFusion.cpp42
-rw-r--r--llvm/lib/Target/AArch64/AArch64Processors.td119
-rw-r--r--llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp794
-rw-r--r--llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h111
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterInfo.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td13
-rw-r--r--llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td9
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.cpp6
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.h4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SystemOperands.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetMachine.cpp3
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp85
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h9
-rw-r--r--llvm/lib/Target/AArch64/CMakeLists.txt5
-rw-r--r--llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp466
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp39
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp15
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp18
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp7
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp14
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h2
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h5
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp14
-rw-r--r--llvm/lib/Target/AArch64/MachineSMEABIPass.cpp198
-rw-r--r--llvm/lib/Target/AArch64/SMEABIPass.cpp3
-rw-r--r--llvm/lib/Target/AArch64/SMEInstrFormats.td4
-rw-r--r--llvm/lib/Target/AArch64/SVEInstrFormats.td62
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.h14
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td46
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp27
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp74
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp35
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGISel.td6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp35
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp42
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp58
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructions.td6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp35
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp161
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp373
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.h25
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h28
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp27
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp36
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp305
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp52
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h20
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp31
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp57
-rw-r--r--llvm/lib/Target/AMDGPU/BUFInstructions.td299
-rw-r--r--llvm/lib/Target/AMDGPU/CMakeLists.txt7
-rw-r--r--llvm/lib/Target/AMDGPU/DSInstructions.td1093
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp96
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h41
-rw-r--r--llvm/lib/Target/AMDGPU/EvergreenInstructions.td3
-rw-r--r--llvm/lib/Target/AMDGPU/FLATInstructions.td710
-rw-r--r--llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp57
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.h16
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp126
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h10
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp56
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp48
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h23
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/MIMGInstructions.td152
-rw-r--r--llvm/lib/Target/AMDGPU/R600InstrInfo.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/R600Instructions.td9
-rw-r--r--llvm/lib/Target/AMDGPU/SIDefines.h33
-rw-r--r--llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/SIFoldOperands.cpp179
-rw-r--r--llvm/lib/Target/AMDGPU/SIFrameLowering.cpp24
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp677
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.h4
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp259
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.h55
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.td92
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstructions.td221
-rw-r--r--llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp72
-rw-r--r--llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp200
-rw-r--r--llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp28
-rw-r--r--llvm/lib/Target/AMDGPU/SIPostRABundler.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp71
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.h7
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.td252
-rw-r--r--llvm/lib/Target/AMDGPU/SMInstructions.td3
-rw-r--r--llvm/lib/Target/AMDGPU/SOPInstructions.td22
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp145
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h29
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h1
-rw-r--r--llvm/lib/Target/AMDGPU/VOP1Instructions.td6
-rw-r--r--llvm/lib/Target/AMDGPU/VOP2Instructions.td41
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3Instructions.td24
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3PInstructions.td14
-rw-r--r--llvm/lib/Target/AMDGPU/VOPCInstructions.td20
-rw-r--r--llvm/lib/Target/AMDGPU/VOPDInstructions.td6
-rw-r--r--llvm/lib/Target/AMDGPU/VOPInstructions.td10
-rw-r--r--llvm/lib/Target/ARC/ARCInstrFormats.td2
-rw-r--r--llvm/lib/Target/ARC/ARCInstrInfo.cpp2
-rw-r--r--llvm/lib/Target/ARC/ARCInstrInfo.td2
-rw-r--r--llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp8
-rw-r--r--llvm/lib/Target/ARM/ARMBlockPlacement.cpp2
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp170
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.h10
-rw-r--r--llvm/lib/Target/ARM/ARMInstrFormats.td2
-rw-r--r--llvm/lib/Target/ARM/ARMInstrInfo.td9
-rw-r--r--llvm/lib/Target/ARM/ARMInstrNEON.td4
-rw-r--r--llvm/lib/Target/ARM/ARMInstrThumb.td8
-rw-r--r--llvm/lib/Target/ARM/ARMInstrThumb2.td23
-rw-r--r--llvm/lib/Target/ARM/ARMLegalizerInfo.cpp2
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp9
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.h3
-rw-r--r--llvm/lib/Target/ARM/CMakeLists.txt3
-rw-r--r--llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp2131
-rw-r--r--llvm/lib/Target/ARM/MVETailPredication.cpp3
-rw-r--r--llvm/lib/Target/ARM/README.txt16
-rw-r--r--llvm/lib/Target/ARM/Thumb2InstrInfo.cpp7
-rw-r--r--llvm/lib/Target/AVR/AVRAsmPrinter.cpp4
-rw-r--r--llvm/lib/Target/AVR/AVRInstrFormats.td95
-rw-r--r--llvm/lib/Target/AVR/AVRInstrInfo.cpp4
-rw-r--r--llvm/lib/Target/AVR/AVRInstrInfo.h2
-rw-r--r--llvm/lib/Target/AVR/AVRInstrInfo.td13
-rw-r--r--llvm/lib/Target/AVR/AVRRegisterInfo.td40
-rw-r--r--llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp9
-rw-r--r--llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp248
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp17
-rw-r--r--llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp21
-rw-r--r--llvm/lib/Target/BPF/BPFInstrInfo.cpp5
-rw-r--r--llvm/lib/Target/BPF/BPFInstrInfo.h3
-rw-r--r--llvm/lib/Target/BPF/BPFInstrInfo.td10
-rw-r--r--llvm/lib/Target/BPF/BPFSubtarget.cpp2
-rw-r--r--llvm/lib/Target/BPF/BTFDebug.cpp2
-rw-r--r--llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp12
-rw-r--r--llvm/lib/Target/CSKY/CSKYInstrFormats.td28
-rw-r--r--llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td31
-rw-r--r--llvm/lib/Target/CSKY/CSKYInstrFormatsF1.td24
-rw-r--r--llvm/lib/Target/CSKY/CSKYInstrFormatsF2.td17
-rw-r--r--llvm/lib/Target/CSKY/CSKYInstrInfo.cpp5
-rw-r--r--llvm/lib/Target/CSKY/CSKYInstrInfo.h2
-rw-r--r--llvm/lib/Target/CSKY/CSKYInstrInfo.td83
-rw-r--r--llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td31
-rw-r--r--llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp138
-rw-r--r--llvm/lib/Target/DirectX/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/DirectX/DXContainerGlobals.cpp17
-rw-r--r--llvm/lib/Target/DirectX/DXILDataScalarization.cpp4
-rw-r--r--llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp4
-rw-r--r--llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp155
-rw-r--r--llvm/lib/Target/DirectX/DXILOpLowering.cpp15
-rw-r--r--llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp89
-rw-r--r--llvm/lib/Target/DirectX/DXILResourceAccess.cpp129
-rw-r--r--llvm/lib/Target/DirectX/DXILResourceImplicitBinding.cpp3
-rw-r--r--llvm/lib/Target/DirectX/DXILRootSignature.cpp87
-rw-r--r--llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp2
-rw-r--r--llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp2
-rw-r--r--llvm/lib/Target/DirectX/DirectXIRPasses/PointerTypeAnalysis.cpp27
-rw-r--r--llvm/lib/Target/DirectX/DirectXInstrInfo.cpp4
-rw-r--r--llvm/lib/Target/DirectX/DirectXInstrInfo.h4
-rw-r--r--llvm/lib/Target/DirectX/DirectXSubtarget.cpp3
-rw-r--r--llvm/lib/Target/DirectX/DirectXSubtarget.h2
-rw-r--r--llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp65
-rw-r--r--llvm/lib/Target/Hexagon/Hexagon.td7
-rw-r--r--llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td128
-rw-r--r--llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp7
-rw-r--r--llvm/lib/Target/Hexagon/HexagonInstrInfo.h2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp9
-rw-r--r--llvm/lib/Target/Hexagon/HexagonOperands.td10
-rw-r--r--llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp186
-rw-r--r--llvm/lib/Target/Lanai/LanaiInstrInfo.cpp5
-rw-r--r--llvm/lib/Target/Lanai/LanaiInstrInfo.h4
-rw-r--r--llvm/lib/Target/Lanai/LanaiInstrInfo.td3
-rw-r--r--llvm/lib/Target/Lanai/LanaiSubtarget.cpp4
-rw-r--r--llvm/lib/Target/Lanai/LanaiSubtarget.h2
-rw-r--r--llvm/lib/Target/LoongArch/LoongArch.td2
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchCallingConv.td4
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td14
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td2
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp13
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp590
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchISelLowering.h11
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp4
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchInstrInfo.h2
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchInstrInfo.td62
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td24
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td18
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp4
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp16
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h2
-rw-r--r--llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp80
-rw-r--r--llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp12
-rw-r--r--llvm/lib/Target/M68k/M68kInstrAtomics.td11
-rw-r--r--llvm/lib/Target/M68k/M68kInstrInfo.cpp2
-rw-r--r--llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp5
-rw-r--r--llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp11
-rw-r--r--llvm/lib/Target/MSP430/MSP430ISelLowering.cpp24
-rw-r--r--llvm/lib/Target/MSP430/MSP430InstrInfo.cpp7
-rw-r--r--llvm/lib/Target/MSP430/MSP430InstrInfo.h2
-rw-r--r--llvm/lib/Target/Mips/CMakeLists.txt3
-rw-r--r--llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp1274
-rw-r--r--llvm/lib/Target/Mips/Mips16ISelLowering.cpp72
-rw-r--r--llvm/lib/Target/Mips/Mips16InstrInfo.td5
-rw-r--r--llvm/lib/Target/Mips/MipsAsmPrinter.cpp3
-rw-r--r--llvm/lib/Target/Mips/MipsInstrInfo.cpp2
-rw-r--r--llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp3
-rw-r--r--llvm/lib/Target/NVPTX/NVPTX.td10
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp13
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp58
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h1
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp503
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.h15
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp6
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.h3
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.td20
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXIntrinsics.td45
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp2
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXSubtarget.h8
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp6
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXUtilities.cpp32
-rw-r--r--llvm/lib/Target/PowerPC/CMakeLists.txt2
-rw-r--r--llvm/lib/Target/PowerPC/PPC.h4
-rw-r--r--llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp12
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.cpp44
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.h5
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstr64Bit.td52
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrAltivec.td15
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrFormats.td948
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrFuture.td190
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td809
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.cpp47
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.h4
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.td98
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrMMA.td108
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrP10.td479
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrSPE.td54
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrVSX.td72
-rw-r--r--llvm/lib/Target/PowerPC/PPCSubtarget.cpp21
-rw-r--r--llvm/lib/Target/PowerPC/PPCSubtarget.h16
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetMachine.cpp4
-rw-r--r--llvm/lib/Target/PowerPC/PPCVSXCopy.cpp159
-rw-r--r--llvm/lib/Target/PowerPC/PPCVSXWACCCopy.cpp182
-rw-r--r--llvm/lib/Target/PowerPC/README_P9.txt18
-rw-r--r--llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp27
-rw-r--r--llvm/lib/Target/RISCV/CMakeLists.txt4
-rw-r--r--llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp58
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp4
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp22
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp6
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h3
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp7
-rw-r--r--llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp42
-rw-r--r--llvm/lib/Target/RISCV/RISCVFeatures.td20
-rw-r--r--llvm/lib/Target/RISCV/RISCVFrameLowering.cpp19
-rw-r--r--llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp5
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp137
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h3
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp424
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.h4
-rw-r--r--llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp2
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrFormatsC.td1
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.cpp29
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.h2
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.td6
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoC.td200
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoP.td310
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td216
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td155
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td23
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td34
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXwch.td4
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZb.td12
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td2
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZicfiss.td15
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td31
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZvqdotq.td100
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrPredicates.td8
-rw-r--r--llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp47
-rw-r--r--llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp26
-rw-r--r--llvm/lib/Target/RISCV/RISCVProcessors.td3
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp8
-rw-r--r--llvm/lib/Target/RISCV/RISCVSubtarget.h6
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetMachine.cpp3
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp16
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h6
-rw-r--r--llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp167
-rw-r--r--llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp24
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp4
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstrInfo.h3
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstrInfo.td4
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp22
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp2
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp2
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp6
-rw-r--r--llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp91
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp12
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h1
-rw-r--r--llvm/lib/Target/Sparc/Sparc.td3
-rw-r--r--llvm/lib/Target/Sparc/SparcISelLowering.cpp62
-rw-r--r--llvm/lib/Target/Sparc/SparcInstrInfo.cpp6
-rw-r--r--llvm/lib/Target/Sparc/SparcInstrInfo.h2
-rw-r--r--llvm/lib/Target/Sparc/SparcInstrInfo.td8
-rw-r--r--llvm/lib/Target/Sparc/SparcSubtarget.cpp13
-rw-r--r--llvm/lib/Target/Sparc/SparcSubtarget.h9
-rw-r--r--llvm/lib/Target/Sparc/SparcTargetMachine.cpp24
-rw-r--r--llvm/lib/Target/Sparc/SparcTargetMachine.h3
-rw-r--r--llvm/lib/Target/SystemZ/SystemZFeatures.td3
-rw-r--r--llvm/lib/Target/SystemZ/SystemZISelLowering.cpp24
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrFormats.td60
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp4
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrInfo.h4
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrVector.td119
-rw-r--r--llvm/lib/Target/SystemZ/SystemZOperators.td25
-rw-r--r--llvm/lib/Target/TargetLoweringObjectFile.cpp10
-rw-r--r--llvm/lib/Target/TargetMachine.cpp1
-rw-r--r--llvm/lib/Target/TargetMachineC.cpp3
-rw-r--r--llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp150
-rw-r--r--llvm/lib/Target/VE/VEInstrInfo.cpp4
-rw-r--r--llvm/lib/Target/VE/VEInstrInfo.h2
-rw-r--r--llvm/lib/Target/VE/VEInstrInfo.td26
-rw-r--r--llvm/lib/Target/VE/VEInstrVec.td88
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISD.def1
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp74
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td11
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp3
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp125
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h4
-rw-r--r--llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp53
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp11
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp7
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp3
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp12
-rw-r--r--llvm/lib/Target/X86/X86.td26
-rw-r--r--llvm/lib/Target/X86/X86AsmPrinter.cpp3
-rw-r--r--llvm/lib/Target/X86/X86ExpandPseudo.cpp7
-rw-r--r--llvm/lib/Target/X86/X86FastPreTileConfig.cpp13
-rw-r--r--llvm/lib/Target/X86/X86FrameLowering.cpp3
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp348
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.h6
-rw-r--r--llvm/lib/Target/X86/X86ISelLoweringCall.cpp4
-rw-r--r--llvm/lib/Target/X86/X86InstrAMX.td14
-rw-r--r--llvm/lib/Target/X86/X86InstrAVX10.td131
-rw-r--r--llvm/lib/Target/X86/X86InstrAVX512.td42
-rw-r--r--llvm/lib/Target/X86/X86InstrCompiler.td12
-rw-r--r--llvm/lib/Target/X86/X86InstrControl.td7
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.cpp50
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.h4
-rw-r--r--llvm/lib/Target/X86/X86InstrPredicates.td10
-rw-r--r--llvm/lib/Target/X86/X86InstrSSE.td12
-rw-r--r--llvm/lib/Target/X86/X86InterleavedAccess.cpp6
-rw-r--r--llvm/lib/Target/X86/X86RegisterInfo.cpp53
-rw-r--r--llvm/lib/Target/X86/X86RegisterInfo.h5
-rw-r--r--llvm/lib/Target/X86/X86RegisterInfo.td5
-rw-r--r--llvm/lib/Target/X86/X86ScheduleZnver3.td20
-rw-r--r--llvm/lib/Target/X86/X86ScheduleZnver4.td58
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.cpp20
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.h6
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.cpp6
-rw-r--r--llvm/lib/Target/X86/X86WinEHUnwindV2.cpp46
-rw-r--r--llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp325
-rw-r--r--llvm/lib/Target/XCore/XCoreInstrInfo.cpp8
-rw-r--r--llvm/lib/Target/XCore/XCoreInstrInfo.h3
-rw-r--r--llvm/lib/Target/XCore/XCoreSubtarget.cpp4
-rw-r--r--llvm/lib/Target/Xtensa/XtensaISelLowering.cpp16
-rw-r--r--llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp2
388 files changed, 15360 insertions, 11011 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index c52487ab8a79..c31a090bba77 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -307,6 +307,7 @@ private:
/// Emit instruction to set float register to zero.
void emitFMov0(const MachineInstr &MI);
+ void emitFMov0AsFMov(const MachineInstr &MI, Register DestReg);
using MInstToMCSymbol = std::map<const MachineInstr *, MCSymbol *>;
@@ -734,7 +735,7 @@ void AArch64AsmPrinter::emitHwasanMemaccessSymbols(Module &M) {
const Triple &TT = TM.getTargetTriple();
assert(TT.isOSBinFormatELF());
std::unique_ptr<MCSubtargetInfo> STI(
- TM.getTarget().createMCSubtargetInfo(TT.str(), "", ""));
+ TM.getTarget().createMCSubtargetInfo(TT, "", ""));
assert(STI && "Unable to create subtarget info");
this->STI = static_cast<const AArch64Subtarget *>(&*STI);
@@ -1829,45 +1830,77 @@ void AArch64AsmPrinter::emitMOVK(Register Dest, uint64_t Imm, unsigned Shift) {
void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) {
Register DestReg = MI.getOperand(0).getReg();
- if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround() &&
- STI->isNeonAvailable()) {
- // Convert H/S register to corresponding D register
- if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
- DestReg = AArch64::D0 + (DestReg - AArch64::H0);
- else if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31)
- DestReg = AArch64::D0 + (DestReg - AArch64::S0);
- else
- assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31);
+ if (!STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) {
+ if (STI->hasZeroCycleZeroingFPR64()) {
+ // Convert H/S register to corresponding D register
+ const AArch64RegisterInfo *TRI = STI->getRegisterInfo();
+ if (AArch64::FPR16RegClass.contains(DestReg))
+ DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR64RegClass);
+ else if (AArch64::FPR32RegClass.contains(DestReg))
+ DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
+ &AArch64::FPR64RegClass);
+ else
+ assert(AArch64::FPR64RegClass.contains(DestReg));
+
+ MCInst MOVI;
+ MOVI.setOpcode(AArch64::MOVID);
+ MOVI.addOperand(MCOperand::createReg(DestReg));
+ MOVI.addOperand(MCOperand::createImm(0));
+ EmitToStreamer(*OutStreamer, MOVI);
+ } else if (STI->hasZeroCycleZeroingFPR128()) {
+ // Convert H/S/D register to corresponding Q register
+ const AArch64RegisterInfo *TRI = STI->getRegisterInfo();
+ if (AArch64::FPR16RegClass.contains(DestReg)) {
+ DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR128RegClass);
+ } else if (AArch64::FPR32RegClass.contains(DestReg)) {
+ DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
+ &AArch64::FPR128RegClass);
+ } else {
+ assert(AArch64::FPR64RegClass.contains(DestReg));
+ DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::dsub,
+ &AArch64::FPR128RegClass);
+ }
- MCInst MOVI;
- MOVI.setOpcode(AArch64::MOVID);
- MOVI.addOperand(MCOperand::createReg(DestReg));
- MOVI.addOperand(MCOperand::createImm(0));
- EmitToStreamer(*OutStreamer, MOVI);
- } else {
- MCInst FMov;
- switch (MI.getOpcode()) {
- default: llvm_unreachable("Unexpected opcode");
- case AArch64::FMOVH0:
- FMov.setOpcode(STI->hasFullFP16() ? AArch64::FMOVWHr : AArch64::FMOVWSr);
- if (!STI->hasFullFP16())
- DestReg = (AArch64::S0 + (DestReg - AArch64::H0));
- FMov.addOperand(MCOperand::createReg(DestReg));
- FMov.addOperand(MCOperand::createReg(AArch64::WZR));
- break;
- case AArch64::FMOVS0:
- FMov.setOpcode(AArch64::FMOVWSr);
- FMov.addOperand(MCOperand::createReg(DestReg));
- FMov.addOperand(MCOperand::createReg(AArch64::WZR));
- break;
- case AArch64::FMOVD0:
- FMov.setOpcode(AArch64::FMOVXDr);
- FMov.addOperand(MCOperand::createReg(DestReg));
- FMov.addOperand(MCOperand::createReg(AArch64::XZR));
- break;
+ MCInst MOVI;
+ MOVI.setOpcode(AArch64::MOVIv2d_ns);
+ MOVI.addOperand(MCOperand::createReg(DestReg));
+ MOVI.addOperand(MCOperand::createImm(0));
+ EmitToStreamer(*OutStreamer, MOVI);
+ } else {
+ emitFMov0AsFMov(MI, DestReg);
}
- EmitToStreamer(*OutStreamer, FMov);
+ } else {
+ emitFMov0AsFMov(MI, DestReg);
+ }
+}
+
+void AArch64AsmPrinter::emitFMov0AsFMov(const MachineInstr &MI,
+ Register DestReg) {
+ MCInst FMov;
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode");
+ case AArch64::FMOVH0:
+ FMov.setOpcode(STI->hasFullFP16() ? AArch64::FMOVWHr : AArch64::FMOVWSr);
+ if (!STI->hasFullFP16())
+ DestReg = (AArch64::S0 + (DestReg - AArch64::H0));
+ FMov.addOperand(MCOperand::createReg(DestReg));
+ FMov.addOperand(MCOperand::createReg(AArch64::WZR));
+ break;
+ case AArch64::FMOVS0:
+ FMov.setOpcode(AArch64::FMOVWSr);
+ FMov.addOperand(MCOperand::createReg(DestReg));
+ FMov.addOperand(MCOperand::createReg(AArch64::WZR));
+ break;
+ case AArch64::FMOVD0:
+ FMov.setOpcode(AArch64::FMOVXDr);
+ FMov.addOperand(MCOperand::createReg(DestReg));
+ FMov.addOperand(MCOperand::createReg(AArch64::XZR));
+ break;
}
+ EmitToStreamer(*OutStreamer, FMov);
}
Register AArch64AsmPrinter::emitPtrauthDiscriminator(uint16_t Disc,
@@ -2229,13 +2262,24 @@ void AArch64AsmPrinter::emitPtrauthBranch(const MachineInstr *MI) {
if (BrTarget == AddrDisc)
report_fatal_error("Branch target is signed with its own value");
- // If we are printing BLRA pseudo instruction, then x16 and x17 are
- // implicit-def'ed by the MI and AddrDisc is not used as any other input, so
- // try to save one MOV by setting MayUseAddrAsScratch.
+ // If we are printing BLRA pseudo, try to save one MOV by making use of the
+ // fact that x16 and x17 are described as clobbered by the MI instruction and
+ // AddrDisc is not used as any other input.
+ //
+ // Back in the day, emitPtrauthDiscriminator was restricted to only returning
+ // either x16 or x17, meaning the returned register is always among the
+ // implicit-def'ed registers of BLRA pseudo. Now this property can be violated
+ // if isX16X17Safer predicate is false, thus manually check if AddrDisc is
+ // among x16 and x17 to prevent clobbering unexpected registers.
+ //
// Unlike BLRA, BRA pseudo is used to perform computed goto, and thus not
// declared as clobbering x16/x17.
+ //
+ // FIXME: Make use of `killed` flags and register masks instead.
+ bool AddrDiscIsImplicitDef =
+ IsCall && (AddrDisc == AArch64::X16 || AddrDisc == AArch64::X17);
Register DiscReg = emitPtrauthDiscriminator(Disc, AddrDisc, AArch64::X17,
- /*MayUseAddrAsScratch=*/IsCall);
+ AddrDiscIsImplicitDef);
bool IsZeroDisc = DiscReg == AArch64::XZR;
unsigned Opc;
@@ -2862,7 +2906,7 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
MCInst TmpInst;
TmpInst.setOpcode(AArch64::MOVIv16b_ns);
TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
- TmpInst.addOperand(MCOperand::createImm(MI->getOperand(1).getImm()));
+ TmpInst.addOperand(MCOperand::createImm(0));
EmitToStreamer(*OutStreamer, TmpInst);
return;
}
@@ -2968,8 +3012,15 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
// See the comments in emitPtrauthBranch.
if (Callee == AddrDisc)
report_fatal_error("Call target is signed with its own value");
+
+ // After isX16X17Safer predicate was introduced, emitPtrauthDiscriminator is
+ // no longer restricted to only reusing AddrDisc when it is X16 or X17
+ // (which are implicit-def'ed by AUTH_TCRETURN pseudos), thus impose this
+ // restriction manually not to clobber an unexpected register.
+ bool AddrDiscIsImplicitDef =
+ AddrDisc == AArch64::X16 || AddrDisc == AArch64::X17;
Register DiscReg = emitPtrauthDiscriminator(Disc, AddrDisc, ScratchReg,
- /*MayUseAddrAsScratch=*/true);
+ AddrDiscIsImplicitDef);
const bool IsZero = DiscReg == AArch64::XZR;
const unsigned Opcodes[2][2] = {{AArch64::BRAA, AArch64::BRAAZ},
diff --git a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
index 3436dc9ef452..137ff898e86a 100644
--- a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
+++ b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
@@ -30,6 +30,14 @@ using namespace llvm;
#define AARCH64_BRANCH_TARGETS_NAME "AArch64 Branch Targets"
namespace {
+// BTI HINT encoding: base (32) plus 'c' (2) and/or 'j' (4).
+enum : unsigned {
+ BTIBase = 32, // Base immediate for BTI HINT
+ BTIC = 1u << 1, // 2
+ BTIJ = 1u << 2, // 4
+ BTIMask = BTIC | BTIJ,
+};
+
class AArch64BranchTargets : public MachineFunctionPass {
public:
static char ID;
@@ -42,6 +50,7 @@ private:
void addBTI(MachineBasicBlock &MBB, bool CouldCall, bool CouldJump,
bool NeedsWinCFI);
};
+
} // end anonymous namespace
char AArch64BranchTargets::ID = 0;
@@ -62,9 +71,8 @@ bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) {
if (!MF.getInfo<AArch64FunctionInfo>()->branchTargetEnforcement())
return false;
- LLVM_DEBUG(
- dbgs() << "********** AArch64 Branch Targets **********\n"
- << "********** Function: " << MF.getName() << '\n');
+ LLVM_DEBUG(dbgs() << "********** AArch64 Branch Targets **********\n"
+ << "********** Function: " << MF.getName() << '\n');
const Function &F = MF.getFunction();
// LLVM does not consider basic blocks which are the targets of jump tables
@@ -103,6 +111,12 @@ bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) {
JumpTableTargets.count(&MBB))
CouldJump = true;
+ if (MBB.isEHPad()) {
+ if (HasWinCFI && (MBB.isEHFuncletEntry() || MBB.isCleanupFuncletEntry()))
+ CouldCall = true;
+ else
+ CouldJump = true;
+ }
if (CouldCall || CouldJump) {
addBTI(MBB, CouldCall, CouldJump, HasWinCFI);
MadeChange = true;
@@ -130,7 +144,12 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall,
auto MBBI = MBB.begin();
- // Skip the meta instructions, those will be removed anyway.
+ // If the block starts with EH_LABEL(s), skip them first.
+ while (MBBI != MBB.end() && MBBI->isEHLabel()) {
+ ++MBBI;
+ }
+
+ // Skip meta/CFI/etc. (and EMITBKEY) to reach the first executable insn.
for (; MBBI != MBB.end() &&
(MBBI->isMetaInstruction() || MBBI->getOpcode() == AArch64::EMITBKEY);
++MBBI)
@@ -138,16 +157,21 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall,
// SCTLR_EL1.BT[01] is set to 0 by default which means
// PACI[AB]SP are implicitly BTI C so no BTI C instruction is needed there.
- if (MBBI != MBB.end() && HintNum == 34 &&
+ if (MBBI != MBB.end() && ((HintNum & BTIMask) == BTIC) &&
(MBBI->getOpcode() == AArch64::PACIASP ||
MBBI->getOpcode() == AArch64::PACIBSP))
return;
- if (HasWinCFI && MBBI->getFlag(MachineInstr::FrameSetup)) {
- BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
- TII->get(AArch64::SEH_Nop));
+ // Insert BTI exactly at the first executable instruction.
+ const DebugLoc DL = MBB.findDebugLoc(MBBI);
+ MachineInstr *BTI = BuildMI(MBB, MBBI, DL, TII->get(AArch64::HINT))
+ .addImm(HintNum)
+ .getInstr();
+
+ // WinEH: put .seh_nop after BTI when the first real insn is FrameSetup.
+ if (HasWinCFI && MBBI != MBB.end() &&
+ MBBI->getFlag(MachineInstr::FrameSetup)) {
+ auto AfterBTI = std::next(MachineBasicBlock::iterator(BTI));
+ BuildMI(MBB, AfterBTI, DL, TII->get(AArch64::SEH_Nop));
}
- BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
- TII->get(AArch64::HINT))
- .addImm(HintNum);
}
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 5f499e5e9700..076a6235eef0 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -369,5 +369,5 @@ def AArch64PostLegalizerCombiner
commute_constant_to_rhs, extract_vec_elt_combines,
push_freeze_to_prevent_poison_from_propagating,
combine_mul_cmlt, combine_use_vector_truncate,
- extmultomull, truncsat_combines]> {
+ extmultomull, truncsat_combines, lshr_of_trunc_of_lshr]> {
}
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 57dcd68595ff..79655e1c9529 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1688,6 +1688,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
}
case AArch64::InOutZAUsePseudo:
case AArch64::RequiresZASavePseudo:
+ case AArch64::SMEStateAllocPseudo:
case AArch64::COALESCER_BARRIER_FPR16:
case AArch64::COALESCER_BARRIER_FPR32:
case AArch64::COALESCER_BARRIER_FPR64:
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index c1c1f0a1024d..46f5f0c1ca9d 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -621,25 +621,30 @@ def FeatureZCRegMoveGPR64 : SubtargetFeature<"zcm-gpr64", "HasZeroCycleRegMoveGP
def FeatureZCRegMoveGPR32 : SubtargetFeature<"zcm-gpr32", "HasZeroCycleRegMoveGPR32", "true",
"Has zero-cycle register moves for GPR32 registers">;
+def FeatureZCRegMoveFPR128 : SubtargetFeature<"zcm-fpr128", "HasZeroCycleRegMoveFPR128", "true",
+ "Has zero-cycle register moves for FPR128 registers">;
+
def FeatureZCRegMoveFPR64 : SubtargetFeature<"zcm-fpr64", "HasZeroCycleRegMoveFPR64", "true",
"Has zero-cycle register moves for FPR64 registers">;
def FeatureZCRegMoveFPR32 : SubtargetFeature<"zcm-fpr32", "HasZeroCycleRegMoveFPR32", "true",
"Has zero-cycle register moves for FPR32 registers">;
-def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
- "Has zero-cycle zeroing instructions for generic registers">;
+def FeatureZCZeroingGPR64 : SubtargetFeature<"zcz-gpr64", "HasZeroCycleZeroingGPR64", "true",
+ "Has zero-cycle zeroing instructions for GPR64 registers">;
+
+def FeatureZCZeroingGPR32 : SubtargetFeature<"zcz-gpr32", "HasZeroCycleZeroingGPR32", "true",
+ "Has zero-cycle zeroing instructions for GPR32 registers">;
+
+def FeatureZCZeroingFPR128 : SubtargetFeature<"zcz-fpr128", "HasZeroCycleZeroingFPR128", "true",
+ "Has zero-cycle zeroing instructions for FPR128 registers">;
// It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0".
// as movi is more efficient across all cores. Newer cores can eliminate
// fmovs early and there is no difference with movi, but this not true for
// all implementations.
-def FeatureNoZCZeroingFP : SubtargetFeature<"no-zcz-fp", "HasZeroCycleZeroingFP", "false",
- "Has no zero-cycle zeroing instructions for FP registers">;
-
-def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
- "Has zero-cycle zeroing instructions",
- [FeatureZCZeroingGP]>;
+def FeatureNoZCZeroingFPR64 : SubtargetFeature<"no-zcz-fpr64", "HasZeroCycleZeroingFPR64", "false",
+ "Has no zero-cycle zeroing instructions for FPR64 registers">;
/// ... but the floating-point version doesn't quite work in rare cases on older
/// CPUs.
@@ -730,9 +735,13 @@ def FeatureFuseArithmeticLogic : SubtargetFeature<
"fuse-arith-logic", "HasFuseArithmeticLogic", "true",
"CPU fuses arithmetic and logic operations">;
-def FeatureFuseCCSelect : SubtargetFeature<
- "fuse-csel", "HasFuseCCSelect", "true",
- "CPU fuses conditional select operations">;
+def FeatureFuseCmpCSel : SubtargetFeature<
+ "fuse-csel", "HasFuseCmpCSel", "true",
+ "CPU can fuse CMP and CSEL operations">;
+
+def FeatureFuseCmpCSet : SubtargetFeature<
+ "fuse-cset", "HasFuseCmpCSet", "true",
+ "CPU can fuse CMP and CSET operations">;
def FeatureFuseCryptoEOR : SubtargetFeature<
"fuse-crypto-eor", "HasFuseCryptoEOR", "true",
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 7725fa4f1ccb..175b5e04d82f 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -211,6 +211,7 @@
#include "AArch64FrameLowering.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
+#include "AArch64PrologueEpilogue.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
@@ -218,7 +219,6 @@
#include "Utils/AArch64SMEAttributes.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/CFIInstBuilder.h"
#include "llvm/CodeGen/LivePhysRegs.h"
@@ -293,8 +293,6 @@ static cl::opt<bool> DisableMultiVectorSpillFill(
cl::desc("Disable use of LD/ST pairs for SME2 or SVE2p1"), cl::init(false),
cl::Hidden);
-STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
-
/// Returns how much of the incoming argument stack area (in bytes) we should
/// clean up in an epilogue. For the C calling convention this will be 0, for
/// guaranteed tail call conventions it can be positive (a normal return or a
@@ -328,23 +326,20 @@ static int64_t getArgumentStackToRestore(MachineFunction &MF,
return ArgumentPopSize;
}
-static bool produceCompactUnwindFrame(MachineFunction &MF);
-static bool needsWinCFI(const MachineFunction &MF);
-static StackOffset getSVEStackSize(const MachineFunction &MF);
-static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
- bool HasCall = false);
-static bool requiresSaveVG(const MachineFunction &MF);
+static bool produceCompactUnwindFrame(const AArch64FrameLowering &,
+ MachineFunction &MF);
// Conservatively, returns true if the function is likely to have an SVE vectors
// on the stack. This function is safe to be called before callee-saves or
// object offsets have been determined.
-static bool isLikelyToHaveSVEStack(const MachineFunction &MF) {
+static bool isLikelyToHaveSVEStack(const AArch64FrameLowering &AFL,
+ const MachineFunction &MF) {
auto *AFI = MF.getInfo<AArch64FunctionInfo>();
if (AFI->isSVECC())
return true;
if (AFI->hasCalculatedStackSizeSVE())
- return bool(getSVEStackSize(MF));
+ return bool(AFL.getSVEStackSize(MF));
const MachineFrameInfo &MFI = MF.getFrameInfo();
for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd(); FI++) {
@@ -372,7 +367,7 @@ bool AArch64FrameLowering::homogeneousPrologEpilog(
return false;
// TODO: SVE is not supported yet.
- if (isLikelyToHaveSVEStack(MF))
+ if (isLikelyToHaveSVEStack(*this, MF))
return false;
// Bail on stack adjustment needed on return for simplicity.
@@ -409,7 +404,7 @@ bool AArch64FrameLowering::homogeneousPrologEpilog(
/// Returns true if CSRs should be paired.
bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const {
- return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF);
+ return produceCompactUnwindFrame(*this, MF) || homogeneousPrologEpilog(MF);
}
/// This is the biggest offset to the stack pointer we can encode in aarch64
@@ -451,11 +446,10 @@ AArch64FrameLowering::getStackIDForScalableVectors() const {
return TargetStackID::ScalableVector;
}
-/// Returns the size of the fixed object area (allocated next to sp on entry)
-/// On Win64 this may include a var args area and an UnwindHelp object for EH.
-static unsigned getFixedObjectSize(const MachineFunction &MF,
- const AArch64FunctionInfo *AFI, bool IsWin64,
- bool IsFunclet) {
+unsigned
+AArch64FrameLowering::getFixedObjectSize(const MachineFunction &MF,
+ const AArch64FunctionInfo *AFI,
+ bool IsWin64, bool IsFunclet) const {
assert(AFI->getTailCallReservedStack() % 16 == 0 &&
"Tail call reserved stack must be aligned to 16 bytes");
if (!IsWin64 || IsFunclet) {
@@ -494,7 +488,8 @@ static unsigned getFixedObjectSize(const MachineFunction &MF,
}
/// Returns the size of the entire SVE stackframe (calleesaves + spills).
-static StackOffset getSVEStackSize(const MachineFunction &MF) {
+StackOffset
+AArch64FrameLowering::getSVEStackSize(const MachineFunction &MF) const {
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE());
}
@@ -683,70 +678,6 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
return MBB.erase(I);
}
-void AArch64FrameLowering::emitCalleeSavedGPRLocations(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
- MachineFunction &MF = *MBB.getParent();
- MachineFrameInfo &MFI = MF.getFrameInfo();
-
- const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
- if (CSI.empty())
- return;
-
- CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
- for (const auto &Info : CSI) {
- unsigned FrameIdx = Info.getFrameIdx();
- if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector)
- continue;
-
- assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
- int64_t Offset = MFI.getObjectOffset(FrameIdx) - getOffsetOfLocalArea();
- CFIBuilder.buildOffset(Info.getReg(), Offset);
- }
-}
-
-void AArch64FrameLowering::emitCalleeSavedSVELocations(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
- MachineFunction &MF = *MBB.getParent();
- MachineFrameInfo &MFI = MF.getFrameInfo();
-
- // Add callee saved registers to move list.
- const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
- if (CSI.empty())
- return;
-
- const TargetSubtargetInfo &STI = MF.getSubtarget();
- const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
- AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
- CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
-
- std::optional<int64_t> IncomingVGOffsetFromDefCFA;
- if (requiresSaveVG(MF)) {
- auto IncomingVG = *find_if(
- reverse(CSI), [](auto &Info) { return Info.getReg() == AArch64::VG; });
- IncomingVGOffsetFromDefCFA =
- MFI.getObjectOffset(IncomingVG.getFrameIdx()) - getOffsetOfLocalArea();
- }
-
- for (const auto &Info : CSI) {
- if (MFI.getStackID(Info.getFrameIdx()) != TargetStackID::ScalableVector)
- continue;
-
- // Not all unwinders may know about SVE registers, so assume the lowest
- // common denominator.
- assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
- MCRegister Reg = Info.getReg();
- if (!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
- continue;
-
- StackOffset Offset =
- StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
- StackOffset::getFixed(AFI.getCalleeSavedStackSize(MFI));
-
- CFIBuilder.insertCFIInst(
- createCFAOffset(TRI, Reg, Offset, IncomingVGOffsetFromDefCFA));
- }
-}
-
void AArch64FrameLowering::resetCFIToInitialState(
MachineBasicBlock &MBB) const {
@@ -1088,8 +1019,8 @@ void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero,
}
}
-static bool windowsRequiresStackProbe(const MachineFunction &MF,
- uint64_t StackSizeInBytes) {
+bool AArch64FrameLowering::windowsRequiresStackProbe(
+ const MachineFunction &MF, uint64_t StackSizeInBytes) const {
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>();
// TODO: When implementing stack protectors, take that into account
@@ -1108,19 +1039,9 @@ static void getLiveRegsForEntryMBB(LivePhysRegs &LiveRegs,
LiveRegs.addReg(CSRegs[i]);
}
-// Find a scratch register that we can use at the start of the prologue to
-// re-align the stack pointer. We avoid using callee-save registers since they
-// may appear to be free when this is called from canUseAsPrologue (during
-// shrink wrapping), but then no longer be free when this is called from
-// emitPrologue.
-//
-// FIXME: This is a bit conservative, since in the above case we could use one
-// of the callee-save registers as a scratch temp to re-align the stack pointer,
-// but we would then have to make sure that we were in fact saving at least one
-// callee-save register in the prologue, which is additional complexity that
-// doesn't seem worth the benefit.
-static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
- bool HasCall) {
+Register
+AArch64FrameLowering::findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
+ bool HasCall) const {
MachineFunction *MF = MBB->getParent();
// If MBB is an entry block, use X9 as the scratch register
@@ -1193,13 +1114,14 @@ bool AArch64FrameLowering::canUseAsPrologue(
return true;
}
-static bool needsWinCFI(const MachineFunction &MF) {
+bool AArch64FrameLowering::needsWinCFI(const MachineFunction &MF) const {
const Function &F = MF.getFunction();
return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
F.needsUnwindTableEntry();
}
-static bool shouldSignReturnAddressEverywhere(const MachineFunction &MF) {
+bool AArch64FrameLowering::shouldSignReturnAddressEverywhere(
+ const MachineFunction &MF) const {
// FIXME: With WinCFI, extra care should be taken to place SEH_PACSignLR
// and SEH_EpilogEnd instructions in the correct order.
if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
@@ -1475,13 +1397,13 @@ static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
}
-bool requiresGetVGCall(MachineFunction &MF) {
- AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+bool AArch64FrameLowering::requiresGetVGCall(const MachineFunction &MF) const {
+ auto *AFI = MF.getInfo<AArch64FunctionInfo>();
return AFI->hasStreamingModeChanges() &&
!MF.getSubtarget<AArch64Subtarget>().hasSVE();
}
-static bool requiresSaveVG(const MachineFunction &MF) {
+bool AArch64FrameLowering::requiresSaveVG(const MachineFunction &MF) const {
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
if (!AFI->needsDwarfUnwindInfo(MF) || !AFI->hasStreamingModeChanges())
return false;
@@ -1499,8 +1421,8 @@ static bool matchLibcall(const TargetLowering &TLI, const MachineOperand &MO,
StringRef(TLI.getLibcallName(LC)) == MO.getSymbolName();
}
-bool isVGInstruction(MachineBasicBlock::iterator MBBI,
- const TargetLowering &TLI) {
+bool AArch64FrameLowering::isVGInstruction(MachineBasicBlock::iterator MBBI,
+ const TargetLowering &TLI) const {
unsigned Opc = MBBI->getOpcode();
if (Opc == AArch64::CNTD_XPiI)
return true;
@@ -1514,15 +1436,12 @@ bool isVGInstruction(MachineBasicBlock::iterator MBBI,
return Opc == TargetOpcode::COPY;
}
-// Convert callee-save register save/restore instruction to do stack pointer
-// decrement/increment to allocate/deallocate the callee-save stack area by
-// converting store/load to use pre/post increment version.
-static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
+MachineBasicBlock::iterator
+AArch64FrameLowering::convertCalleeSaveRestoreToSPPrePostIncDec(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI,
- MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup,
- int CFAOffset = 0) {
+ MachineInstr::MIFlag FrameFlag, int CFAOffset) const {
unsigned NewOpc;
// If the function contains streaming mode changes, we expect instructions
@@ -1643,12 +1562,9 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
return std::prev(MBB.erase(MBBI));
}
-// Fixup callee-save register save/restore instructions to take into account
-// combined SP bump by adding the local stack size to the stack offsets.
-static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
- uint64_t LocalStackSize,
- bool NeedsWinCFI,
- bool *HasWinCFI) {
+void AArch64FrameLowering::fixupCalleeSaveRestoreStackOffset(
+ MachineInstr &MI, uint64_t LocalStackSize, bool NeedsWinCFI,
+ bool *HasWinCFI) const {
if (AArch64InstrInfo::isSEHInstruction(MI))
return;
@@ -1703,7 +1619,8 @@ static unsigned getStackHazardSize(const MachineFunction &MF) {
}
// Convenience function to determine whether I is an SVE callee save.
-static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
+bool AArch64FrameLowering::isSVECalleeSave(
+ MachineBasicBlock::iterator I) const {
switch (I->getOpcode()) {
default:
return false;
@@ -1725,42 +1642,6 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
}
}
-static void emitShadowCallStackPrologue(const TargetInstrInfo &TII,
- MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL, bool NeedsWinCFI,
- bool NeedsUnwindInfo) {
- // Shadow call stack prolog: str x30, [x18], #8
- BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXpost))
- .addReg(AArch64::X18, RegState::Define)
- .addReg(AArch64::LR)
- .addReg(AArch64::X18)
- .addImm(8)
- .setMIFlag(MachineInstr::FrameSetup);
-
- // This instruction also makes x18 live-in to the entry block.
- MBB.addLiveIn(AArch64::X18);
-
- if (NeedsWinCFI)
- BuildMI(MBB, MBBI, DL, TII.get(AArch64::SEH_Nop))
- .setMIFlag(MachineInstr::FrameSetup);
-
- if (NeedsUnwindInfo) {
- // Emit a CFI instruction that causes 8 to be subtracted from the value of
- // x18 when unwinding past this frame.
- static const char CFIInst[] = {
- dwarf::DW_CFA_val_expression,
- 18, // register
- 2, // length
- static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
- static_cast<char>(-8) & 0x7f, // addend (sleb128)
- };
- CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup)
- .buildEscape(StringRef(CFIInst, sizeof(CFIInst)));
- }
-}
-
static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII,
MachineFunction &MF,
MachineBasicBlock &MBB,
@@ -1783,36 +1664,6 @@ static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII,
.buildRestore(AArch64::X18);
}
-// Define the current CFA rule to use the provided FP.
-static void emitDefineCFAWithFP(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned FixedObject) {
- const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
- const AArch64RegisterInfo *TRI = STI.getRegisterInfo();
- AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
-
- const int OffsetToFirstCalleeSaveFromFP =
- AFI->getCalleeSaveBaseToFrameRecordOffset() -
- AFI->getCalleeSavedStackSize();
- Register FramePtr = TRI->getFrameRegister(MF);
- CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup)
- .buildDefCFA(FramePtr, FixedObject - OffsetToFirstCalleeSaveFromFP);
-}
-
-#ifndef NDEBUG
-/// Collect live registers from the end of \p MI's parent up to (including) \p
-/// MI in \p LiveRegs.
-static void getLivePhysRegsUpTo(MachineInstr &MI, const TargetRegisterInfo &TRI,
- LivePhysRegs &LiveRegs) {
-
- MachineBasicBlock &MBB = *MI.getParent();
- LiveRegs.addLiveOuts(MBB);
- for (const MachineInstr &MI :
- reverse(make_range(MI.getIterator(), MBB.instr_end())))
- LiveRegs.stepBackward(MI);
-}
-#endif
-
void AArch64FrameLowering::emitPacRetPlusLeafHardening(
MachineFunction &MF) const {
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
@@ -1848,616 +1699,8 @@ void AArch64FrameLowering::emitPacRetPlusLeafHardening(
void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- MachineBasicBlock::iterator MBBI = MBB.begin();
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- const Function &F = MF.getFunction();
- const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
- const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
-
- AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- bool EmitCFI = AFI->needsDwarfUnwindInfo(MF);
- bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
- bool HasFP = hasFP(MF);
- bool NeedsWinCFI = needsWinCFI(MF);
- bool HasWinCFI = false;
- auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); });
-
- MachineBasicBlock::iterator End = MBB.end();
-#ifndef NDEBUG
- const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
- // Collect live register from the end of MBB up to the start of the existing
- // frame setup instructions.
- MachineBasicBlock::iterator NonFrameStart = MBB.begin();
- while (NonFrameStart != End &&
- NonFrameStart->getFlag(MachineInstr::FrameSetup))
- ++NonFrameStart;
-
- LivePhysRegs LiveRegs(*TRI);
- if (NonFrameStart != MBB.end()) {
- getLivePhysRegsUpTo(*NonFrameStart, *TRI, LiveRegs);
- // Ignore registers used for stack management for now.
- LiveRegs.removeReg(AArch64::SP);
- LiveRegs.removeReg(AArch64::X19);
- LiveRegs.removeReg(AArch64::FP);
- LiveRegs.removeReg(AArch64::LR);
-
- // X0 will be clobbered by a call to __arm_get_current_vg in the prologue.
- // This is necessary to spill VG if required where SVE is unavailable, but
- // X0 is preserved around this call.
- if (requiresGetVGCall(MF))
- LiveRegs.removeReg(AArch64::X0);
- }
-
- auto VerifyClobberOnExit = make_scope_exit([&]() {
- if (NonFrameStart == MBB.end())
- return;
- // Check if any of the newly instructions clobber any of the live registers.
- for (MachineInstr &MI :
- make_range(MBB.instr_begin(), NonFrameStart->getIterator())) {
- for (auto &Op : MI.operands())
- if (Op.isReg() && Op.isDef())
- assert(!LiveRegs.contains(Op.getReg()) &&
- "live register clobbered by inserted prologue instructions");
- }
- });
-#endif
-
- bool IsFunclet = MBB.isEHFuncletEntry();
-
- // At this point, we're going to decide whether or not the function uses a
- // redzone. In most cases, the function doesn't have a redzone so let's
- // assume that's false and set it to true in the case that there's a redzone.
- AFI->setHasRedZone(false);
-
- // Debug location must be unknown since the first debug location is used
- // to determine the end of the prologue.
- DebugLoc DL;
-
- const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
- if (MFnI.shouldSignReturnAddress(MF)) {
- // If pac-ret+leaf is in effect, PAUTH_PROLOGUE pseudo instructions
- // are inserted by emitPacRetPlusLeafHardening().
- if (!shouldSignReturnAddressEverywhere(MF)) {
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::PAUTH_PROLOGUE))
- .setMIFlag(MachineInstr::FrameSetup);
- }
- // AArch64PointerAuth pass will insert SEH_PACSignLR
- HasWinCFI |= NeedsWinCFI;
- }
-
- if (MFnI.needsShadowCallStackPrologueEpilogue(MF)) {
- emitShadowCallStackPrologue(*TII, MF, MBB, MBBI, DL, NeedsWinCFI,
- MFnI.needsDwarfUnwindInfo(MF));
- HasWinCFI |= NeedsWinCFI;
- }
-
- if (EmitCFI && MFnI.isMTETagged()) {
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITMTETAGGED))
- .setMIFlag(MachineInstr::FrameSetup);
- }
-
- // We signal the presence of a Swift extended frame to external tools by
- // storing FP with 0b0001 in bits 63:60. In normal userland operation a simple
- // ORR is sufficient, it is assumed a Swift kernel would initialize the TBI
- // bits so that is still true.
- if (HasFP && AFI->hasSwiftAsyncContext()) {
- switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
- case SwiftAsyncFramePointerMode::DeploymentBased:
- if (Subtarget.swiftAsyncContextIsDynamicallySet()) {
- // The special symbol below is absolute and has a *value* that can be
- // combined with the frame pointer to signal an extended frame.
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::LOADgot), AArch64::X16)
- .addExternalSymbol("swift_async_extendedFramePointerFlags",
- AArch64II::MO_GOT);
- if (NeedsWinCFI) {
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
- .setMIFlags(MachineInstr::FrameSetup);
- HasWinCFI = true;
- }
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrs), AArch64::FP)
- .addUse(AArch64::FP)
- .addUse(AArch64::X16)
- .addImm(Subtarget.isTargetILP32() ? 32 : 0);
- if (NeedsWinCFI) {
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
- .setMIFlags(MachineInstr::FrameSetup);
- HasWinCFI = true;
- }
- break;
- }
- [[fallthrough]];
-
- case SwiftAsyncFramePointerMode::Always:
- // ORR x29, x29, #0x1000_0000_0000_0000
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP)
- .addUse(AArch64::FP)
- .addImm(0x1100)
- .setMIFlag(MachineInstr::FrameSetup);
- if (NeedsWinCFI) {
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
- .setMIFlags(MachineInstr::FrameSetup);
- HasWinCFI = true;
- }
- break;
-
- case SwiftAsyncFramePointerMode::Never:
- break;
- }
- }
-
- // All calls are tail calls in GHC calling conv, and functions have no
- // prologue/epilogue.
- if (MF.getFunction().getCallingConv() == CallingConv::GHC)
- return;
-
- // Set tagged base pointer to the requested stack slot.
- // Ideally it should match SP value after prologue.
- std::optional<int> TBPI = AFI->getTaggedBasePointerIndex();
- if (TBPI)
- AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
- else
- AFI->setTaggedBasePointerOffset(MFI.getStackSize());
-
- const StackOffset &SVEStackSize = getSVEStackSize(MF);
-
- // getStackSize() includes all the locals in its size calculation. We don't
- // include these locals when computing the stack size of a funclet, as they
- // are allocated in the parent's stack frame and accessed via the frame
- // pointer from the funclet. We only save the callee saved registers in the
- // funclet, which are really the callee saved registers of the parent
- // function, including the funclet.
- int64_t NumBytes =
- IsFunclet ? getWinEHFuncletFrameSize(MF) : MFI.getStackSize();
- if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
- assert(!HasFP && "unexpected function without stack frame but with FP");
- assert(!SVEStackSize &&
- "unexpected function without stack frame but with SVE objects");
- // All of the stack allocation is for locals.
- AFI->setLocalStackSize(NumBytes);
- if (!NumBytes) {
- if (NeedsWinCFI && HasWinCFI) {
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
- .setMIFlag(MachineInstr::FrameSetup);
- }
- return;
- }
- // REDZONE: If the stack size is less than 128 bytes, we don't need
- // to actually allocate.
- if (canUseRedZone(MF)) {
- AFI->setHasRedZone(true);
- ++NumRedZoneFunctions;
- } else {
- emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(-NumBytes), TII,
- MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
- if (EmitCFI) {
- // Label used to tie together the PROLOG_LABEL and the MachineMoves.
- MCSymbol *FrameLabel = MF.getContext().createTempSymbol();
- // Encode the stack size of the leaf function.
- CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup)
- .buildDefCFAOffset(NumBytes, FrameLabel);
- }
- }
-
- if (NeedsWinCFI) {
- HasWinCFI = true;
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
- .setMIFlag(MachineInstr::FrameSetup);
- }
-
- return;
- }
-
- bool IsWin64 = Subtarget.isCallingConvWin64(F.getCallingConv(), F.isVarArg());
- unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
-
- // Windows unwind can't represent the required stack adjustments if we have
- // both SVE callee-saves and dynamic stack allocations, and the frame
- // pointer is before the SVE spills. The allocation of the frame pointer
- // must be the last instruction in the prologue so the unwinder can restore
- // the stack pointer correctly. (And there isn't any unwind opcode for
- // `addvl sp, x29, -17`.)
- //
- // Because of this, we do spills in the opposite order on Windows: first SVE,
- // then GPRs. The main side-effect of this is that it makes accessing
- // parameters passed on the stack more expensive.
- //
- // We could consider rearranging the spills for simpler cases.
- bool FPAfterSVECalleeSaves =
- Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize();
-
- if (FPAfterSVECalleeSaves && AFI->hasStackHazardSlotIndex())
- reportFatalUsageError("SME hazard padding is not supported on Windows");
-
- auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
- // All of the remaining stack allocations are for locals.
- AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
- bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
- bool HomPrologEpilog = homogeneousPrologEpilog(MF);
- if (FPAfterSVECalleeSaves) {
- // If we're doing SVE saves first, we need to immediately allocate space
- // for fixed objects, then space for the SVE callee saves.
- //
- // Windows unwind requires that the scalable size is a multiple of 16;
- // that's handled when the callee-saved size is computed.
- auto SaveSize =
- StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()) +
- StackOffset::getFixed(FixedObject);
- allocateStackSpace(MBB, MBBI, 0, SaveSize, NeedsWinCFI, &HasWinCFI,
- /*EmitCFI=*/false, StackOffset{},
- /*FollowupAllocs=*/true);
- NumBytes -= FixedObject;
-
- // Now allocate space for the GPR callee saves.
- while (MBBI != End && IsSVECalleeSave(MBBI))
- ++MBBI;
- MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
- MBB, MBBI, DL, TII, -AFI->getCalleeSavedStackSize(), NeedsWinCFI,
- &HasWinCFI, EmitAsyncCFI);
- NumBytes -= AFI->getCalleeSavedStackSize();
- } else if (CombineSPBump) {
- assert(!SVEStackSize && "Cannot combine SP bump with SVE");
- emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(-NumBytes), TII,
- MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI,
- EmitAsyncCFI);
- NumBytes = 0;
- } else if (HomPrologEpilog) {
- // Stack has been already adjusted.
- NumBytes -= PrologueSaveSize;
- } else if (PrologueSaveSize != 0) {
- MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
- MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI,
- EmitAsyncCFI);
- NumBytes -= PrologueSaveSize;
- }
- assert(NumBytes >= 0 && "Negative stack allocation size!?");
-
- // Move past the saves of the callee-saved registers, fixing up the offsets
- // and pre-inc if we decided to combine the callee-save and local stack
- // pointer bump above.
- auto &TLI = *MF.getSubtarget().getTargetLowering();
- while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
- !IsSVECalleeSave(MBBI)) {
- if (CombineSPBump &&
- // Only fix-up frame-setup load/store instructions.
- (!requiresSaveVG(MF) || !isVGInstruction(MBBI, TLI)))
- fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
- NeedsWinCFI, &HasWinCFI);
- ++MBBI;
- }
-
- // For funclets the FP belongs to the containing function.
- if (!IsFunclet && HasFP) {
- // Only set up FP if we actually need to.
- int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset();
-
- if (CombineSPBump)
- FPOffset += AFI->getLocalStackSize();
-
- if (AFI->hasSwiftAsyncContext()) {
- // Before we update the live FP we have to ensure there's a valid (or
- // null) asynchronous context in its slot just before FP in the frame
- // record, so store it now.
- const auto &Attrs = MF.getFunction().getAttributes();
- bool HaveInitialContext = Attrs.hasAttrSomewhere(Attribute::SwiftAsync);
- if (HaveInitialContext)
- MBB.addLiveIn(AArch64::X22);
- Register Reg = HaveInitialContext ? AArch64::X22 : AArch64::XZR;
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::StoreSwiftAsyncContext))
- .addUse(Reg)
- .addUse(AArch64::SP)
- .addImm(FPOffset - 8)
- .setMIFlags(MachineInstr::FrameSetup);
- if (NeedsWinCFI) {
- // WinCFI and arm64e, where StoreSwiftAsyncContext is expanded
- // to multiple instructions, should be mutually-exclusive.
- assert(Subtarget.getTargetTriple().getArchName() != "arm64e");
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
- .setMIFlags(MachineInstr::FrameSetup);
- HasWinCFI = true;
- }
- }
-
- if (HomPrologEpilog) {
- auto Prolog = MBBI;
- --Prolog;
- assert(Prolog->getOpcode() == AArch64::HOM_Prolog);
- Prolog->addOperand(MachineOperand::CreateImm(FPOffset));
- } else {
- // Issue sub fp, sp, FPOffset or
- // mov fp,sp when FPOffset is zero.
- // Note: All stores of callee-saved registers are marked as "FrameSetup".
- // This code marks the instruction(s) that set the FP also.
- emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
- StackOffset::getFixed(FPOffset), TII,
- MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
- if (NeedsWinCFI && HasWinCFI) {
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
- .setMIFlag(MachineInstr::FrameSetup);
- // After setting up the FP, the rest of the prolog doesn't need to be
- // included in the SEH unwind info.
- NeedsWinCFI = false;
- }
- }
- if (EmitAsyncCFI)
- emitDefineCFAWithFP(MF, MBB, MBBI, FixedObject);
- }
-
- // Now emit the moves for whatever callee saved regs we have (including FP,
- // LR if those are saved). Frame instructions for SVE register are emitted
- // later, after the instruction which actually save SVE regs.
- if (EmitAsyncCFI)
- emitCalleeSavedGPRLocations(MBB, MBBI);
-
- // Alignment is required for the parent frame, not the funclet
- const bool NeedsRealignment =
- NumBytes && !IsFunclet && RegInfo->hasStackRealignment(MF);
- const int64_t RealignmentPadding =
- (NeedsRealignment && MFI.getMaxAlign() > Align(16))
- ? MFI.getMaxAlign().value() - 16
- : 0;
-
- if (windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) {
- if (AFI->getSVECalleeSavedStackSize())
- report_fatal_error(
- "SVE callee saves not yet supported with stack probing");
-
- // Find an available register to spill the value of X15 to, if X15 is being
- // used already for nest.
- unsigned X15Scratch = AArch64::NoRegister;
- const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
- if (llvm::any_of(MBB.liveins(),
- [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
- return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
- AArch64::X15, LiveIn.PhysReg);
- })) {
- X15Scratch = findScratchNonCalleeSaveRegister(&MBB, true);
- assert(X15Scratch != AArch64::NoRegister &&
- (X15Scratch < AArch64::X15 || X15Scratch > AArch64::X17));
-#ifndef NDEBUG
- LiveRegs.removeReg(AArch64::X15); // ignore X15 since we restore it
-#endif
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), X15Scratch)
- .addReg(AArch64::XZR)
- .addReg(AArch64::X15, RegState::Undef)
- .addReg(AArch64::X15, RegState::Implicit)
- .setMIFlag(MachineInstr::FrameSetup);
- }
-
- uint64_t NumWords = (NumBytes + RealignmentPadding) >> 4;
- if (NeedsWinCFI) {
- HasWinCFI = true;
- // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
- // exceed this amount. We need to move at most 2^24 - 1 into x15.
- // This is at most two instructions, MOVZ followed by MOVK.
- // TODO: Fix to use multiple stack alloc unwind codes for stacks
- // exceeding 256MB in size.
- if (NumBytes >= (1 << 28))
- report_fatal_error("Stack size cannot exceed 256MB for stack "
- "unwinding purposes");
-
- uint32_t LowNumWords = NumWords & 0xFFFF;
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
- .addImm(LowNumWords)
- .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
- .setMIFlag(MachineInstr::FrameSetup);
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
- .setMIFlag(MachineInstr::FrameSetup);
- if ((NumWords & 0xFFFF0000) != 0) {
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
- .addReg(AArch64::X15)
- .addImm((NumWords & 0xFFFF0000) >> 16) // High half
- .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
- .setMIFlag(MachineInstr::FrameSetup);
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
- .setMIFlag(MachineInstr::FrameSetup);
- }
- } else {
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
- .addImm(NumWords)
- .setMIFlags(MachineInstr::FrameSetup);
- }
-
- const char *ChkStk = Subtarget.getChkStkName();
- switch (MF.getTarget().getCodeModel()) {
- case CodeModel::Tiny:
- case CodeModel::Small:
- case CodeModel::Medium:
- case CodeModel::Kernel:
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
- .addExternalSymbol(ChkStk)
- .addReg(AArch64::X15, RegState::Implicit)
- .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
- .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
- .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
- .setMIFlags(MachineInstr::FrameSetup);
- if (NeedsWinCFI) {
- HasWinCFI = true;
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
- .setMIFlag(MachineInstr::FrameSetup);
- }
- break;
- case CodeModel::Large:
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
- .addReg(AArch64::X16, RegState::Define)
- .addExternalSymbol(ChkStk)
- .addExternalSymbol(ChkStk)
- .setMIFlags(MachineInstr::FrameSetup);
- if (NeedsWinCFI) {
- HasWinCFI = true;
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
- .setMIFlag(MachineInstr::FrameSetup);
- }
-
- BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
- .addReg(AArch64::X16, RegState::Kill)
- .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
- .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
- .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
- .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
- .setMIFlags(MachineInstr::FrameSetup);
- if (NeedsWinCFI) {
- HasWinCFI = true;
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
- .setMIFlag(MachineInstr::FrameSetup);
- }
- break;
- }
-
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
- .addReg(AArch64::SP, RegState::Kill)
- .addReg(AArch64::X15, RegState::Kill)
- .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
- .setMIFlags(MachineInstr::FrameSetup);
- if (NeedsWinCFI) {
- HasWinCFI = true;
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
- .addImm(NumBytes)
- .setMIFlag(MachineInstr::FrameSetup);
- }
- NumBytes = 0;
-
- if (RealignmentPadding > 0) {
- if (RealignmentPadding >= 4096) {
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm))
- .addReg(AArch64::X16, RegState::Define)
- .addImm(RealignmentPadding)
- .setMIFlags(MachineInstr::FrameSetup);
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXrx64), AArch64::X15)
- .addReg(AArch64::SP)
- .addReg(AArch64::X16, RegState::Kill)
- .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
- .setMIFlag(MachineInstr::FrameSetup);
- } else {
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), AArch64::X15)
- .addReg(AArch64::SP)
- .addImm(RealignmentPadding)
- .addImm(0)
- .setMIFlag(MachineInstr::FrameSetup);
- }
-
- uint64_t AndMask = ~(MFI.getMaxAlign().value() - 1);
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
- .addReg(AArch64::X15, RegState::Kill)
- .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64));
- AFI->setStackRealigned(true);
-
- // No need for SEH instructions here; if we're realigning the stack,
- // we've set a frame pointer and already finished the SEH prologue.
- assert(!NeedsWinCFI);
- }
- if (X15Scratch != AArch64::NoRegister) {
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), AArch64::X15)
- .addReg(AArch64::XZR)
- .addReg(X15Scratch, RegState::Undef)
- .addReg(X15Scratch, RegState::Implicit)
- .setMIFlag(MachineInstr::FrameSetup);
- }
- }
-
- StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize;
- MachineBasicBlock::iterator CalleeSavesEnd = MBBI;
-
- StackOffset CFAOffset =
- StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes);
-
- // Process the SVE callee-saves to determine what space needs to be
- // allocated.
- if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
- LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize
- << "\n");
- SVECalleeSavesSize = StackOffset::getScalable(CalleeSavedSize);
- SVELocalsSize = SVEStackSize - SVECalleeSavesSize;
- // Find callee save instructions in frame.
- // Note: With FPAfterSVECalleeSaves the callee saves have already been
- // allocated.
- if (!FPAfterSVECalleeSaves) {
- MachineBasicBlock::iterator CalleeSavesBegin = MBBI;
- assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
- while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
- ++MBBI;
- CalleeSavesEnd = MBBI;
-
- StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes);
- // Allocate space for the callee saves (if any).
- allocateStackSpace(MBB, CalleeSavesBegin, 0, SVECalleeSavesSize, false,
- nullptr, EmitAsyncCFI && !HasFP, CFAOffset,
- MFI.hasVarSizedObjects() || LocalsSize);
- }
- }
- CFAOffset += SVECalleeSavesSize;
-
- if (EmitAsyncCFI)
- emitCalleeSavedSVELocations(MBB, CalleeSavesEnd);
-
- // Allocate space for the rest of the frame including SVE locals. Align the
- // stack as necessary.
- assert(!(canUseRedZone(MF) && NeedsRealignment) &&
- "Cannot use redzone with stack realignment");
- if (!canUseRedZone(MF)) {
- // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
- // the correct value here, as NumBytes also includes padding bytes,
- // which shouldn't be counted here.
- allocateStackSpace(MBB, CalleeSavesEnd, RealignmentPadding,
- SVELocalsSize + StackOffset::getFixed(NumBytes),
- NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP,
- CFAOffset, MFI.hasVarSizedObjects());
- }
-
- // If we need a base pointer, set it up here. It's whatever the value of the
- // stack pointer is at this point. Any variable size objects will be allocated
- // after this, so we can still use the base pointer to reference locals.
- //
- // FIXME: Clarify FrameSetup flags here.
- // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
- // needed.
- // For funclets the BP belongs to the containing function.
- if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
- TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
- false);
- if (NeedsWinCFI) {
- HasWinCFI = true;
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
- .setMIFlag(MachineInstr::FrameSetup);
- }
- }
-
- // The very last FrameSetup instruction indicates the end of prologue. Emit a
- // SEH opcode indicating the prologue end.
- if (NeedsWinCFI && HasWinCFI) {
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
- .setMIFlag(MachineInstr::FrameSetup);
- }
-
- // SEH funclets are passed the frame pointer in X1. If the parent
- // function uses the base register, then the base register is used
- // directly, and is not retrieved from X1.
- if (IsFunclet && F.hasPersonalityFn()) {
- EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
- if (isAsynchronousEHPersonality(Per)) {
- BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
- .addReg(AArch64::X1)
- .setMIFlag(MachineInstr::FrameSetup);
- MBB.addLiveIn(AArch64::X1);
- }
- }
-
- if (EmitCFI && !EmitAsyncCFI) {
- if (HasFP) {
- emitDefineCFAWithFP(MF, MBB, MBBI, FixedObject);
- } else {
- StackOffset TotalSize =
- SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize());
- CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
- CFIBuilder.insertCFIInst(
- createDefCFA(*RegInfo, /*FrameReg=*/AArch64::SP, /*Reg=*/AArch64::SP,
- TotalSize, /*LastAdjustmentWasScalable=*/false));
- }
- emitCalleeSavedGPRLocations(MBB, MBBI);
- emitCalleeSavedSVELocations(MBB, MBBI);
- }
+ AArch64PrologueEmitter PrologueEmitter(MF, MBB, *this);
+ PrologueEmitter.emitPrologue();
}
static bool isFuncletReturnInstr(const MachineInstr &MI) {
@@ -2548,15 +1791,15 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
if (homogeneousPrologEpilog(MF, &MBB)) {
assert(!NeedsWinCFI);
- auto LastPopI = MBB.getFirstTerminator();
- if (LastPopI != MBB.begin()) {
- auto HomogeneousEpilog = std::prev(LastPopI);
+ auto FirstHomogenousEpilogI = MBB.getFirstTerminator();
+ if (FirstHomogenousEpilogI != MBB.begin()) {
+ auto HomogeneousEpilog = std::prev(FirstHomogenousEpilogI);
if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog)
- LastPopI = HomogeneousEpilog;
+ FirstHomogenousEpilogI = HomogeneousEpilog;
}
// Adjust local stack
- emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+ emitFrameOffset(MBB, FirstHomogenousEpilogI, DL, AArch64::SP, AArch64::SP,
StackOffset::getFixed(AFI->getLocalStackSize()), TII,
MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
@@ -2602,17 +1845,17 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// Move past the restores of the callee-saved registers.
// If we plan on combining the sp bump of the local stack size and the callee
// save stack size, we might need to adjust the CSR save and restore offsets.
- MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
+ MachineBasicBlock::iterator FirstGPRRestoreI = MBB.getFirstTerminator();
MachineBasicBlock::iterator Begin = MBB.begin();
- while (LastPopI != Begin) {
- --LastPopI;
- if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
- (!FPAfterSVECalleeSaves && IsSVECalleeSave(LastPopI))) {
- ++LastPopI;
+ while (FirstGPRRestoreI != Begin) {
+ --FirstGPRRestoreI;
+ if (!FirstGPRRestoreI->getFlag(MachineInstr::FrameDestroy) ||
+ (!FPAfterSVECalleeSaves && isSVECalleeSave(FirstGPRRestoreI))) {
+ ++FirstGPRRestoreI;
break;
} else if (CombineSPBump)
- fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
- NeedsWinCFI, &HasWinCFI);
+ fixupCalleeSaveRestoreStackOffset(
+ *FirstGPRRestoreI, AFI->getLocalStackSize(), NeedsWinCFI, &HasWinCFI);
}
if (NeedsWinCFI) {
@@ -2622,9 +1865,9 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// arguments. Insert the SEH_EpilogStart and remove it later if it
// we didn't emit any SEH opcodes to avoid generating WinCFI for
// functions that don't need it.
- BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
+ BuildMI(MBB, FirstGPRRestoreI, DL, TII->get(AArch64::SEH_EpilogStart))
.setMIFlag(MachineInstr::FrameDestroy);
- EpilogStartI = LastPopI;
+ EpilogStartI = FirstGPRRestoreI;
--EpilogStartI;
}
@@ -2665,7 +1908,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// When we are about to restore the CSRs, the CFA register is SP again.
if (EmitCFI && hasFP(MF))
- CFIInstBuilder(MBB, LastPopI, MachineInstr::FrameDestroy)
+ CFIInstBuilder(MBB, FirstGPRRestoreI, MachineInstr::FrameDestroy)
.buildDefCFA(AArch64::SP, NumBytes);
emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
@@ -2681,18 +1924,19 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// Process the SVE callee-saves to determine what space needs to be
// deallocated.
StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
- MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
+ MachineBasicBlock::iterator RestoreBegin = FirstGPRRestoreI,
+ RestoreEnd = FirstGPRRestoreI;
if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
if (FPAfterSVECalleeSaves)
RestoreEnd = MBB.getFirstTerminator();
RestoreBegin = std::prev(RestoreEnd);
while (RestoreBegin != MBB.begin() &&
- IsSVECalleeSave(std::prev(RestoreBegin)))
+ isSVECalleeSave(std::prev(RestoreBegin)))
--RestoreBegin;
- assert(IsSVECalleeSave(RestoreBegin) &&
- IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
+ assert(isSVECalleeSave(RestoreBegin) &&
+ isSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
StackOffset CalleeSavedSizeAsOffset =
StackOffset::getScalable(CalleeSavedSize);
@@ -2706,7 +1950,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// deallocates non-callee-save SVE allocations. Otherwise, deallocate
// them explicitly.
if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) {
- emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+ emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP,
DeallocateBefore, TII, MachineInstr::FrameDestroy, false,
NeedsWinCFI, &HasWinCFI);
}
@@ -2796,7 +2040,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
StackRestoreBytes += AfterCSRPopSize;
emitFrameOffset(
- MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+ MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP,
StackOffset::getFixed(StackRestoreBytes), TII,
MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI, EmitCFI,
StackOffset::getFixed((RedZone ? 0 : NumBytes) + PrologueSaveSize));
@@ -2816,17 +2060,17 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// be able to save any instructions.
if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
emitFrameOffset(
- MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
+ MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::FP,
StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()),
TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
} else if (NumBytes)
- emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+ emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP,
StackOffset::getFixed(NumBytes), TII,
MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
// When we are about to restore the CSRs, the CFA register is SP again.
if (EmitCFI && hasFP(MF))
- CFIInstBuilder(MBB, LastPopI, MachineInstr::FrameDestroy)
+ CFIInstBuilder(MBB, FirstGPRRestoreI, MachineInstr::FrameDestroy)
.buildDefCFA(AArch64::SP, PrologueSaveSize);
// This must be placed after the callee-save restore code because that code
@@ -2926,8 +2170,8 @@ AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF,
return StackOffset::getFixed(getSEHFrameIndexOffset(MF, FI));
}
-static StackOffset getFPOffset(const MachineFunction &MF,
- int64_t ObjectOffset) {
+StackOffset AArch64FrameLowering::getFPOffset(const MachineFunction &MF,
+ int64_t ObjectOffset) const {
const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const Function &F = MF.getFunction();
@@ -2940,8 +2184,8 @@ static StackOffset getFPOffset(const MachineFunction &MF,
return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust);
}
-static StackOffset getStackOffset(const MachineFunction &MF,
- int64_t ObjectOffset) {
+StackOffset AArch64FrameLowering::getStackOffset(const MachineFunction &MF,
+ int64_t ObjectOffset) const {
const auto &MFI = MF.getFrameInfo();
return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize());
}
@@ -3139,7 +2383,8 @@ static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
return getKillRegState(!IsLiveIn);
}
-static bool produceCompactUnwindFrame(MachineFunction &MF) {
+static bool produceCompactUnwindFrame(const AArch64FrameLowering &AFL,
+ MachineFunction &MF) {
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
AttributeList Attrs = MF.getFunction().getAttributes();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
@@ -3147,7 +2392,7 @@ static bool produceCompactUnwindFrame(MachineFunction &MF) {
!(Subtarget.getTargetLowering()->supportSwiftError() &&
Attrs.hasAttrSomewhere(Attribute::SwiftError)) &&
MF.getFunction().getCallingConv() != CallingConv::SwiftTail &&
- !requiresSaveVG(MF) && !AFI->isSVECC();
+ !AFL.requiresSaveVG(MF) && !AFI->isSVECC();
}
static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
@@ -3244,16 +2489,18 @@ bool enableMultiVectorSpillFill(const AArch64Subtarget &Subtarget,
(!IsLocallyStreaming && Subtarget.isStreaming()));
}
-static void computeCalleeSaveRegisterPairs(
- MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
- const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
- bool NeedsFrameRecord) {
+void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL,
+ MachineFunction &MF,
+ ArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI,
+ SmallVectorImpl<RegPairInfo> &RegPairs,
+ bool NeedsFrameRecord) {
if (CSI.empty())
return;
bool IsWindows = isTargetWindows(MF);
- bool NeedsWinCFI = needsWinCFI(MF);
+ bool NeedsWinCFI = AFL.needsWinCFI(MF);
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
unsigned StackHazardSize = getStackHazardSize(MF);
MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -3262,9 +2509,10 @@ static void computeCalleeSaveRegisterPairs(
(void)CC;
// MachO's compact unwind format relies on all registers being stored in
// pairs.
- assert((!produceCompactUnwindFrame(MF) || CC == CallingConv::PreserveMost ||
- CC == CallingConv::PreserveAll || CC == CallingConv::CXX_FAST_TLS ||
- CC == CallingConv::Win64 || (Count & 1) == 0) &&
+ assert((!produceCompactUnwindFrame(AFL, MF) ||
+ CC == CallingConv::PreserveMost || CC == CallingConv::PreserveAll ||
+ CC == CallingConv::CXX_FAST_TLS || CC == CallingConv::Win64 ||
+ (Count & 1) == 0) &&
"Odd number of callee-saved regs to spill!");
int ByteOffset = AFI->getCalleeSavedStackSize();
int StackFillDir = -1;
@@ -3380,9 +2628,9 @@ static void computeCalleeSaveRegisterPairs(
// MachO's compact unwind format relies on all registers being stored in
// adjacent register pairs.
- assert((!produceCompactUnwindFrame(MF) || CC == CallingConv::PreserveMost ||
- CC == CallingConv::PreserveAll || CC == CallingConv::CXX_FAST_TLS ||
- CC == CallingConv::Win64 ||
+ assert((!produceCompactUnwindFrame(AFL, MF) ||
+ CC == CallingConv::PreserveMost || CC == CallingConv::PreserveAll ||
+ CC == CallingConv::CXX_FAST_TLS || CC == CallingConv::Win64 ||
(RPI.isPaired() &&
((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
RPI.Reg1 + 1 == RPI.Reg2))) &&
@@ -3495,7 +2743,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
DebugLoc DL;
SmallVector<RegPairInfo, 8> RegPairs;
- computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
+ computeCalleeSaveRegisterPairs(*this, MF, CSI, TRI, RegPairs, hasFP(MF));
MachineRegisterInfo &MRI = MF.getRegInfo();
// Refresh the reserved regs in case there are any potential changes since the
@@ -3707,7 +2955,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
if (MBBI != MBB.end())
DL = MBBI->getDebugLoc();
- computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
+ computeCalleeSaveRegisterPairs(*this, MF, CSI, TRI, RegPairs, hasFP(MF));
if (homogeneousPrologEpilog(MF, &MBB)) {
auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
.setMIFlag(MachineInstr::FrameDestroy);
@@ -4141,7 +3389,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
if (producePairRegisters(MF)) {
if (UnspilledCSGPRPaired == AArch64::NoRegister) {
// Failed to make a pair for compact unwind format, revert spilling.
- if (produceCompactUnwindFrame(MF)) {
+ if (produceCompactUnwindFrame(*this, MF)) {
SavedRegs.reset(UnspilledCSGPR);
ExtraCSSpill = AArch64::NoRegister;
}
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index 555a93359c27..a9d65441a4e3 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -19,6 +19,10 @@
namespace llvm {
+class TargetLowering;
+class AArch64FunctionInfo;
+class AArch64PrologueEmitter;
+
class AArch64FrameLowering : public TargetFrameLowering {
public:
explicit AArch64FrameLowering()
@@ -130,12 +134,19 @@ public:
return StackId != TargetStackID::ScalableVector;
}
+ friend class AArch64PrologueEmitter;
void
orderFrameObjects(const MachineFunction &MF,
SmallVectorImpl<int> &ObjectsToAllocate) const override;
bool isFPReserved(const MachineFunction &MF) const;
+ bool needsWinCFI(const MachineFunction &MF) const;
+
+ bool requiresSaveVG(const MachineFunction &MF) const;
+
+ StackOffset getSVEStackSize(const MachineFunction &MF) const;
+
protected:
bool hasFPImpl(const MachineFunction &MF) const override;
@@ -159,10 +170,6 @@ private:
int &MaxCSFrameIndex) const;
bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB,
uint64_t StackBumpBytes) const;
- void emitCalleeSavedGPRLocations(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI) const;
- void emitCalleeSavedSVELocations(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI) const;
void emitCalleeSavedGPRRestores(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) const;
void emitCalleeSavedSVERestores(MachineBasicBlock &MBB,
@@ -196,6 +203,61 @@ private:
void emitRemarks(const MachineFunction &MF,
MachineOptimizationRemarkEmitter *ORE) const override;
+
+ bool windowsRequiresStackProbe(const MachineFunction &MF,
+ uint64_t StackSizeInBytes) const;
+
+ bool shouldSignReturnAddressEverywhere(const MachineFunction &MF) const;
+
+ StackOffset getFPOffset(const MachineFunction &MF,
+ int64_t ObjectOffset) const;
+
+ StackOffset getStackOffset(const MachineFunction &MF,
+ int64_t ObjectOffset) const;
+
+ // Find a scratch register that we can use at the start of the prologue to
+ // re-align the stack pointer. We avoid using callee-save registers since
+ // they may appear to be free when this is called from canUseAsPrologue
+ // (during shrink wrapping), but then no longer be free when this is called
+ // from emitPrologue.
+ //
+ // FIXME: This is a bit conservative, since in the above case we could use one
+ // of the callee-save registers as a scratch temp to re-align the stack
+ // pointer, but we would then have to make sure that we were in fact saving at
+ // least one callee-save register in the prologue, which is additional
+ // complexity that doesn't seem worth the benefit.
+ Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
+ bool HasCall = false) const;
+
+ // Convert callee-save register save/restore instruction to do stack pointer
+ // decrement/increment to allocate/deallocate the callee-save stack area by
+ // converting store/load to use pre/post increment version.
+ MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
+ bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI,
+ MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup,
+ int CFAOffset = 0) const;
+
+ // Fixup callee-save register save/restore instructions to take into account
+ // combined SP bump by adding the local stack size to the stack offsets.
+ void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
+ uint64_t LocalStackSize,
+ bool NeedsWinCFI,
+ bool *HasWinCFI) const;
+
+ bool isSVECalleeSave(MachineBasicBlock::iterator I) const;
+
+ /// Returns the size of the fixed object area (allocated next to sp on entry)
+ /// On Win64 this may include a var args area and an UnwindHelp object for EH.
+ unsigned getFixedObjectSize(const MachineFunction &MF,
+ const AArch64FunctionInfo *AFI, bool IsWin64,
+ bool IsFunclet) const;
+
+ bool isVGInstruction(MachineBasicBlock::iterator MBBI,
+ const TargetLowering &TLI) const;
+
+ bool requiresGetVGCall(const MachineFunction &MF) const;
};
} // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index bc786f415b55..6fdc981fc21a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -246,9 +246,9 @@ public:
return false;
}
- template<MVT::SimpleValueType VT>
+ template <MVT::SimpleValueType VT, bool Negate>
bool SelectSVEAddSubImm(SDValue N, SDValue &Imm, SDValue &Shift) {
- return SelectSVEAddSubImm(N, VT, Imm, Shift);
+ return SelectSVEAddSubImm(N, VT, Imm, Shift, Negate);
}
template <MVT::SimpleValueType VT, bool Negate>
@@ -489,7 +489,8 @@ private:
bool SelectCMP_SWAP(SDNode *N);
- bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift);
+ bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift,
+ bool Negate);
bool SelectSVEAddSubSSatImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift,
bool Negate);
bool SelectSVECpyDupImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift);
@@ -4227,35 +4228,36 @@ bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
}
bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm,
- SDValue &Shift) {
+ SDValue &Shift, bool Negate) {
if (!isa<ConstantSDNode>(N))
return false;
SDLoc DL(N);
- uint64_t Val = cast<ConstantSDNode>(N)
- ->getAPIntValue()
- .trunc(VT.getFixedSizeInBits())
- .getZExtValue();
+ APInt Val =
+ cast<ConstantSDNode>(N)->getAPIntValue().trunc(VT.getFixedSizeInBits());
+
+ if (Negate)
+ Val = -Val;
switch (VT.SimpleTy) {
case MVT::i8:
// All immediates are supported.
Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
- Imm = CurDAG->getTargetConstant(Val, DL, MVT::i32);
+ Imm = CurDAG->getTargetConstant(Val.getZExtValue(), DL, MVT::i32);
return true;
case MVT::i16:
case MVT::i32:
case MVT::i64:
// Support 8bit unsigned immediates.
- if (Val <= 255) {
+ if ((Val & ~0xff) == 0) {
Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
- Imm = CurDAG->getTargetConstant(Val, DL, MVT::i32);
+ Imm = CurDAG->getTargetConstant(Val.getZExtValue(), DL, MVT::i32);
return true;
}
// Support 16bit unsigned immediates that are a multiple of 256.
- if (Val <= 65280 && Val % 256 == 0) {
+ if ((Val & ~0xff00) == 0) {
Shift = CurDAG->getTargetConstant(8, DL, MVT::i32);
- Imm = CurDAG->getTargetConstant(Val >> 8, DL, MVT::i32);
+ Imm = CurDAG->getTargetConstant(Val.lshr(8).getZExtValue(), DL, MVT::i32);
return true;
}
break;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d70a46b0e893..5ffaf2c49b4c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1179,6 +1179,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::VECTOR_DEINTERLEAVE);
// In case of strict alignment, avoid an excessive number of byte wide stores.
MaxStoresPerMemsetOptSize = 8;
@@ -1918,6 +1919,20 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
}
+ // Handle non-aliasing elements mask
+ if (Subtarget->hasSVE2() ||
+ (Subtarget->hasSME() && Subtarget->isStreaming())) {
+ // FIXME: Support wider fixed-length types when msve-vector-bits is used.
+ for (auto VT : {MVT::v2i32, MVT::v4i16, MVT::v8i8, MVT::v16i8}) {
+ setOperationAction(ISD::LOOP_DEPENDENCE_RAW_MASK, VT, Custom);
+ setOperationAction(ISD::LOOP_DEPENDENCE_WAR_MASK, VT, Custom);
+ }
+ for (auto VT : {MVT::nxv2i1, MVT::nxv4i1, MVT::nxv8i1, MVT::nxv16i1}) {
+ setOperationAction(ISD::LOOP_DEPENDENCE_RAW_MASK, VT, Custom);
+ setOperationAction(ISD::LOOP_DEPENDENCE_WAR_MASK, VT, Custom);
+ }
+ }
+
// Handle operations that are only available in non-streaming SVE mode.
if (Subtarget->isSVEAvailable()) {
for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
@@ -2585,6 +2600,30 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
Known = Known.intersectWith(Known2);
break;
}
+ case AArch64ISD::CSNEG:
+ case AArch64ISD::CSINC:
+ case AArch64ISD::CSINV: {
+ KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
+ KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
+
+ // The result is either:
+ // CSINC: KnownOp0 or KnownOp1 + 1
+ // CSINV: KnownOp0 or ~KnownOp1
+ // CSNEG: KnownOp0 or KnownOp1 * -1
+ if (Op.getOpcode() == AArch64ISD::CSINC)
+ KnownOp1 = KnownBits::add(
+ KnownOp1,
+ KnownBits::makeConstant(APInt(Op.getScalarValueSizeInBits(), 1)));
+ else if (Op.getOpcode() == AArch64ISD::CSINV)
+ std::swap(KnownOp1.Zero, KnownOp1.One);
+ else if (Op.getOpcode() == AArch64ISD::CSNEG)
+ KnownOp1 =
+ KnownBits::mul(KnownOp1, KnownBits::makeConstant(APInt::getAllOnes(
+ Op.getScalarValueSizeInBits())));
+
+ Known = KnownOp0.intersectWith(KnownOp1);
+ break;
+ }
case AArch64ISD::BICi: {
// Compute the bit cleared value.
APInt Mask =
@@ -2626,6 +2665,32 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
<< Op->getConstantOperandVal(1)));
break;
}
+ case AArch64ISD::MOVImsl: {
+ unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
+ Known = KnownBits::makeConstant(APInt(
+ Known.getBitWidth(), ~(~Op->getConstantOperandVal(0) << ShiftAmt)));
+ break;
+ }
+ case AArch64ISD::MOVIedit: {
+ Known = KnownBits::makeConstant(APInt(
+ Known.getBitWidth(),
+ AArch64_AM::decodeAdvSIMDModImmType10(Op->getConstantOperandVal(0))));
+ break;
+ }
+ case AArch64ISD::MVNIshift: {
+ Known = KnownBits::makeConstant(
+ APInt(Known.getBitWidth(),
+ ~(Op->getConstantOperandVal(0) << Op->getConstantOperandVal(1)),
+ /*isSigned*/ false, /*implicitTrunc*/ true));
+ break;
+ }
+ case AArch64ISD::MVNImsl: {
+ unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
+ Known = KnownBits::makeConstant(
+ APInt(Known.getBitWidth(), (~Op->getConstantOperandVal(0) << ShiftAmt),
+ /*isSigned*/ false, /*implicitTrunc*/ true));
+ break;
+ }
case AArch64ISD::LOADgot:
case AArch64ISD::ADDlow: {
if (!Subtarget->isTargetILP32())
@@ -2984,21 +3049,20 @@ AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI,
AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
if (TPIDR2.Uses > 0) {
+ // Note: This case just needs to do `SVL << 48`. It is not implemented as we
+ // generally don't support big-endian SVE/SME.
+ if (!Subtarget->isLittleEndian())
+ reportFatalInternalError(
+ "TPIDR2 block initialization is not supported on big-endian targets");
+
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- // Store the buffer pointer to the TPIDR2 stack object.
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
+ // Store buffer pointer and num_za_save_slices.
+ // Bytes 10-15 are implicitly zeroed.
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STPXi))
.addReg(MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(1).getReg())
.addFrameIndex(TPIDR2.FrameIndex)
.addImm(0);
- // Set the reserved bytes (10-15) to zero
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
- .addReg(AArch64::WZR)
- .addFrameIndex(TPIDR2.FrameIndex)
- .addImm(5);
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
- .addReg(AArch64::WZR)
- .addFrameIndex(TPIDR2.FrameIndex)
- .addImm(3);
} else
MFI.RemoveStackObject(TPIDR2.FrameIndex);
@@ -3111,21 +3175,24 @@ MachineBasicBlock *
AArch64TargetLowering::EmitEntryPStateSM(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
- AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
Register ResultReg = MI.getOperand(0).getReg();
- if (FuncInfo->isPStateSMRegUsed()) {
+ if (MF->getRegInfo().use_empty(ResultReg)) {
+ // Nothing to do. Pseudo erased below.
+ } else if (Subtarget->hasSME()) {
+ BuildMI(*BB, MI, DL, TII->get(AArch64::MRS), ResultReg)
+ .addImm(AArch64SysReg::SVCR)
+ .addReg(AArch64::VG, RegState::Implicit);
+ } else {
RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
+ BuildMI(*BB, MI, DL, TII->get(AArch64::BL))
.addExternalSymbol(getLibcallName(LC))
.addReg(AArch64::X0, RegState::ImplicitDefine)
.addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), ResultReg)
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), ResultReg)
.addReg(AArch64::X0);
- } else {
- assert(MI.getMF()->getRegInfo().use_empty(ResultReg) &&
- "Expected no users of the entry pstate.sm!");
}
MI.eraseFromParent();
return BB;
@@ -4912,6 +4979,18 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
if (DstWidth < SatWidth)
return SDValue();
+ if (SrcVT == MVT::f16 && SatVT == MVT::i16 && DstVT == MVT::i32) {
+ if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
+ SDValue CVTf32 =
+ DAG.getNode(AArch64ISD::FCVTZS_HALF, DL, MVT::f32, SrcVal);
+ SDValue Bitcast = DAG.getBitcast(DstVT, CVTf32);
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, Bitcast,
+ DAG.getValueType(SatVT));
+ }
+ SDValue CVTf32 = DAG.getNode(AArch64ISD::FCVTZU_HALF, DL, MVT::f32, SrcVal);
+ return DAG.getBitcast(DstVT, CVTf32);
+ }
+
SDValue NativeCvt =
DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
SDValue Sat;
@@ -5242,6 +5321,56 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
static MVT getSVEContainerType(EVT ContentTy);
+SDValue // Custom lowering for LOOP_DEPENDENCE_RAW_MASK / LOOP_DEPENDENCE_WAR_MASK nodes.
+AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op, // Returns an empty SDValue when the element size and result type do not pair up.
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ uint64_t EltSize = Op.getConstantOperandVal(2); // Operand 2 is read as a constant element size.
+ EVT VT = Op.getValueType();
+ switch (EltSize) { // Each handled size is accepted with exactly one fixed-length and one scalable result type.
+ case 1:
+ if (VT != MVT::v16i8 && VT != MVT::nxv16i1)
+ return SDValue();
+ break;
+ case 2:
+ if (VT != MVT::v8i8 && VT != MVT::nxv8i1)
+ return SDValue();
+ break;
+ case 4:
+ if (VT != MVT::v4i16 && VT != MVT::nxv4i1)
+ return SDValue();
+ break;
+ case 8:
+ if (VT != MVT::v2i32 && VT != MVT::nxv2i1)
+ return SDValue();
+ break;
+ default:
+ // Other element sizes are incompatible with whilewr/rw, so expand instead
+ return SDValue();
+ }
+
+ SDValue PtrA = Op.getOperand(0);
+ SDValue PtrB = Op.getOperand(1);
+
+ if (VT.isScalableVT()) // Scalable result: hand back the same operation on the same operands.
+ return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, Op.getOperand(2));
+
+ // We can use the SVE whilewr/whilerw instruction to lower this
+ // intrinsic by creating the appropriate sequence of scalable vector
+ // operations and then extracting a fixed-width subvector from the scalable
+ // vector. Scalable vector variants are already legal.
+ EVT ContainerVT = // Built from VT's element type and element count; the trailing 'true' is presumably the scalable flag - confirm against EVT::getVectorVT.
+ EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ VT.getVectorNumElements(), true);
+ EVT WhileVT = ContainerVT.changeElementType(MVT::i1); // Same shape as ContainerVT, with i1 elements.
+
+ SDValue Mask =
+ DAG.getNode(Op.getOpcode(), DL, WhileVT, PtrA, PtrB, Op.getOperand(2));
+ SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask); // Sign-extend each i1 lane to the container element type.
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt, // Take the fixed-length VT starting at element index 0.
+ DAG.getVectorIdxConstant(0, DL));
+}
+
SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
SelectionDAG &DAG) const {
EVT OpVT = Op.getValueType();
@@ -6000,6 +6129,38 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
}
+ case Intrinsic::aarch64_sve_whilewr_b:
+ return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(1, DL, MVT::i64));
+ case Intrinsic::aarch64_sve_whilewr_h:
+ return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(2, DL, MVT::i64));
+ case Intrinsic::aarch64_sve_whilewr_s:
+ return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(4, DL, MVT::i64));
+ case Intrinsic::aarch64_sve_whilewr_d:
+ return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(8, DL, MVT::i64));
+ case Intrinsic::aarch64_sve_whilerw_b:
+ return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(1, DL, MVT::i64));
+ case Intrinsic::aarch64_sve_whilerw_h:
+ return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(2, DL, MVT::i64));
+ case Intrinsic::aarch64_sve_whilerw_s:
+ return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(4, DL, MVT::i64));
+ case Intrinsic::aarch64_sve_whilerw_d:
+ return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(8, DL, MVT::i64));
case Intrinsic::aarch64_neon_abs: {
EVT Ty = Op.getValueType();
if (Ty == MVT::i64) {
@@ -7359,6 +7520,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
default:
llvm_unreachable("unimplemented operand");
return SDValue();
+ case ISD::LOOP_DEPENDENCE_RAW_MASK:
+ case ISD::LOOP_DEPENDENCE_WAR_MASK:
+ return LowerLOOP_DEPENDENCE_MASK(Op, DAG);
case ISD::BITCAST:
return LowerBITCAST(Op, DAG);
case ISD::GlobalAddress:
@@ -7873,6 +8037,39 @@ static bool isPassedInFPR(EVT VT) {
(VT.isFloatingPoint() && !VT.isScalableVector());
}
+SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL, // Emits the streaming-mode change, if any, needed on entry to an EH pad.
+ SelectionDAG &DAG) const { // Returns Chain itself when nothing is needed, otherwise the changeStreamingMode() result.
+ assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
+ SDValue Glue = Chain.getValue(1); // Result 1 of the chain node is forwarded as the glue operand.
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ SMEAttrs SMEFnAttrs = MF.getInfo<AArch64FunctionInfo>()->getSMEFnAttrs();
+
+ // The following conditions are true on entry to an exception handler:
+ // - PSTATE.SM is 0.
+ // - PSTATE.ZA is 0.
+ // - TPIDR2_EL0 is null.
+ // See:
+ // https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#exceptions
+ //
+ // Therefore, if the function that contains this exception handler is a
+ // streaming[-compatible] function, we must re-enable streaming mode.
+ //
+ // These mode changes are usually optimized away in catch blocks as they
+ // occur before the __cxa_begin_catch (which is a non-streaming function),
+ // but are necessary in some cases (such as for cleanups).
+
+ if (SMEFnAttrs.hasStreamingInterfaceOrBody()) // Streaming interface or body: enable with AArch64SME::Always.
+ return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain,
+ /*Glue*/ Glue, AArch64SME::Always);
+
+ if (SMEFnAttrs.hasStreamingCompatibleInterface()) // Streaming-compatible: enable under AArch64SME::IfCallerIsStreaming.
+ return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
+ AArch64SME::IfCallerIsStreaming);
+
+ return Chain; // Neither attribute applies: the chain is returned untouched.
+}
+
SDValue AArch64TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -8292,7 +8489,39 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
if (Subtarget->hasCustomCallingConv())
Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
- if (!getTM().useNewSMEABILowering() || Attrs.hasAgnosticZAInterface()) {
+ if (getTM().useNewSMEABILowering()) {
+ if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
+ SDValue Size;
+ if (Attrs.hasZAState()) {
+ SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
+ DAG.getConstant(1, DL, MVT::i32));
+ Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
+ } else if (Attrs.hasAgnosticZAInterface()) {
+ RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
+ SDValue Callee = DAG.getExternalSymbol(
+ getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
+ auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
+ getLibcallCallingConv(LC), RetTy, Callee, {});
+ std::tie(Size, Chain) = LowerCallTo(CLI);
+ }
+ if (Size) {
+ SDValue Buffer = DAG.getNode(
+ ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
+ {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
+ Chain = Buffer.getValue(1);
+
+ Register BufferPtr =
+ MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+ Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
+ Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
+ DAG.getVTList(MVT::Other), Chain);
+ FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
+ MFI.CreateVariableSizedObject(Align(16), nullptr);
+ }
+ }
+ } else {
// Old SME ABI lowering (deprecated):
// Create a 16 Byte TPIDR2 object. The dynamic buffer
// will be expanded and stored in the static object later using a
@@ -8313,9 +8542,12 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
{Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
MFI.CreateVariableSizedObject(Align(16), nullptr);
}
+ SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
+ DAG.getConstant(1, DL, MVT::i32));
Chain = DAG.getNode(
AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
- {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)});
+ {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0),
+ /*Num save slices*/ NumZaSaveSlices});
} else if (Attrs.hasAgnosticZAInterface()) {
// Call __arm_sme_state_size().
SDValue BufferSize =
@@ -8338,7 +8570,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
Register BufferPtr =
MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
FuncInfo->setSMESaveBufferAddr(BufferPtr);
- Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
+ Chain = DAG.getCopyToReg(Buffer.getValue(1), DL, BufferPtr, Buffer);
}
}
@@ -8905,7 +9137,6 @@ SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
SmallVector<SDValue> Ops = {Chain, MSROp};
unsigned Opcode;
if (Condition != AArch64SME::Always) {
- FuncInfo->setPStateSMRegUsed(true);
Register PStateReg = FuncInfo->getPStateSMReg();
assert(PStateReg.isValid() && "PStateSM Register is invalid");
SDValue PStateSM =
@@ -9078,17 +9309,17 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Determine whether we need any streaming mode changes.
SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI);
+
+ std::optional<unsigned> ZAMarkerNode;
bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
- bool IsAgnosticZAFunction = CallAttrs.caller().hasAgnosticZAInterface();
- auto ZAMarkerNode = [&]() -> std::optional<unsigned> {
- // TODO: Handle agnostic ZA functions.
- if (!UseNewSMEABILowering || IsAgnosticZAFunction)
- return std::nullopt;
- if (!CallAttrs.caller().hasZAState() && !CallAttrs.caller().hasZT0State())
- return std::nullopt;
- return CallAttrs.requiresLazySave() ? AArch64ISD::REQUIRES_ZA_SAVE
- : AArch64ISD::INOUT_ZA_USE;
- }();
+ if (UseNewSMEABILowering) {
+ if (CallAttrs.requiresLazySave() ||
+ CallAttrs.requiresPreservingAllZAState())
+ ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE;
+ else if (CallAttrs.caller().hasZAState() ||
+ CallAttrs.caller().hasZT0State())
+ ZAMarkerNode = AArch64ISD::INOUT_ZA_USE;
+ }
if (IsTailCall) {
// Check if it's really possible to do a tail call.
@@ -9163,21 +9394,13 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
};
bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
- bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState();
+ bool RequiresSaveAllZA =
+ !UseNewSMEABILowering && CallAttrs.requiresPreservingAllZAState();
if (RequiresLazySave) {
- const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
- MachinePointerInfo MPI =
- MachinePointerInfo::getStack(MF, TPIDR2.FrameIndex);
+ TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
TPIDR2.FrameIndex,
DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
- SDValue NumZaSaveSlicesAddr =
- DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
- DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
- SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
- DAG.getConstant(1, DL, MVT::i32));
- Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
- MPI, MVT::i16);
Chain = DAG.getNode(
ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
@@ -17599,14 +17822,16 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
Value *LaneMask,
ShuffleVectorInst *SVI,
- unsigned Factor) const {
+ unsigned Factor,
+ const APInt &GapMask) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
auto *SI = dyn_cast<StoreInst>(Store);
if (!SI)
return false;
- assert(!LaneMask && "Unexpected mask on store");
+ assert(!LaneMask && GapMask.popcount() == Factor &&
+ "Unexpected mask on store");
auto *VecTy = cast<FixedVectorType>(SVI->getType());
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
@@ -20868,13 +21093,6 @@ static bool isNegatedInteger(SDValue Op) {
return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
}
-static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- SDValue Zero = DAG.getConstant(0, DL, VT);
- return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
-}
-
// Try to fold
//
// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
@@ -20893,16 +21111,17 @@ static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = CSel.getOperand(0);
SDValue N1 = CSel.getOperand(1);
- // If both of them is not negations, it's not worth the folding as it
+ // If neither of them are negations, it's not worth the folding as it
// introduces two additional negations while reducing one negation.
if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
return SDValue();
- SDValue N0N = getNegatedInteger(N0, DAG);
- SDValue N1N = getNegatedInteger(N1, DAG);
-
SDLoc DL(N);
EVT VT = CSel.getValueType();
+
+ SDValue N0N = DAG.getNegative(N0, DL, VT);
+ SDValue N1N = DAG.getNegative(N1, DL, VT);
+
return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
CSel.getOperand(3));
}
@@ -22087,10 +22306,14 @@ static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
}
+ unsigned PTest = AArch64ISD::PTEST;
+ if (Cond == AArch64CC::ANY_ACTIVE)
+ PTest = AArch64ISD::PTEST_ANY;
+ else if (Cond == AArch64CC::FIRST_ACTIVE)
+ PTest = AArch64ISD::PTEST_FIRST;
+
// Set condition code (CC) flags.
- SDValue Test = DAG.getNode(
- Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
- DL, MVT::i32, Pg, Op);
+ SDValue Test = DAG.getNode(PTest, DL, MVT::i32, Pg, Op);
// Convert CC to integer based on requested condition.
// NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
@@ -22158,6 +22381,17 @@ static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
Zero);
}
+static SDValue tryCombineNeonFcvtFP16ToI16(SDNode *N, unsigned Opcode, // Rewrites an i16-result node N as an Opcode node followed by a bitcast and a truncate.
+ SelectionDAG &DAG) { // Returns an empty SDValue when N's first result type is not i16.
+ if (N->getValueType(0) != MVT::i16)
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue CVT = DAG.getNode(Opcode, DL, MVT::f32, N->getOperand(1)); // NOTE(review): operand 1's type is not checked here; presumably f16 given the function name - confirm.
+ SDValue Bitcast = DAG.getBitcast(MVT::i32, CVT); // Reinterpret the f32-typed result as i32.
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Bitcast); // Narrow to the i16 type checked above.
+}
+
// If a merged operation has no inactive lanes we can relax it to a predicated
// or unpredicated operation, which potentially allows better isel (perhaps
// using immediate forms) or relaxing register reuse requirements.
@@ -22411,6 +22645,26 @@ static SDValue performIntrinsicCombine(SDNode *N,
case Intrinsic::aarch64_neon_uabd:
return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
+ case Intrinsic::aarch64_neon_fcvtzs:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZS_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtzu:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZU_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtas:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAS_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtau:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAU_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtms:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMS_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtmu:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMU_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtns:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNS_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtnu:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNU_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtps:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPS_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtpu:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPU_HALF, DAG);
case Intrinsic::aarch64_crc32b:
case Intrinsic::aarch64_crc32cb:
return tryCombineCRC32(0xff, N, DAG);
@@ -22419,7 +22673,7 @@ static SDValue performIntrinsicCombine(SDNode *N,
return tryCombineCRC32(0xffff, N, DAG);
case Intrinsic::aarch64_sve_saddv:
// There is no i64 version of SADDV because the sign is irrelevant.
- if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
+ if (N->getOperand(2).getValueType().getVectorElementType() == MVT::i64)
return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
else
return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
@@ -24106,6 +24360,7 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
// Ensure that all elements' bits are either 0s or 1s.
ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
+ bool IsLE = DAG.getDataLayout().isLittleEndian();
SmallVector<SDValue, 16> MaskConstants;
if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
VecVT == MVT::v16i8) {
@@ -24113,7 +24368,10 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
// per entry. We split it into two halves, apply the mask, zip the halves to
// create 8x 16-bit values, and the perform the vector reduce.
for (unsigned Half = 0; Half < 2; ++Half) {
- for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
+ for (unsigned I = 0; I < 8; ++I) {
+ // On big-endian targets, the lane order in sub-byte vector elements
+ // gets reversed, so we need to flip the bit index.
+ unsigned MaskBit = IsLE ? (1u << I) : (1u << (7 - I));
MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
}
}
@@ -24131,8 +24389,9 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
}
// All other vector sizes.
- unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
- for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
+ unsigned NumEl = VecVT.getVectorNumElements();
+ for (unsigned I = 0; I < NumEl; ++I) {
+ unsigned MaskBit = IsLE ? (1u << I) : (1u << (NumEl - 1 - I));
MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
}
@@ -24444,6 +24703,105 @@ static SDValue performSTORECombine(SDNode *N,
return SDValue();
}
+static bool // True iff N is a CONCAT_VECTORS whose operand I is result I of one VECTOR_INTERLEAVE with the same operand count.
+isSequentialConcatOfVectorInterleave(SDNode *N, SmallVectorImpl<SDValue> &Ops) { // On success the interleave's operands are appended to Ops; on failure Ops is left alone.
+ if (N->getOpcode() != ISD::CONCAT_VECTORS)
+ return false;
+
+ unsigned NumParts = N->getNumOperands();
+
+ // We should be concatenating each sequential result from a
+ // VECTOR_INTERLEAVE.
+ SDNode *InterleaveOp = N->getOperand(0).getNode();
+ if (InterleaveOp->getOpcode() != ISD::VECTOR_INTERLEAVE ||
+ InterleaveOp->getNumOperands() != NumParts)
+ return false;
+
+ for (unsigned I = 0; I < NumParts; I++)
+ if (N->getOperand(I) != SDValue(InterleaveOp, I)) // Operand I must be result I of that same node.
+ return false;
+
+ Ops.append(InterleaveOp->op_begin(), InterleaveOp->op_end()); // Appends; any existing contents of Ops are kept.
+ return true;
+}
+
+static SDValue getNarrowMaskForInterleavedOps(SelectionDAG &DAG, SDLoc &DL, // Derives a single-part mask from WideMask, which covers RequiredNumParts interleaved parts.
+ SDValue WideMask, // Handles a concat-of-interleave of identical masks, or a SPLAT_VECTOR; anything else yields an empty SDValue.
+ unsigned RequiredNumParts) {
+ if (WideMask->getOpcode() == ISD::CONCAT_VECTORS) {
+ SmallVector<SDValue, 4> MaskInterleaveOps;
+ if (!isSequentialConcatOfVectorInterleave(WideMask.getNode(),
+ MaskInterleaveOps))
+ return SDValue();
+
+ if (MaskInterleaveOps.size() != RequiredNumParts)
+ return SDValue();
+
+ // Make sure the inputs to the vector interleave are identical.
+ if (!llvm::all_equal(MaskInterleaveOps))
+ return SDValue();
+
+ return MaskInterleaveOps[0]; // All inputs are equal, so the first one is returned as the narrow mask.
+ }
+
+ if (WideMask->getOpcode() != ISD::SPLAT_VECTOR)
+ return SDValue();
+
+ ElementCount EC = WideMask.getValueType().getVectorElementCount();
+ assert(EC.isKnownMultipleOf(RequiredNumParts) &&
+ "Expected element count divisible by number of parts");
+ EC = EC.divideCoefficientBy(RequiredNumParts); // Element count of one part.
+ return DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::getVectorVT(MVT::i1, EC), // Splat the same scalar operand into an i1 vector of the reduced count.
+ WideMask->getOperand(0));
+}
+
+static SDValue performInterleavedMaskedStoreCombine( // Replaces a masked store of a 2- or 4-way interleaved value with an aarch64_sve_st2 / aarch64_sve_st4 intrinsic node.
+ SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // Returns an empty SDValue whenever a precondition below fails.
+ if (!DCI.isBeforeLegalize()) // Only attempted before legalization.
+ return SDValue();
+
+ MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
+ SDValue WideValue = MST->getValue();
+
+ // Bail out if the stored value has an unexpected number of uses, since we'll
+ // have to perform manual interleaving and may as well just use normal masked
+ // stores. Also, discard masked stores that are truncating or indexed.
+ if (!WideValue.hasOneUse() || !ISD::isNormalMaskedStore(MST) ||
+ !MST->isSimple() || !MST->getOffset().isUndef())
+ return SDValue();
+
+ SmallVector<SDValue, 4> ValueInterleaveOps;
+ if (!isSequentialConcatOfVectorInterleave(WideValue.getNode(),
+ ValueInterleaveOps))
+ return SDValue();
+
+ unsigned NumParts = ValueInterleaveOps.size();
+ if (NumParts != 2 && NumParts != 4) // Only the two- and four-part forms are mapped to an intrinsic below.
+ return SDValue();
+
+ // At the moment we're unlikely to see a fixed-width vector interleave as
+ // we usually generate shuffles instead.
+ EVT SubVecTy = ValueInterleaveOps[0].getValueType();
+ if (!SubVecTy.isScalableVT() ||
+ SubVecTy.getSizeInBits().getKnownMinValue() != 128 || // Each part must have a known-minimum size of exactly 128 bits.
+ !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue NarrowMask =
+ getNarrowMaskForInterleavedOps(DAG, DL, MST->getMask(), NumParts);
+ if (!NarrowMask) // The store mask could not be reduced to a single per-part mask.
+ return SDValue();
+
+ const Intrinsic::ID IID =
+ NumParts == 2 ? Intrinsic::aarch64_sve_st2 : Intrinsic::aarch64_sve_st4;
+ SmallVector<SDValue, 8> NewStOps; // Filled in order: chain, intrinsic id, the interleave inputs, narrow mask, base pointer.
+ NewStOps.append({MST->getChain(), DAG.getConstant(IID, DL, MVT::i32)});
+ NewStOps.append(ValueInterleaveOps);
+ NewStOps.append({NarrowMask, MST->getBasePtr()});
+ return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, NewStOps);
+}
+
static SDValue performMSTORECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
@@ -24453,6 +24811,9 @@ static SDValue performMSTORECombine(SDNode *N,
SDValue Mask = MST->getMask();
SDLoc DL(N);
+ if (SDValue Res = performInterleavedMaskedStoreCombine(N, DCI, DAG))
+ return Res;
+
// If this is a UZP1 followed by a masked store, fold this into a masked
// truncating store. We can do this even if this is already a masked
// truncstore.
@@ -26523,6 +26884,26 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
}
+ // Sign extend of CSET -> CSETM.
+ if (Opc == AArch64ISD::CSEL &&
+ cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1) {
+ EVT VT = N->getValueType(0);
+ SDValue TVal = Src.getOperand(0);
+ SDValue FVal = Src.getOperand(1);
+
+ // SIGN_EXTEND_INREG (CSEL 0, 1, cc, NZCV), i1 --> CSEL 0, -1, cc, NZCV
+ if (isNullConstant(TVal) && isOneConstant(FVal))
+ return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal,
+ DAG.getAllOnesConstant(DL, VT), Src.getOperand(2),
+ Src.getOperand(3));
+
+ // SIGN_EXTEND_INREG (CSEL 1, 0, cc, NZCV), i1 --> CSEL -1, 0, cc, NZCV
+ if (isOneConstant(TVal) && isNullConstant(FVal))
+ return DAG.getNode(AArch64ISD::CSEL, DL, VT,
+ DAG.getAllOnesConstant(DL, VT), FVal,
+ Src.getOperand(2), Src.getOperand(3));
+ }
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -27020,6 +27401,83 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return NVCAST;
}
+static SDValue performVectorDeinterleaveCombine( // Replaces a 2- or 4-way VECTOR_DEINTERLEAVE fed by one masked load with an aarch64_sve_ld2_sret / ld4_sret intrinsic node.
+ SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // Returns an empty SDValue whenever a precondition below fails.
+ if (!DCI.isBeforeLegalize()) // Only attempted before legalization.
+ return SDValue();
+
+ unsigned NumParts = N->getNumOperands();
+ if (NumParts != 2 && NumParts != 4) // Only the two- and four-part forms are mapped to an intrinsic below.
+ return SDValue();
+
+ EVT SubVecTy = N->getValueType(0);
+
+ // At the moment we're unlikely to see a fixed-width vector deinterleave as
+ // we usually generate shuffles instead.
+ unsigned MinNumElements = SubVecTy.getVectorMinNumElements();
+ if (!SubVecTy.isScalableVector() ||
+ SubVecTy.getSizeInBits().getKnownMinValue() != 128 || // Each part must have a known-minimum size of exactly 128 bits.
+ !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
+ return SDValue();
+
+ // Make sure each input operand is the correct extract_subvector of the same
+ // wider vector.
+ SDValue Op0 = N->getOperand(0);
+ for (unsigned I = 0; I < NumParts; I++) {
+ SDValue OpI = N->getOperand(I);
+ if (OpI->getOpcode() != ISD::EXTRACT_SUBVECTOR || // Checked first, so Op0's operand is only read once Op0 (I == 0) is known to be an extract.
+ OpI->getOperand(0) != Op0->getOperand(0))
+ return SDValue();
+ if (OpI->getConstantOperandVal(1) != (I * MinNumElements)) // Piece I must start at element index I * MinNumElements.
+ return SDValue();
+ }
+
+ // Normal loads are currently already handled by the InterleavedAccessPass so
+ // we don't expect to see them here. Bail out if the masked load has an
+ // unexpected number of uses, since we want to avoid a situation where we have
+ // both deinterleaving loads and normal loads in the same block. Also, discard
+ // masked loads that are extending, indexed, have an unexpected offset or have
+ // an unsupported passthru value until we find a valid use case.
+ auto MaskedLoad = dyn_cast<MaskedLoadSDNode>(Op0->getOperand(0));
+ if (!MaskedLoad || !MaskedLoad->hasNUsesOfValue(NumParts, 0) ||
+ !MaskedLoad->isSimple() || !ISD::isNormalMaskedLoad(MaskedLoad) ||
+ !MaskedLoad->getOffset().isUndef() ||
+ (!MaskedLoad->getPassThru()->isUndef() && // Passthru must be undef or an all-zeros vector.
+ !isZerosVector(MaskedLoad->getPassThru().getNode())))
+ return SDValue();
+
+ // Now prove that the mask is an interleave of identical masks.
+ SDLoc DL(N);
+ SDValue NarrowMask =
+ getNarrowMaskForInterleavedOps(DAG, DL, MaskedLoad->getMask(), NumParts);
+ if (!NarrowMask)
+ return SDValue();
+
+ const Intrinsic::ID IID = NumParts == 2 ? Intrinsic::aarch64_sve_ld2_sret
+ : Intrinsic::aarch64_sve_ld4_sret;
+ SDValue NewLdOps[] = {MaskedLoad->getChain(), // Operand order: chain, intrinsic id, narrow mask, base pointer.
+ DAG.getConstant(IID, DL, MVT::i32), NarrowMask,
+ MaskedLoad->getBasePtr()};
+ SDValue Res;
+ if (NumParts == 2) // Result list is NumParts vectors of SubVecTy followed by the chain.
+ Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+ {SubVecTy, SubVecTy, MVT::Other}, NewLdOps);
+ else
+ Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+ {SubVecTy, SubVecTy, SubVecTy, SubVecTy, MVT::Other},
+ NewLdOps);
+
+ // We can now generate a structured load!
+ SmallVector<SDValue, 4> ResOps(NumParts);
+ for (unsigned Idx = 0; Idx < NumParts; Idx++)
+ ResOps[Idx] = SDValue(Res.getNode(), Idx); // Result Idx of the new node stands in for result Idx of N.
+
+ // Replace uses of the original chain result with the new chain result.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(MaskedLoad, 1),
+ SDValue(Res.getNode(), NumParts)); // The new node's chain is its result number NumParts.
+ return DCI.CombineTo(N, ResOps, false);
+}
+
/// If the operand is a bitwise AND with a constant RHS, and the shift has a
/// constant RHS and is the only use, we can pull it out of the shift, i.e.
///
@@ -27088,6 +27546,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
default:
LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
break;
+ case ISD::VECTOR_DEINTERLEAVE:
+ return performVectorDeinterleaveCombine(N, DCI, DAG);
case ISD::VECREDUCE_AND:
case ISD::VECREDUCE_OR:
case ISD::VECREDUCE_XOR:
@@ -30640,10 +31100,41 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
}
+bool AArch64TargetLowering::canCreateUndefOrPoisonForTargetNode(
+ SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+ bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
+ // The opcodes listed here report false (they create no undef/poison);
+ // anything else defers to TargetLowering. TODO: Add more target nodes.
+ switch (Op.getOpcode()) {
+ case AArch64ISD::MOVI:
+ case AArch64ISD::MOVIedit:
+ case AArch64ISD::MOVImsl:
+ case AArch64ISD::MOVIshift:
+ case AArch64ISD::MVNImsl:
+ case AArch64ISD::MVNIshift:
+ case AArch64ISD::VASHR:
+ case AArch64ISD::VLSHR:
+ case AArch64ISD::VSHL:
+ return false;
+ }
+ return TargetLowering::canCreateUndefOrPoisonForTargetNode(
+ Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
+}
+
bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
return Op.getOpcode() == AArch64ISD::DUP ||
Op.getOpcode() == AArch64ISD::MOVI ||
Op.getOpcode() == AArch64ISD::MOVIshift ||
+ Op.getOpcode() == AArch64ISD::MOVImsl ||
+ Op.getOpcode() == AArch64ISD::MOVIedit ||
+ Op.getOpcode() == AArch64ISD::MVNIshift ||
+ Op.getOpcode() == AArch64ISD::MVNImsl ||
+ // Ignoring fneg(movi(0)), because if it is folded to FPConstant(-0.0),
+ // ISel will select fmov(mov i64 0x8000000000000000), resulting in a
+ // fmov from gpr to fpr, which is more expensive than fneg(movi(0))
+ (Op.getOpcode() == ISD::FNEG &&
+ Op.getOperand(0).getOpcode() == AArch64ISD::MOVIedit &&
+ Op.getOperand(0).getConstantOperandVal(0) == 0) ||
(Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
TargetLowering::isTargetCanonicalConstantNode(Op);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 6c6ae782f779..f5d14905cac6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -233,8 +233,8 @@ public:
ArrayRef<unsigned> Indices, unsigned Factor,
const APInt &GapMask) const override;
bool lowerInterleavedStore(Instruction *Store, Value *Mask,
- ShuffleVectorInst *SVI,
- unsigned Factor) const override;
+ ShuffleVectorInst *SVI, unsigned Factor,
+ const APInt &GapMask) const override;
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
IntrinsicInst *DI) const override;
@@ -575,6 +575,9 @@ private:
bool shouldExpandBuildVectorWithShuffles(EVT, unsigned) const override;
+ SDValue lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
+ SelectionDAG &DAG) const override;
+
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -735,6 +738,7 @@ private:
SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerLOOP_DEPENDENCE_MASK(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
@@ -868,6 +872,12 @@ private:
TargetLoweringOpt &TLO,
unsigned Depth) const override;
+ bool canCreateUndefOrPoisonForTargetNode(SDValue Op,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ bool PoisonOnly, bool ConsiderFlags,
+ unsigned Depth) const override;
+
bool isTargetCanonicalConstantNode(SDValue Op) const override;
// With the exception of data-predicate transitions, no instructions are
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 178dab689739..8958ad129269 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1327,6 +1327,8 @@ def move_vec_shift : Operand<i32> {
let PrintMethod = "printShifter";
let EncoderMethod = "getMoveVecShifterOpValue";
let ParserMatchClass = MoveVecShifterOperand;
+ let OperandType = "OPERAND_SHIFT_MSL";
+ let OperandNamespace = "AArch64";
}
let DiagnosticType = "AddSubSecondSource" in {
@@ -3032,8 +3034,12 @@ class BaseAddSubEReg64<bit isSub, bit setFlags, RegisterClass dstRegtype,
// Aliases for register+register add/subtract.
class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype,
- RegisterClass src1Regtype, RegisterClass src2Regtype,
- int shiftExt>
+ RegisterClass src1Regtype, dag src2>
+ : InstAlias<asm#"\t$dst, $src1, $src2",
+ (inst dstRegtype:$dst, src1Regtype:$src1, src2)>;
+class AddSubRegAlias64<string asm, Instruction inst, RegisterClass dstRegtype,
+ RegisterClass src1Regtype, RegisterClass src2Regtype,
+ int shiftExt>
: InstAlias<asm#"\t$dst, $src1, $src2",
(inst dstRegtype:$dst, src1Regtype:$src1, src2Regtype:$src2,
shiftExt)>;
@@ -3101,22 +3107,22 @@ multiclass AddSub<bit isSub, string mnemonic, string alias,
// Register/register aliases with no shift when SP is not used.
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
- GPR32, GPR32, GPR32, 0>;
+ GPR32, GPR32, (arith_shifted_reg32 GPR32:$src2, 0)>;
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
- GPR64, GPR64, GPR64, 0>;
+ GPR64, GPR64, (arith_shifted_reg64 GPR64:$src2, 0)>;
// Register/register aliases with no shift when either the destination or
// first source register is SP.
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
- GPR32sponly, GPR32sp, GPR32, 16>; // UXTW #0
+ GPR32sponly, GPR32sp,
+ (arith_extended_reg32_i32 GPR32:$src2, 16)>; // UXTW #0
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
- GPR32sp, GPR32sponly, GPR32, 16>; // UXTW #0
- def : AddSubRegAlias<mnemonic,
- !cast<Instruction>(NAME#"Xrx64"),
- GPR64sponly, GPR64sp, GPR64, 24>; // UXTX #0
- def : AddSubRegAlias<mnemonic,
- !cast<Instruction>(NAME#"Xrx64"),
- GPR64sp, GPR64sponly, GPR64, 24>; // UXTX #0
+ GPR32sp, GPR32sponly,
+ (arith_extended_reg32_i32 GPR32:$src2, 16)>; // UXTW #0
+ def : AddSubRegAlias64<mnemonic, !cast<Instruction>(NAME#"Xrx64"),
+ GPR64sponly, GPR64sp, GPR64, 24>; // UXTX #0
+ def : AddSubRegAlias64<mnemonic, !cast<Instruction>(NAME#"Xrx64"),
+ GPR64sp, GPR64sponly, GPR64, 24>; // UXTX #0
}
multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
@@ -3180,15 +3186,19 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>;
def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx")
- WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
+ WZR, GPR32sp:$src1,
+ (arith_extended_reg32_i32 GPR32:$src2, arith_extend:$sh)), 4>;
def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx")
- XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
+ XZR, GPR64sp:$src1,
+ (arith_extended_reg32_i64 GPR32:$src2, arith_extend:$sh)), 4>;
def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64")
XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>;
def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs")
- WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>;
+ WZR, GPR32:$src1,
+ (arith_shifted_reg32 GPR32:$src2, arith_shift32:$sh)), 4>;
def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs")
- XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>;
+ XZR, GPR64:$src1,
+ (arith_shifted_reg64 GPR64:$src2, arith_shift64:$sh)), 4>;
// Support negative immediates, e.g. cmp Rn, -imm -> cmn Rn, imm
def : InstSubst<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
@@ -3198,27 +3208,28 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
// Compare shorthands
def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrs")
- WZR, GPR32:$src1, GPR32:$src2, 0), 5>;
+ WZR, GPR32:$src1, (arith_shifted_reg32 GPR32:$src2, 0)), 5>;
def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrs")
- XZR, GPR64:$src1, GPR64:$src2, 0), 5>;
+ XZR, GPR64:$src1, (arith_shifted_reg64 GPR64:$src2, 0)), 5>;
def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrx")
- WZR, GPR32sponly:$src1, GPR32:$src2, 16), 5>;
+ WZR, GPR32sponly:$src1,
+ (arith_extended_reg32_i32 GPR32:$src2, 16)), 5>;
def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrx64")
XZR, GPR64sponly:$src1, GPR64:$src2, 24), 5>;
// Register/register aliases with no shift when SP is not used.
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
- GPR32, GPR32, GPR32, 0>;
+ GPR32, GPR32, (arith_shifted_reg32 GPR32:$src2, 0)>;
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
- GPR64, GPR64, GPR64, 0>;
+ GPR64, GPR64, (arith_shifted_reg64 GPR64:$src2, 0)>;
// Register/register aliases with no shift when the first source register
// is SP.
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
- GPR32, GPR32sponly, GPR32, 16>; // UXTW #0
- def : AddSubRegAlias<mnemonic,
- !cast<Instruction>(NAME#"Xrx64"),
- GPR64, GPR64sponly, GPR64, 24>; // UXTX #0
+ GPR32, GPR32sponly,
+ (arith_extended_reg32_i32 GPR32:$src2, 16)>; // UXTW #0
+ def : AddSubRegAlias64<mnemonic, !cast<Instruction>(NAME#"Xrx64"),
+ GPR64, GPR64sponly, GPR64, 24>; // UXTX #0
}
class AddSubG<bit isSub, string asm_inst, SDPatternOperator OpNode>
@@ -3403,9 +3414,10 @@ class BaseLogicalSReg<bits<2> opc, bit N, RegisterClass regtype,
}
// Aliases for register+register logical instructions.
-class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype>
+class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype,
+ dag op2>
: InstAlias<asm#"\t$dst, $src1, $src2",
- (inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>;
+ (inst regtype:$dst, regtype:$src1, op2)>;
multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode,
string Alias> {
@@ -3477,10 +3489,10 @@ multiclass LogicalReg<bits<2> opc, bit N, string mnemonic,
let Inst{31} = 1;
}
- def : LogicalRegAlias<mnemonic,
- !cast<Instruction>(NAME#"Wrs"), GPR32>;
- def : LogicalRegAlias<mnemonic,
- !cast<Instruction>(NAME#"Xrs"), GPR64>;
+ def : LogicalRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
+ GPR32, (logical_shifted_reg32 GPR32:$src2, 0)>;
+ def : LogicalRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
+ GPR64, (logical_shifted_reg64 GPR64:$src2, 0)>;
}
// Split from LogicalReg to allow setting NZCV Defs
@@ -3500,10 +3512,10 @@ multiclass LogicalRegS<bits<2> opc, bit N, string mnemonic,
}
} // Defs = [NZCV]
- def : LogicalRegAlias<mnemonic,
- !cast<Instruction>(NAME#"Wrs"), GPR32>;
- def : LogicalRegAlias<mnemonic,
- !cast<Instruction>(NAME#"Xrs"), GPR64>;
+ def : LogicalRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
+ GPR32, (logical_shifted_reg32 GPR32:$src2, 0)>;
+ def : LogicalRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
+ GPR64, (logical_shifted_reg64 GPR64:$src2, 0)>;
}
//---
@@ -3991,9 +4003,10 @@ class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, string asm, dag ins,
let Inst{4-0} = Rt;
}
-class ROInstAlias<string asm, DAGOperand regtype, Instruction INST>
+class ROInstAlias<string asm, DAGOperand regtype, Instruction INST,
+ ro_extend ext>
: InstAlias<asm # "\t$Rt, [$Rn, $Rm]",
- (INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
+ (INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, (ext 0, 0))>;
multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
@@ -4019,7 +4032,7 @@ multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
let Inst{13} = 0b1;
}
- def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend8>;
}
multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
@@ -4044,7 +4057,7 @@ multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
let Inst{13} = 0b1;
}
- def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend8>;
}
class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, string asm, dag ins,
@@ -4091,7 +4104,7 @@ multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
let Inst{13} = 0b1;
}
- def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend16>;
}
multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
@@ -4116,7 +4129,7 @@ multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
let Inst{13} = 0b1;
}
- def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend16>;
}
class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, string asm, dag ins,
@@ -4163,7 +4176,7 @@ multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
let Inst{13} = 0b1;
}
- def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend32>;
}
multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
@@ -4188,7 +4201,7 @@ multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
let Inst{13} = 0b1;
}
- def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend32>;
}
class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, string asm, dag ins,
@@ -4235,7 +4248,7 @@ multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
let Inst{13} = 0b1;
}
- def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend64>;
}
multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
@@ -4260,7 +4273,7 @@ multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
let Inst{13} = 0b1;
}
- def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend64>;
}
class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, string asm, dag ins,
@@ -4307,7 +4320,7 @@ multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
let Inst{13} = 0b1;
}
- def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend128>;
}
multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
@@ -4328,7 +4341,7 @@ multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
let Inst{13} = 0b1;
}
- def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+ def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend128>;
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
@@ -4377,9 +4390,7 @@ multiclass PrefetchRO<bits<2> sz, bit V, bits<2> opc, string asm> {
let Inst{13} = 0b1;
}
- def : InstAlias<"prfm $Rt, [$Rn, $Rm]",
- (!cast<Instruction>(NAME # "roX") prfop:$Rt,
- GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
+ def : ROInstAlias<"prfm", prfop, !cast<Instruction>(NAME # "roX"), ro_Xextend64>;
}
//---
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index db028b4b7677..e56fe90259d5 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -91,8 +91,8 @@ static cl::opt<unsigned> GatherOptSearchLimit(
"machine-combiner gather pattern optimization"));
AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
- : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
- AArch64::CATCHRET),
+ : AArch64GenInstrInfo(STI, AArch64::ADJCALLSTACKDOWN,
+ AArch64::ADJCALLSTACKUP, AArch64::CATCHRET),
RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}
/// GetInstSize - Return the number of bytes of code the specified
@@ -1299,6 +1299,7 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
break;
case AArch64::PTEST_PP:
case AArch64::PTEST_PP_ANY:
+ case AArch64::PTEST_PP_FIRST:
SrcReg = MI.getOperand(0).getReg();
SrcReg2 = MI.getOperand(1).getReg();
if (MI.getOperand(2).getSubReg())
@@ -1691,7 +1692,8 @@ bool AArch64InstrInfo::optimizeCompareInstr(
}
if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
- CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY)
+ CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY ||
+ CmpInstr.getOpcode() == AArch64::PTEST_PP_FIRST)
return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
if (SrcReg2 != 0)
@@ -5075,7 +5077,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
}
- } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
+ } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGPR32()) {
BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
@@ -5202,7 +5204,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addReg(SrcReg, getKillRegState(KillSrc))
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
- } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
+ } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGPR64()) {
BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
@@ -5318,15 +5320,49 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (AArch64::FPR64RegClass.contains(DestReg) &&
AArch64::FPR64RegClass.contains(SrcReg)) {
- BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (Subtarget.hasZeroCycleRegMoveFPR128() &&
+ !Subtarget.hasZeroCycleRegMoveFPR64() &&
+ !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegQ = TRI->getMatchingSuperReg(DestReg, AArch64::dsub,
+ &AArch64::FPR128RegClass);
+ MCRegister SrcRegQ = TRI->getMatchingSuperReg(SrcReg, AArch64::dsub,
+ &AArch64::FPR128RegClass);
+ // This instruction is reading and writing Q registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegQ, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
+ .addReg(SrcRegQ, RegState::Undef)
+ .addReg(SrcRegQ, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::FPR32RegClass.contains(SrcReg)) {
- if (Subtarget.hasZeroCycleRegMoveFPR64() &&
- !Subtarget.hasZeroCycleRegMoveFPR32()) {
+ if (Subtarget.hasZeroCycleRegMoveFPR128() &&
+ !Subtarget.hasZeroCycleRegMoveFPR64() &&
+ !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegQ = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
+ &AArch64::FPR128RegClass);
+ MCRegister SrcRegQ = TRI->getMatchingSuperReg(SrcReg, AArch64::ssub,
+ &AArch64::FPR128RegClass);
+ // This instruction is reading and writing Q registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegQ, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
+ .addReg(SrcRegQ, RegState::Undef)
+ .addReg(SrcRegQ, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
+ !Subtarget.hasZeroCycleRegMoveFPR32()) {
const TargetRegisterInfo *TRI = &getRegisterInfo();
MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
&AArch64::FPR64RegClass);
@@ -5348,8 +5384,24 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (AArch64::FPR16RegClass.contains(DestReg) &&
AArch64::FPR16RegClass.contains(SrcReg)) {
- if (Subtarget.hasZeroCycleRegMoveFPR64() &&
- !Subtarget.hasZeroCycleRegMoveFPR32()) {
+ if (Subtarget.hasZeroCycleRegMoveFPR128() &&
+ !Subtarget.hasZeroCycleRegMoveFPR64() &&
+ !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegQ = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR128RegClass);
+ MCRegister SrcRegQ = TRI->getMatchingSuperReg(SrcReg, AArch64::hsub,
+ &AArch64::FPR128RegClass);
+ // This instruction is reading and writing Q registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegQ, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
+ .addReg(SrcRegQ, RegState::Undef)
+ .addReg(SrcRegQ, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
+ !Subtarget.hasZeroCycleRegMoveFPR32()) {
const TargetRegisterInfo *TRI = &getRegisterInfo();
MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
&AArch64::FPR64RegClass);
@@ -5375,8 +5427,24 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (AArch64::FPR8RegClass.contains(DestReg) &&
AArch64::FPR8RegClass.contains(SrcReg)) {
- if (Subtarget.hasZeroCycleRegMoveFPR64() &&
- !Subtarget.hasZeroCycleRegMoveFPR32()) {
+ if (Subtarget.hasZeroCycleRegMoveFPR128() &&
+ !Subtarget.hasZeroCycleRegMoveFPR64() &&
+ !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegQ = TRI->getMatchingSuperReg(DestReg, AArch64::bsub,
+ &AArch64::FPR128RegClass);
+ MCRegister SrcRegQ = TRI->getMatchingSuperReg(SrcReg, AArch64::bsub,
+ &AArch64::FPR128RegClass);
+ // This instruction is reading and writing Q registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegQ, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
+ .addReg(SrcRegQ, RegState::Undef)
+ .addReg(SrcRegQ, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
+ !Subtarget.hasZeroCycleRegMoveFPR32()) {
const TargetRegisterInfo *TRI = &getRegisterInfo();
MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::bsub,
&AArch64::FPR64RegClass);
@@ -5403,8 +5471,12 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copies between GPR64 and FPR64.
if (AArch64::FPR64RegClass.contains(DestReg) &&
AArch64::GPR64RegClass.contains(SrcReg)) {
- BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (AArch64::XZR == SrcReg) {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVD0), DestReg);
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
if (AArch64::GPR64RegClass.contains(DestReg) &&
@@ -5416,8 +5488,12 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copies between GPR32 and FPR32.
if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::GPR32RegClass.contains(SrcReg)) {
- BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (AArch64::WZR == SrcReg) {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVS0), DestReg);
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
if (AArch64::GPR32RegClass.contains(DestReg) &&
@@ -6652,7 +6728,7 @@ static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
if (MO.isReg() && MO.getReg().isVirtual())
MI = MRI.getUniqueVRegDef(MO.getReg());
// And it needs to be in the trace (otherwise, it won't have a depth).
- if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
+ if (!MI || MI->getParent() != &MBB || MI->getOpcode() != CombineOpc)
return false;
// Must only used by the user we combine with.
if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 89f88776d832..f0020a9a3c91 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -989,6 +989,17 @@ def AArch64fcvtxnv: PatFrags<(ops node:$Rn),
[(int_aarch64_neon_fcvtxn node:$Rn),
(AArch64fcvtxn_n node:$Rn)]>;
+def AArch64fcvtzs_half : SDNode<"AArch64ISD::FCVTZS_HALF", SDTFPExtendOp>;
+def AArch64fcvtzu_half : SDNode<"AArch64ISD::FCVTZU_HALF", SDTFPExtendOp>;
+def AArch64fcvtas_half : SDNode<"AArch64ISD::FCVTAS_HALF", SDTFPExtendOp>;
+def AArch64fcvtau_half : SDNode<"AArch64ISD::FCVTAU_HALF", SDTFPExtendOp>;
+def AArch64fcvtms_half : SDNode<"AArch64ISD::FCVTMS_HALF", SDTFPExtendOp>;
+def AArch64fcvtmu_half : SDNode<"AArch64ISD::FCVTMU_HALF", SDTFPExtendOp>;
+def AArch64fcvtns_half : SDNode<"AArch64ISD::FCVTNS_HALF", SDTFPExtendOp>;
+def AArch64fcvtnu_half : SDNode<"AArch64ISD::FCVTNU_HALF", SDTFPExtendOp>;
+def AArch64fcvtps_half : SDNode<"AArch64ISD::FCVTPS_HALF", SDTFPExtendOp>;
+def AArch64fcvtpu_half : SDNode<"AArch64ISD::FCVTPU_HALF", SDTFPExtendOp>;
+
//def Aarch64softf32tobf16v8: SDNode<"AArch64ISD::", SDTFPRoundOp>;
// Vector immediate ops
@@ -2155,7 +2166,7 @@ let Predicates = [HasPAuth] in {
i64imm:$Disc, GPR64:$AddrDisc),
[], "$AuthVal = $Val">, Sched<[WriteI, ReadI]> {
let isCodeGenOnly = 1;
- let hasSideEffects = 0;
+ let hasSideEffects = 1;
let mayStore = 0;
let mayLoad = 0;
let Size = 32;
@@ -2660,13 +2671,17 @@ defm ADD : AddSub<0, "add", "sub", add>;
defm SUB : AddSub<1, "sub", "add">;
def : InstAlias<"mov $dst, $src",
- (ADDWri GPR32sponly:$dst, GPR32sp:$src, 0, 0)>;
+ (ADDWri GPR32sponly:$dst, GPR32sp:$src,
+ (addsub_shifted_imm32 0, 0))>;
def : InstAlias<"mov $dst, $src",
- (ADDWri GPR32sp:$dst, GPR32sponly:$src, 0, 0)>;
+ (ADDWri GPR32sp:$dst, GPR32sponly:$src,
+ (addsub_shifted_imm32 0, 0))>;
def : InstAlias<"mov $dst, $src",
- (ADDXri GPR64sponly:$dst, GPR64sp:$src, 0, 0)>;
+ (ADDXri GPR64sponly:$dst, GPR64sp:$src,
+ (addsub_shifted_imm64 0, 0))>;
def : InstAlias<"mov $dst, $src",
- (ADDXri GPR64sp:$dst, GPR64sponly:$src, 0, 0)>;
+ (ADDXri GPR64sp:$dst, GPR64sponly:$src,
+ (addsub_shifted_imm64 0, 0))>;
defm ADDS : AddSubS<0, "adds", AArch64add_flag, "cmn", "subs", "cmp">;
defm SUBS : AddSubS<1, "subs", AArch64sub_flag, "cmp", "adds", "cmn">;
@@ -2726,19 +2741,31 @@ def : Pat<(AArch64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
(ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
}
-def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>;
-def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>;
+def : InstAlias<"neg $dst, $src",
+ (SUBWrs GPR32:$dst, WZR,
+ (arith_shifted_reg32 GPR32:$src, 0)), 3>;
+def : InstAlias<"neg $dst, $src",
+ (SUBXrs GPR64:$dst, XZR,
+ (arith_shifted_reg64 GPR64:$src, 0)), 3>;
def : InstAlias<"neg $dst, $src$shift",
- (SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>;
+ (SUBWrs GPR32:$dst, WZR,
+ (arith_shifted_reg32 GPR32:$src, arith_shift32:$shift)), 2>;
def : InstAlias<"neg $dst, $src$shift",
- (SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>;
-
-def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>;
-def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>;
+ (SUBXrs GPR64:$dst, XZR,
+ (arith_shifted_reg64 GPR64:$src, arith_shift64:$shift)), 2>;
+
+def : InstAlias<"negs $dst, $src",
+ (SUBSWrs GPR32:$dst, WZR,
+ (arith_shifted_reg32 GPR32:$src, 0)), 3>;
+def : InstAlias<"negs $dst, $src",
+ (SUBSXrs GPR64:$dst, XZR,
+ (arith_shifted_reg64 GPR64:$src, 0)), 3>;
def : InstAlias<"negs $dst, $src$shift",
- (SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>;
+ (SUBSWrs GPR32:$dst, WZR,
+ (arith_shifted_reg32 GPR32:$src, arith_shift32:$shift)), 2>;
def : InstAlias<"negs $dst, $src$shift",
- (SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>;
+ (SUBSXrs GPR64:$dst, XZR,
+ (arith_shifted_reg64 GPR64:$src, arith_shift64:$shift)), 2>;
// Unsigned/Signed divide
@@ -3165,16 +3192,26 @@ defm ORN : LogicalReg<0b01, 1, "orn",
BinOpFrag<(or node:$LHS, (not node:$RHS))>>;
defm ORR : LogicalReg<0b01, 0, "orr", or>;
-def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0), 2>;
-def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0), 2>;
-
-def : InstAlias<"mvn $Wd, $Wm", (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0), 3>;
-def : InstAlias<"mvn $Xd, $Xm", (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0), 3>;
+def : InstAlias<"mov $dst, $src",
+ (ORRWrs GPR32:$dst, WZR,
+ (logical_shifted_reg32 GPR32:$src, 0)), 2>;
+def : InstAlias<"mov $dst, $src",
+ (ORRXrs GPR64:$dst, XZR,
+ (logical_shifted_reg64 GPR64:$src, 0)), 2>;
+
+def : InstAlias<"mvn $Wd, $Wm",
+ (ORNWrs GPR32:$Wd, WZR,
+ (logical_shifted_reg32 GPR32:$Wm, 0)), 3>;
+def : InstAlias<"mvn $Xd, $Xm",
+ (ORNXrs GPR64:$Xd, XZR,
+ (logical_shifted_reg64 GPR64:$Xm, 0)), 3>;
def : InstAlias<"mvn $Wd, $Wm$sh",
- (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, logical_shift32:$sh), 2>;
+ (ORNWrs GPR32:$Wd, WZR,
+ (logical_shifted_reg32 GPR32:$Wm, logical_shift32:$sh)), 2>;
def : InstAlias<"mvn $Xd, $Xm$sh",
- (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, logical_shift64:$sh), 2>;
+ (ORNXrs GPR64:$Xd, XZR,
+ (logical_shifted_reg64 GPR64:$Xm, logical_shift64:$sh)), 2>;
def : InstAlias<"tst $src1, $src2",
(ANDSWri WZR, GPR32:$src1, logical_imm32:$src2), 2>;
@@ -3182,14 +3219,18 @@ def : InstAlias<"tst $src1, $src2",
(ANDSXri XZR, GPR64:$src1, logical_imm64:$src2), 2>;
def : InstAlias<"tst $src1, $src2",
- (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0), 3>;
+ (ANDSWrs WZR, GPR32:$src1,
+ (logical_shifted_reg32 GPR32:$src2, 0)), 3>;
def : InstAlias<"tst $src1, $src2",
- (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0), 3>;
+ (ANDSXrs XZR, GPR64:$src1,
+ (logical_shifted_reg64 GPR64:$src2, 0)), 3>;
def : InstAlias<"tst $src1, $src2$sh",
- (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift32:$sh), 2>;
+ (ANDSWrs WZR, GPR32:$src1,
+ (logical_shifted_reg32 GPR32:$src2, logical_shift32:$sh)), 2>;
def : InstAlias<"tst $src1, $src2$sh",
- (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift64:$sh), 2>;
+ (ANDSXrs XZR, GPR64:$src1,
+ (logical_shifted_reg64 GPR64:$src2, logical_shift64:$sh)), 2>;
def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>;
@@ -4710,6 +4751,26 @@ let Predicates = [IsLE] in {
(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
}
+// truncstorei32 of f64 bitcasted to i64
+def : Pat<(truncstorei32 (i64 (bitconvert (f64 FPR64:$Rt))), (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)),
+ (STRSui (EXTRACT_SUBREG FPR64:$Rt, ssub), GPR64sp:$Rn, uimm12s4:$offset)>;
+
+// truncstorei16 of f64 bitcasted to i64
+def : Pat<(truncstorei16 (i64 (bitconvert (f64 FPR64:$Rt))), (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)),
+ (STRHui (f16 (EXTRACT_SUBREG FPR64:$Rt, hsub)), GPR64sp:$Rn, uimm12s2:$offset)>;
+
+ // truncstorei16 of f32 bitcasted to i32
+def : Pat<(truncstorei16 (i32 (bitconvert (f32 FPR32:$Rt))), (am_indexed16 GPR64sp:$Rn, uimm12s2:$off)),
+ (STRHui (f16 (EXTRACT_SUBREG FPR32:$Rt, hsub)), GPR64sp:$Rn, uimm12s2:$off)>;
+
+ // truncstorei8 of f64 bitcasted to i64
+def : Pat<(truncstorei8 (i64 (bitconvert (f64 FPR64:$Rt))), (am_indexed8 GPR64sp:$Rn, uimm12s1:$off)),
+ (STRBui (aarch64mfp8 (EXTRACT_SUBREG FPR64:$Rt, bsub)), GPR64sp:$Rn, uimm12s1:$off)>;
+
+ // truncstorei8 of f32 bitcasted to i32
+def : Pat<(truncstorei8 (i32 (bitconvert (f32 FPR32:$Rt))), (am_indexed8 GPR64sp:$Rn, uimm12s1:$off)),
+ (STRBui (aarch64mfp8 (EXTRACT_SUBREG FPR32:$Rt, bsub)), GPR64sp:$Rn, uimm12s1:$off)>;
+
// truncstore i64
def : Pat<(truncstorei32 GPR64:$Rt,
(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)),
@@ -6536,9 +6597,33 @@ defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar
defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd",
int_aarch64_neon_usqadd>;
+// f16 -> i16 saturating conversions, signed and unsigned (needs full FP16)
+let Predicates = [HasFullFP16] in {
+ def : Pat<(i16(fp_to_sint_sat_gi f16:$Rn)), (FCVTZSv1f16 f16:$Rn)>;
+ def : Pat<(i16(fp_to_uint_sat_gi f16:$Rn)), (FCVTZUv1f16 f16:$Rn)>;
+}
+
def : Pat<(v1i64 (AArch64vashr (v1i64 V64:$Rn), (i32 63))),
(CMLTv1i64rz V64:$Rn)>;
+// f16 -> i16 conversions leave the bit pattern in the hsub part of an f32
+class F16ToI16ScalarPat<SDNode cvt_isd, BaseSIMDTwoScalar instr>
+ : Pat<(f32 (cvt_isd (f16 FPR16:$Rn))),
+ (f32 (SUBREG_TO_REG (i64 0), (instr FPR16:$Rn), hsub))>;
+// One instantiation per fcvt{z,a,m,n,p}{s,u}_half node; all need full FP16.
+let Predicates = [HasFullFP16] in {
+def : F16ToI16ScalarPat<AArch64fcvtzs_half, FCVTZSv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtzu_half, FCVTZUv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtas_half, FCVTASv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtau_half, FCVTAUv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtms_half, FCVTMSv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtmu_half, FCVTMUv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtns_half, FCVTNSv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtnu_half, FCVTNUv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtps_half, FCVTPSv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtpu_half, FCVTPUv1f16>;
+}
+
// Round FP64 to BF16.
let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in
def : Pat<(bf16 (any_fpround (f64 FPR64:$Rn))),
@@ -6641,20 +6726,24 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
// Some float -> int -> float conversion patterns for which we want to keep the
// int values in FP registers using the corresponding NEON instructions to
// avoid more costly int <-> fp register transfers.
+let HasOneUse = 1 in { // fragments match only when the conversion has one use
+def any_fp_to_sint_oneuse: PatFrag<(ops node:$src0), (any_fp_to_sint $src0)>;
+def any_fp_to_uint_oneuse: PatFrag<(ops node:$src0), (any_fp_to_uint $src0)>;
+}
let Predicates = [HasNEONandIsSME2p2StreamingSafe] in {
-def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))),
+def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint_oneuse f64:$Rn)))),
(SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>;
-def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))),
+def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint_oneuse f32:$Rn)))),
(SCVTFv1i32 (i32 (FCVTZSv1i32 f32:$Rn)))>;
-def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint f64:$Rn)))),
+def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint_oneuse f64:$Rn)))),
(UCVTFv1i64 (i64 (FCVTZUv1i64 f64:$Rn)))>;
-def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint f32:$Rn)))),
+def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint_oneuse f32:$Rn)))),
(UCVTFv1i32 (i32 (FCVTZUv1i32 f32:$Rn)))>;
let Predicates = [HasNEONandIsSME2p2StreamingSafe, HasFullFP16] in {
-def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))),
+def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint_oneuse f16:$Rn)))),
(SCVTFv1i16 (f16 (FCVTZSv1f16 f16:$Rn)))>;
-def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
+def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint_oneuse f16:$Rn)))),
(UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>;
}
@@ -8234,6 +8323,29 @@ def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
(AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
}
+// SABA patterns for add(x, abs(y)) -> saba(x, y, 0); 0 comes from MOVIv2d_ns 0
+def : Pat<(v8i8 (add V64:$Vn, (abs V64:$Vm))),
+ (SABAv8i8 V64:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
+def : Pat<(v4i16 (add V64:$Vn, (abs V64:$Vm))),
+ (SABAv4i16 V64:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
+def : Pat<(v2i32 (add V64:$Vn, (abs V64:$Vm))),
+ (SABAv2i32 V64:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
+def : Pat<(v16i8 (add V128:$Vn, (abs V128:$Vm))),
+ (SABAv16i8 V128:$Vn, V128:$Vm, (MOVIv2d_ns (i32 0)))>;
+def : Pat<(v8i16 (add V128:$Vn, (abs V128:$Vm))),
+ (SABAv8i16 V128:$Vn, V128:$Vm, (MOVIv2d_ns (i32 0)))>;
+def : Pat<(v4i32 (add V128:$Vn, (abs V128:$Vm))),
+ (SABAv4i32 V128:$Vn, V128:$Vm, (MOVIv2d_ns (i32 0)))>;
+
+// SABAL patterns for add(x, zext(abs(y))) -> sabal(x, y, 0); 64-bit zero = dsub
+def : Pat<(v8i16 (add V128:$Vn, (zext (abs (v8i8 V64:$Vm))))),
+ (SABALv8i8_v8i16 V128:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
+def : Pat<(v4i32 (add V128:$Vn, (zext (abs (v4i16 V64:$Vm))))),
+ (SABALv4i16_v4i32 V128:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
+def : Pat<(v2i64 (add V128:$Vn, (zext (abs (v2i32 V64:$Vm))))),
+ (SABALv2i32_v2i64 V128:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
+
+
//----------------------------------------------------------------------------
// AdvSIMD indexed element
//----------------------------------------------------------------------------
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 1fde87e65a34..993cff112ba8 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -228,9 +228,6 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
// on function entry to record the initial pstate of a function.
Register PStateSMReg = MCRegister::NoRegister;
- // true if PStateSMReg is used.
- bool PStateSMRegUsed = false;
-
// Has the PNReg used to build PTRUE instruction.
// The PTRUE is used for the LD/ST of ZReg pairs in save and restore.
unsigned PredicateRegForFillSpill = 0;
@@ -238,6 +235,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
// Holds the SME function attributes (streaming mode, ZA/ZT0 state).
SMEAttrs SMEFnAttrs;
+ // Holds the TPIDR2 block if allocated early (for Windows/stack probes
+ // support).
+ Register EarlyAllocSMESaveBuffer = AArch64::NoRegister;
+
// Note: The following properties are only used for the old SME ABI lowering:
/// The frame-index for the TPIDR2 object used for lazy saves.
TPIDR2Object TPIDR2;
@@ -256,6 +257,14 @@ public:
const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
const override;
+ void setEarlyAllocSMESaveBuffer(Register Ptr) {
+ EarlyAllocSMESaveBuffer = Ptr; // overwrites any previously recorded register
+ }
+
+ Register getEarlyAllocSMESaveBuffer() const {
+ return EarlyAllocSMESaveBuffer; // AArch64::NoRegister unless a setter ran
+ }
+
// Old SME ABI lowering state getters/setters:
Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; };
void setSMESaveBufferAddr(Register Reg) { SMESaveBufferAddr = Reg; };
@@ -273,9 +282,6 @@ public:
Register getPStateSMReg() const { return PStateSMReg; };
void setPStateSMReg(Register Reg) { PStateSMReg = Reg; };
- unsigned isPStateSMRegUsed() const { return PStateSMRegUsed; };
- void setPStateSMRegUsed(bool Used = true) { PStateSMRegUsed = Used; };
-
bool isSVECC() const { return IsSVECC; };
void setIsSVECC(bool s) { IsSVECC = s; };
diff --git a/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp b/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
index ff7a0d1faedf..f4a7f774d477 100644
--- a/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -237,8 +237,8 @@ static bool isAddressLdStPair(const MachineInstr *FirstMI,
}
/// Compare and conditional select.
-static bool isCCSelectPair(const MachineInstr *FirstMI,
- const MachineInstr &SecondMI) {
+static bool isCmpCSelPair(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
// 32 bits
if (SecondMI.getOpcode() == AArch64::CSELWr) {
// Assume the 1st instr to be a wildcard if it is unspecified.
@@ -279,6 +279,40 @@ static bool isCCSelectPair(const MachineInstr *FirstMI,
return false;
}
+/// Compare and cset: SecondMI is CSINC with both sources the zero register.
+static bool isCmpCSetPair(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ if ((SecondMI.getOpcode() == AArch64::CSINCWr &&
+ SecondMI.getOperand(1).getReg() == AArch64::WZR &&
+ SecondMI.getOperand(2).getReg() == AArch64::WZR) ||
+ (SecondMI.getOpcode() == AArch64::CSINCXr &&
+ SecondMI.getOperand(1).getReg() == AArch64::XZR &&
+ SecondMI.getOperand(2).getReg() == AArch64::XZR)) {
+ // Assume the 1st instr to be a wildcard if it is unspecified.
+ if (FirstMI == nullptr)
+ return true;
+ // Only a SUBS that writes its result to WZR/XZR is considered below.
+ if (FirstMI->definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
+ FirstMI->definesRegister(AArch64::XZR, /*TRI=*/nullptr))
+ switch (FirstMI->getOpcode()) {
+ case AArch64::SUBSWrs:
+ case AArch64::SUBSXrs:
+ return !AArch64InstrInfo::hasShiftedReg(*FirstMI); // no shift applied
+ case AArch64::SUBSWrx:
+ case AArch64::SUBSXrx:
+ case AArch64::SUBSXrx64:
+ return !AArch64InstrInfo::hasExtendedReg(*FirstMI); // no extend applied
+ case AArch64::SUBSWri:
+ case AArch64::SUBSWrr:
+ case AArch64::SUBSXri:
+ case AArch64::SUBSXrr:
+ return true;
+ }
+ }
+
+ return false;
+}
+
// Arithmetic and logic.
static bool isArithmeticLogicPair(const MachineInstr *FirstMI,
const MachineInstr &SecondMI) {
@@ -465,7 +499,9 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
return true;
if (ST.hasFuseAddress() && isAddressLdStPair(FirstMI, SecondMI))
return true;
- if (ST.hasFuseCCSelect() && isCCSelectPair(FirstMI, SecondMI))
+ if (ST.hasFuseCmpCSel() && isCmpCSelPair(FirstMI, SecondMI))
+ return true;
+ if (ST.hasFuseCmpCSet() && isCmpCSetPair(FirstMI, SecondMI))
return true;
if (ST.hasFuseArithmeticLogic() && isArithmeticLogicPair(FirstMI, SecondMI))
return true;
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 42eaeca906e6..81f5d075729d 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -134,6 +134,8 @@ def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
+ FeatureFuseCmpCSel,
+ FeatureFuseCmpCSet,
FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
@@ -146,6 +148,8 @@ def TuneA78AE : SubtargetFeature<"a78ae", "ARMProcFamily",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
+ FeatureFuseCmpCSel,
+ FeatureFuseCmpCSet,
FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
@@ -158,6 +162,8 @@ def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
+ FeatureFuseCmpCSel,
+ FeatureFuseCmpCSet,
FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
@@ -169,6 +175,8 @@ def TuneA710 : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
+ FeatureFuseCmpCSel,
+ FeatureFuseCmpCSet,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -181,6 +189,8 @@ def TuneA715 : SubtargetFeature<"a715", "ARMProcFamily", "CortexA715",
FeatureCmpBccFusion,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
+ FeatureFuseCmpCSel,
+ FeatureFuseCmpCSet,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -191,6 +201,8 @@ def TuneA720 : SubtargetFeature<"a720", "ARMProcFamily", "CortexA720",
FeatureCmpBccFusion,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
+ FeatureFuseCmpCSel,
+ FeatureFuseCmpCSet,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -201,6 +213,8 @@ def TuneA720AE : SubtargetFeature<"a720ae", "ARMProcFamily", "CortexA720",
FeatureCmpBccFusion,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
+ FeatureFuseCmpCSel,
+ FeatureFuseCmpCSet,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -212,6 +226,8 @@ def TuneA725 : SubtargetFeature<"cortex-a725", "ARMProcFamily",
FeatureCmpBccFusion,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
+ FeatureFuseCmpCSel,
+ FeatureFuseCmpCSet,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -262,6 +278,8 @@ def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4",
"Cortex-X4 ARM processors", [
FeatureALULSLFast,
FeatureFuseAdrpAdd,
+ FeatureFuseCmpCSel,
+ FeatureFuseCmpCSet,
FeatureFuseAES,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -273,6 +291,8 @@ def TuneX925 : SubtargetFeature<"cortex-x925", "ARMProcFamily",
"CortexX925", "Cortex-X925 ARM processors",[
FeatureALULSLFast,
FeatureFuseAdrpAdd,
+ FeatureFuseCmpCSel,
+ FeatureFuseCmpCSet,
FeatureFuseAES,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -321,7 +341,11 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
FeatureFuseAES, FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
- FeatureZCZeroing,
+ FeatureZCRegMoveFPR128,
+ FeatureZCZeroingGPR32,
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128,
FeatureZCZeroingFPWorkaround]>;
def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
@@ -334,7 +358,11 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
- FeatureZCZeroing]>;
+ FeatureZCRegMoveFPR128,
+ FeatureZCZeroingGPR32,
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
"Apple A11", [
@@ -346,7 +374,11 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
- FeatureZCZeroing]>;
+ FeatureZCRegMoveFPR128,
+ FeatureZCZeroingGPR32,
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
"Apple A12", [
@@ -358,7 +390,11 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
- FeatureZCZeroing]>;
+ FeatureZCRegMoveFPR128,
+ FeatureZCZeroingGPR32,
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
"Apple A13", [
@@ -370,7 +406,11 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
- FeatureZCZeroing]>;
+ FeatureZCRegMoveFPR128,
+ FeatureZCZeroingGPR32,
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
"Apple A14", [
@@ -382,12 +422,16 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
FeatureFuseAddress,
FeatureFuseAES,
FeatureFuseArithmeticLogic,
- FeatureFuseCCSelect,
+ FeatureFuseCmpCSel,
FeatureFuseCryptoEOR,
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
- FeatureZCZeroing]>;
+ FeatureZCRegMoveFPR128,
+ FeatureZCZeroingGPR32,
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
"Apple A15", [
@@ -399,12 +443,16 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
FeatureFuseAdrpAdd,
FeatureFuseAES,
FeatureFuseArithmeticLogic,
- FeatureFuseCCSelect,
+ FeatureFuseCmpCSel,
FeatureFuseCryptoEOR,
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
- FeatureZCZeroing]>;
+ FeatureZCRegMoveFPR128,
+ FeatureZCZeroingGPR32,
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
"Apple A16", [
@@ -416,12 +464,16 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
FeatureFuseAdrpAdd,
FeatureFuseAES,
FeatureFuseArithmeticLogic,
- FeatureFuseCCSelect,
+ FeatureFuseCmpCSel,
FeatureFuseCryptoEOR,
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
- FeatureZCZeroing]>;
+ FeatureZCRegMoveFPR128,
+ FeatureZCZeroingGPR32,
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
"Apple A17", [
@@ -433,12 +485,16 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
FeatureFuseAdrpAdd,
FeatureFuseAES,
FeatureFuseArithmeticLogic,
- FeatureFuseCCSelect,
+ FeatureFuseCmpCSel,
FeatureFuseCryptoEOR,
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
- FeatureZCZeroing]>;
+ FeatureZCRegMoveFPR128,
+ FeatureZCZeroingGPR32,
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
"Apple M4", [
@@ -450,12 +506,15 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
FeatureFuseAdrpAdd,
FeatureFuseAES,
FeatureFuseArithmeticLogic,
- FeatureFuseCCSelect,
+ FeatureFuseCmpCSel,
FeatureFuseCryptoEOR,
FeatureFuseLiterals,
FeatureZCRegMoveGPR64,
- FeatureZCZeroing
- ]>;
+ FeatureZCRegMoveFPR128,
+ FeatureZCZeroingGPR32,
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
"Samsung Exynos-M3 processors",
@@ -463,7 +522,7 @@ def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
FeatureForce32BitJumpTables,
FeatureFuseAddress,
FeatureFuseAES,
- FeatureFuseCCSelect,
+ FeatureFuseCmpCSel,
FeatureFuseAdrpAdd,
FeatureFuseLiterals,
FeatureStorePairSuppress,
@@ -481,19 +540,21 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
FeatureFuseAddress,
FeatureFuseAES,
FeatureFuseArithmeticLogic,
- FeatureFuseCCSelect,
+ FeatureFuseCmpCSel,
FeatureFuseAdrpAdd,
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureALULSLFast,
FeaturePostRAScheduler,
- FeatureZCZeroing]>;
+ FeatureZCZeroingGPR32,
+ FeatureZCZeroingGPR64]>;
def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
"Qualcomm Kryo processors", [
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
- FeatureZCZeroing,
+ FeatureZCZeroingGPR32,
+ FeatureZCZeroingGPR64,
FeatureALULSLFast,
FeatureStorePairSuppress]>;
@@ -501,7 +562,8 @@ def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
"Qualcomm Falkor processors", [
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
- FeatureZCZeroing,
+ FeatureZCZeroingGPR32,
+ FeatureZCZeroingGPR64,
FeatureStorePairSuppress,
FeatureALULSLFast,
FeatureSlowSTRQro]>;
@@ -526,6 +588,8 @@ def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2
"Neoverse N2 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
+ FeatureFuseCmpCSel,
+ FeatureFuseCmpCSet,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -537,6 +601,8 @@ def TuneNeoverseN3 : SubtargetFeature<"neoversen3", "ARMProcFamily", "NeoverseN3
FeaturePostRAScheduler,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
+ FeatureFuseCmpCSel,
+ FeatureFuseCmpCSet,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -553,6 +619,8 @@ def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1
"Neoverse V1 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
+ FeatureFuseCmpCSel,
+ FeatureFuseCmpCSet,
FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
@@ -565,6 +633,8 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
FeatureFuseAES,
FeatureCmpBccFusion,
FeatureFuseAdrpAdd,
+ FeatureFuseCmpCSel,
+ FeatureFuseCmpCSet,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -578,6 +648,8 @@ def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3
FeatureFuseAES,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
+ FeatureFuseCmpCSel,
+ FeatureFuseCmpCSet,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeatureAvoidLDAPUR,
@@ -588,6 +660,8 @@ def TuneNeoverseV3AE : SubtargetFeature<"neoversev3AE", "ARMProcFamily", "Neover
FeatureFuseAES,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
+ FeatureFuseCmpCSel,
+ FeatureFuseCmpCSet,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeatureAvoidLDAPUR,
@@ -597,7 +671,8 @@ def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
"Qualcomm Saphira processors", [
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
- FeatureZCZeroing,
+ FeatureZCZeroingGPR32,
+ FeatureZCZeroingGPR64,
FeatureStorePairSuppress,
FeatureALULSLFast]>;
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
new file mode 100644
index 000000000000..af424987b8dd
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
@@ -0,0 +1,794 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64PrologueEpilogue.h"
+#include "AArch64FrameLowering.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/CFIInstBuilder.h"
+#include "llvm/MC/MCContext.h"
+
+#define DEBUG_TYPE "frame-info"
+
+STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
+
+namespace llvm {
+
+AArch64PrologueEmitter::AArch64PrologueEmitter(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ const AArch64FrameLowering &AFL)
+ : MF(MF), MBB(MBB), F(MF.getFunction()), MFI(MF.getFrameInfo()),
+ Subtarget(MF.getSubtarget<AArch64Subtarget>()), AFL(AFL),
+ RegInfo(*Subtarget.getRegisterInfo()) {
+ TII = Subtarget.getInstrInfo();
+ AFI = MF.getInfo<AArch64FunctionInfo>();
+ // Cache the per-function / per-block queries made while emitting.
+ EmitCFI = AFI->needsDwarfUnwindInfo(MF);
+ EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
+ HasFP = AFL.hasFP(MF);
+ NeedsWinCFI = AFL.needsWinCFI(MF);
+ IsFunclet = MBB.isEHFuncletEntry();
+ HomPrologEpilog = AFL.homogeneousPrologEpilog(MF);
+ // Without NDEBUG: record live registers for verifyPrologueClobbers().
+#ifndef NDEBUG
+ collectBlockLiveins();
+#endif
+}
+
+#ifndef NDEBUG
+/// Collect live registers from the end of \p MI's parent up to (including) \p
+/// MI in \p LiveRegs. \p TRI is currently unused by this helper.
+static void getLivePhysRegsUpTo(MachineInstr &MI, const TargetRegisterInfo &TRI,
+ LivePhysRegs &LiveRegs) {
+ // Seed with the block live-outs, then step backward over [MI, instr_end()).
+ MachineBasicBlock &MBB = *MI.getParent();
+ LiveRegs.addLiveOuts(MBB);
+ for (const MachineInstr &Instr :
+ reverse(make_range(MI.getIterator(), MBB.instr_end())))
+ LiveRegs.stepBackward(Instr);
+}
+
+void AArch64PrologueEmitter::collectBlockLiveins() {
+ // Collect live registers from the end of MBB up to the first instruction
+ // after the existing frame setup instructions.
+ PrologueEndI = MBB.begin();
+ while (PrologueEndI != MBB.end() &&
+ PrologueEndI->getFlag(MachineInstr::FrameSetup))
+ ++PrologueEndI;
+ // If the whole block is frame setup, LiveRegs is left untouched.
+ if (PrologueEndI != MBB.end()) {
+ getLivePhysRegsUpTo(*PrologueEndI, RegInfo, LiveRegs);
+ // Ignore registers used for stack management for now.
+ LiveRegs.removeReg(AArch64::SP);
+ LiveRegs.removeReg(AArch64::X19);
+ LiveRegs.removeReg(AArch64::FP);
+ LiveRegs.removeReg(AArch64::LR);
+
+ // X0 will be clobbered by a call to __arm_get_current_vg in the prologue.
+ // The call is needed to spill VG when SVE is unavailable; X0 is preserved
+ // around it, so drop X0 from the set checked for clobbers.
+ if (AFL.requiresGetVGCall(MF))
+ LiveRegs.removeReg(AArch64::X0);
+ }
+}
+
+void AArch64PrologueEmitter::verifyPrologueClobbers() const {
+ if (PrologueEndI == MBB.end())
+ return;
+ // Check if any of the newly inserted instructions clobber a live register.
+ for (MachineInstr &MI :
+ make_range(MBB.instr_begin(), PrologueEndI->getIterator())) {
+ for (auto &Op : MI.operands())
+ if (Op.isReg() && Op.isDef())
+ assert(!LiveRegs.contains(Op.getReg()) &&
+ "live register clobbered by inserted prologue instructions");
+ }
+}
+#endif
+
+void AArch64PrologueEmitter::determineLocalsStackSize(
+ uint64_t StackSize, uint64_t PrologueSaveSize) {
+ AFI->setLocalStackSize(StackSize - PrologueSaveSize); // rest is locals
+ CombineSPBump = AFL.shouldCombineCSRLocalStackBump(MF, StackSize);
+}
+
+void AArch64PrologueEmitter::emitPrologue() {
+ const MachineBasicBlock::iterator PrologueBeginI = MBB.begin();
+ const MachineBasicBlock::iterator EndI = MBB.end();
+
+ // At this point, we're going to decide whether or not the function uses a
+ // redzone. In most cases, the function doesn't have a redzone so let's
+ // assume that's false and set it to true in the case that there's a redzone.
+ AFI->setHasRedZone(false);
+
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc DL;
+
+ if (AFI->shouldSignReturnAddress(MF)) {
+ // If pac-ret+leaf is in effect, PAUTH_PROLOGUE pseudo instructions
+ // are inserted by emitPacRetPlusLeafHardening().
+ if (!AFL.shouldSignReturnAddressEverywhere(MF)) {
+ BuildMI(MBB, PrologueBeginI, DL, TII->get(AArch64::PAUTH_PROLOGUE))
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ // AArch64PointerAuth pass will insert SEH_PACSignLR
+ HasWinCFI |= NeedsWinCFI;
+ }
+
+ if (AFI->needsShadowCallStackPrologueEpilogue(MF)) {
+ emitShadowCallStackPrologue(PrologueBeginI, DL);
+ HasWinCFI |= NeedsWinCFI;
+ }
+
+ if (EmitCFI && AFI->isMTETagged())
+ BuildMI(MBB, PrologueBeginI, DL, TII->get(AArch64::EMITMTETAGGED))
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // We signal the presence of a Swift extended frame to external tools by
+ // storing FP with 0b0001 in bits 63:60. In normal userland operation a simple
+ // ORR is sufficient, it is assumed a Swift kernel would initialize the TBI
+ // bits so that is still true.
+ if (HasFP && AFI->hasSwiftAsyncContext())
+ emitSwiftAsyncContextFramePointer(PrologueBeginI, DL);
+
+ // All calls are tail calls in GHC calling conv, and functions have no
+ // prologue/epilogue.
+ if (MF.getFunction().getCallingConv() == CallingConv::GHC)
+ return;
+
+ // Set tagged base pointer to the requested stack slot. Ideally it should
+ // match SP value after prologue.
+ if (std::optional<int> TBPI = AFI->getTaggedBasePointerIndex())
+ AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
+ else
+ AFI->setTaggedBasePointerOffset(MFI.getStackSize());
+
+ // getStackSize() includes all the locals in its size calculation. We don't
+ // include these locals when computing the stack size of a funclet, as they
+ // are allocated in the parent's stack frame and accessed via the frame
+ // pointer from the funclet. We only save the callee saved registers in the
+ // funclet, which are really the callee saved registers of the parent
+ // function, including the funclet.
+ int64_t NumBytes =
+ IsFunclet ? AFL.getWinEHFuncletFrameSize(MF) : MFI.getStackSize();
+ if (!AFI->hasStackFrame() && !AFL.windowsRequiresStackProbe(MF, NumBytes))
+ return emitEmptyStackFramePrologue(NumBytes, PrologueBeginI, DL);
+
+ bool IsWin64 = Subtarget.isCallingConvWin64(F.getCallingConv(), F.isVarArg());
+ unsigned FixedObject = AFL.getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
+
+ // Windows unwind can't represent the required stack adjustments if we have
+ // both SVE callee-saves and dynamic stack allocations, and the frame
+ // pointer is before the SVE spills. The allocation of the frame pointer
+ // must be the last instruction in the prologue so the unwinder can restore
+ // the stack pointer correctly. (And there isn't any unwind opcode for
+ // `addvl sp, x29, -17`.)
+ //
+ // Because of this, we do spills in the opposite order on Windows: first SVE,
+ // then GPRs. The main side-effect of this is that it makes accessing
+ // parameters passed on the stack more expensive.
+ //
+ // We could consider rearranging the spills for simpler cases.
+ bool FPAfterSVECalleeSaves =
+ Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize();
+
+ if (FPAfterSVECalleeSaves && AFI->hasStackHazardSlotIndex())
+ reportFatalUsageError("SME hazard padding is not supported on Windows");
+
+ auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
+ // All of the remaining stack allocations are for locals.
+ determineLocalsStackSize(NumBytes, PrologueSaveSize);
+
+ MachineBasicBlock::iterator FirstGPRSaveI = PrologueBeginI;
+ if (FPAfterSVECalleeSaves) {
+ // If we're doing SVE saves first, we need to immediately allocate space
+ // for fixed objects, then space for the SVE callee saves.
+ //
+ // Windows unwind requires that the scalable size is a multiple of 16;
+ // that's handled when the callee-saved size is computed.
+ auto SaveSize =
+ StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()) +
+ StackOffset::getFixed(FixedObject);
+ AFL.allocateStackSpace(MBB, PrologueBeginI, 0, SaveSize, NeedsWinCFI,
+ &HasWinCFI,
+ /*EmitCFI=*/false, StackOffset{},
+ /*FollowupAllocs=*/true);
+ NumBytes -= FixedObject;
+
+ // Now allocate space for the GPR callee saves.
+ MachineBasicBlock::iterator MBBI = PrologueBeginI;
+ while (MBBI != EndI && AFL.isSVECalleeSave(MBBI))
+ ++MBBI;
+ FirstGPRSaveI = AFL.convertCalleeSaveRestoreToSPPrePostIncDec(
+ MBB, MBBI, DL, TII, -AFI->getCalleeSavedStackSize(), NeedsWinCFI,
+ &HasWinCFI, EmitAsyncCFI);
+ NumBytes -= AFI->getCalleeSavedStackSize();
+ } else if (CombineSPBump) {
+ assert(!AFL.getSVEStackSize(MF) && "Cannot combine SP bump with SVE");
+ emitFrameOffset(MBB, PrologueBeginI, DL, AArch64::SP, AArch64::SP,
+ StackOffset::getFixed(-NumBytes), TII,
+ MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI,
+ EmitAsyncCFI);
+ NumBytes = 0;
+ } else if (HomPrologEpilog) {
+ // Stack has been already adjusted.
+ NumBytes -= PrologueSaveSize;
+ } else if (PrologueSaveSize != 0) {
+ FirstGPRSaveI = AFL.convertCalleeSaveRestoreToSPPrePostIncDec(
+ MBB, PrologueBeginI, DL, TII, -PrologueSaveSize, NeedsWinCFI,
+ &HasWinCFI, EmitAsyncCFI);
+ NumBytes -= PrologueSaveSize;
+ }
+ assert(NumBytes >= 0 && "Negative stack allocation size!?");
+
+ // Move past the saves of the callee-saved registers, fixing up the offsets
+ // and pre-inc if we decided to combine the callee-save and local stack
+ // pointer bump above.
+ auto &TLI = *MF.getSubtarget().getTargetLowering();
+
+ MachineBasicBlock::iterator AfterGPRSavesI = FirstGPRSaveI;
+ while (AfterGPRSavesI != EndI &&
+ AfterGPRSavesI->getFlag(MachineInstr::FrameSetup) &&
+ !AFL.isSVECalleeSave(AfterGPRSavesI)) {
+ if (CombineSPBump &&
+ // Only fix-up frame-setup load/store instructions.
+ (!AFL.requiresSaveVG(MF) || !AFL.isVGInstruction(AfterGPRSavesI, TLI)))
+ AFL.fixupCalleeSaveRestoreStackOffset(
+ *AfterGPRSavesI, AFI->getLocalStackSize(), NeedsWinCFI, &HasWinCFI);
+ ++AfterGPRSavesI;
+ }
+
+ // For funclets the FP belongs to the containing function. Only set up FP if
+ // we actually need to.
+ if (!IsFunclet && HasFP)
+ emitFramePointerSetup(AfterGPRSavesI, DL, FixedObject);
+
+ // Now emit the moves for whatever callee saved regs we have (including FP,
+ // LR if those are saved). Frame instructions for SVE register are emitted
+ // later, after the instruction which actually save SVE regs.
+ if (EmitAsyncCFI)
+ emitCalleeSavedGPRLocations(AfterGPRSavesI);
+
+ // Alignment is required for the parent frame, not the funclet
+ const bool NeedsRealignment =
+ NumBytes && !IsFunclet && RegInfo.hasStackRealignment(MF);
+ const int64_t RealignmentPadding =
+ (NeedsRealignment && MFI.getMaxAlign() > Align(16))
+ ? MFI.getMaxAlign().value() - 16
+ : 0;
+
+ if (AFL.windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding))
+ emitWindowsStackProbe(AfterGPRSavesI, DL, NumBytes, RealignmentPadding);
+
+ StackOffset SVEStackSize = AFL.getSVEStackSize(MF);
+ StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize;
+ MachineBasicBlock::iterator CalleeSavesEnd = AfterGPRSavesI;
+
+ StackOffset CFAOffset =
+ StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes);
+
+ // Process the SVE callee-saves to determine what space needs to be
+ // allocated.
+ MachineBasicBlock::iterator AfterSVESavesI = AfterGPRSavesI;
+ if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
+ LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize
+ << "\n");
+ SVECalleeSavesSize = StackOffset::getScalable(CalleeSavedSize);
+ SVELocalsSize = SVEStackSize - SVECalleeSavesSize;
+ // Find callee save instructions in frame.
+ // Note: With FPAfterSVECalleeSaves the callee saves have already been
+ // allocated.
+ if (!FPAfterSVECalleeSaves) {
+ MachineBasicBlock::iterator CalleeSavesBegin = AfterGPRSavesI;
+ assert(AFL.isSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
+ while (AFL.isSVECalleeSave(AfterSVESavesI) &&
+ AfterSVESavesI != MBB.getFirstTerminator())
+ ++AfterSVESavesI;
+ CalleeSavesEnd = AfterSVESavesI;
+
+ StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes);
+ // Allocate space for the callee saves (if any).
+ AFL.allocateStackSpace(MBB, CalleeSavesBegin, 0, SVECalleeSavesSize,
+ false, nullptr, EmitAsyncCFI && !HasFP, CFAOffset,
+ MFI.hasVarSizedObjects() || LocalsSize);
+ }
+ }
+ CFAOffset += SVECalleeSavesSize;
+
+ if (EmitAsyncCFI)
+ emitCalleeSavedSVELocations(CalleeSavesEnd);
+
+ // Allocate space for the rest of the frame including SVE locals. Align the
+ // stack as necessary.
+ assert(!(AFL.canUseRedZone(MF) && NeedsRealignment) &&
+ "Cannot use redzone with stack realignment");
+ if (!AFL.canUseRedZone(MF)) {
+ // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
+ // the correct value here, as NumBytes also includes padding bytes,
+ // which shouldn't be counted here.
+ AFL.allocateStackSpace(MBB, CalleeSavesEnd, RealignmentPadding,
+ SVELocalsSize + StackOffset::getFixed(NumBytes),
+ NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP,
+ CFAOffset, MFI.hasVarSizedObjects());
+ }
+
+ // If we need a base pointer, set it up here. It's whatever the value of the
+ // stack pointer is at this point. Any variable size objects will be allocated
+ // after this, so we can still use the base pointer to reference locals.
+ //
+ // FIXME: Clarify FrameSetup flags here.
+ // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
+ // needed.
+ // For funclets the BP belongs to the containing function.
+ if (!IsFunclet && RegInfo.hasBasePointer(MF)) {
+ TII->copyPhysReg(MBB, AfterSVESavesI, DL, RegInfo.getBaseRegister(),
+ AArch64::SP, false);
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
+ BuildMI(MBB, AfterSVESavesI, DL, TII->get(AArch64::SEH_Nop))
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ }
+
+ // The very last FrameSetup instruction indicates the end of prologue. Emit a
+ // SEH opcode indicating the prologue end.
+ if (NeedsWinCFI && HasWinCFI) {
+ BuildMI(MBB, AfterSVESavesI, DL, TII->get(AArch64::SEH_PrologEnd))
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // SEH funclets are passed the frame pointer in X1. If the parent
+ // function uses the base register, then the base register is used
+ // directly, and is not retrieved from X1.
+ if (IsFunclet && F.hasPersonalityFn()) {
+ EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
+ if (isAsynchronousEHPersonality(Per)) {
+ BuildMI(MBB, AfterSVESavesI, DL, TII->get(TargetOpcode::COPY),
+ AArch64::FP)
+ .addReg(AArch64::X1)
+ .setMIFlag(MachineInstr::FrameSetup);
+ MBB.addLiveIn(AArch64::X1);
+ }
+ }
+
+ if (EmitCFI && !EmitAsyncCFI) {
+ if (HasFP) {
+ emitDefineCFAWithFP(AfterSVESavesI, FixedObject);
+ } else {
+ StackOffset TotalSize =
+ SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize());
+ CFIInstBuilder CFIBuilder(MBB, AfterSVESavesI, MachineInstr::FrameSetup);
+ CFIBuilder.insertCFIInst(
+ createDefCFA(RegInfo, /*FrameReg=*/AArch64::SP, /*Reg=*/AArch64::SP,
+ TotalSize, /*LastAdjustmentWasScalable=*/false));
+ }
+ emitCalleeSavedGPRLocations(AfterSVESavesI);
+ emitCalleeSavedSVELocations(AfterSVESavesI);
+ }
+}
+
+void AArch64PrologueEmitter::emitShadowCallStackPrologue(
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL) const {
+ // Shadow call stack prolog: str x30, [x18], #8
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::STRXpost))
+ .addReg(AArch64::X18, RegState::Define)
+ .addReg(AArch64::LR)
+ .addReg(AArch64::X18)
+ .addImm(8)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // This instruction also makes x18 live-in to the entry block.
+ MBB.addLiveIn(AArch64::X18);
+
+ if (NeedsWinCFI)
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ if (EmitCFI) {
+ // Emit a CFI instruction that causes 8 to be subtracted from the value of
+ // x18 when unwinding past this frame.
+ static const char CFIInst[] = {
+ dwarf::DW_CFA_val_expression,
+ 18, // register
+ 2, // length
+ static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
+ static_cast<char>(-8) & 0x7f, // addend (sleb128)
+ };
+ CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup)
+ .buildEscape(StringRef(CFIInst, sizeof(CFIInst)));
+ }
+}
+
+void AArch64PrologueEmitter::emitSwiftAsyncContextFramePointer(
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL) const {
+ switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
+ case SwiftAsyncFramePointerMode::DeploymentBased:
+ if (Subtarget.swiftAsyncContextIsDynamicallySet()) {
+ // The special symbol below is absolute and has a *value* that can be
+ // combined with the frame pointer to signal an extended frame.
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::LOADgot), AArch64::X16)
+ .addExternalSymbol("swift_async_extendedFramePointerFlags",
+ AArch64II::MO_GOT);
+ if (NeedsWinCFI) {
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+ .setMIFlags(MachineInstr::FrameSetup);
+ HasWinCFI = true;
+ }
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrs), AArch64::FP)
+ .addUse(AArch64::FP)
+ .addUse(AArch64::X16)
+ .addImm(Subtarget.isTargetILP32() ? 32 : 0);
+ if (NeedsWinCFI) {
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+ .setMIFlags(MachineInstr::FrameSetup);
+ HasWinCFI = true;
+ }
+ break;
+ }
+ [[fallthrough]];
+
+ case SwiftAsyncFramePointerMode::Always:
+ // ORR x29, x29, #0x1000_0000_0000_0000
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP)
+ .addUse(AArch64::FP)
+ .addImm(0x1100)
+ .setMIFlag(MachineInstr::FrameSetup);
+ if (NeedsWinCFI) {
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+ .setMIFlags(MachineInstr::FrameSetup);
+ HasWinCFI = true;
+ }
+ break;
+
+ case SwiftAsyncFramePointerMode::Never:
+ break;
+ }
+}
+
+void AArch64PrologueEmitter::emitEmptyStackFramePrologue(
+ int64_t NumBytes, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL) const {
+ assert(!HasFP && "unexpected function without stack frame but with FP");
+ assert(!AFL.getSVEStackSize(MF) &&
+ "unexpected function without stack frame but with SVE objects");
+ // All of the stack allocation is for locals.
+ AFI->setLocalStackSize(NumBytes);
+ if (!NumBytes) {
+ if (NeedsWinCFI && HasWinCFI) {
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ return;
+ }
+ // REDZONE: If the stack size is less than 128 bytes, we don't need
+ // to actually allocate.
+ if (AFL.canUseRedZone(MF)) {
+ AFI->setHasRedZone(true);
+ ++NumRedZoneFunctions;
+ } else {
+ emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+ StackOffset::getFixed(-NumBytes), TII,
+ MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
+ if (EmitCFI) {
+ // Label used to tie together the PROLOG_LABEL and the MachineMoves.
+ MCSymbol *FrameLabel = MF.getContext().createTempSymbol();
+ // Encode the stack size of the leaf function.
+ CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup)
+ .buildDefCFAOffset(NumBytes, FrameLabel);
+ }
+ }
+
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+}
+
+void AArch64PrologueEmitter::emitFramePointerSetup(
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ unsigned FixedObject) {
+ int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset();
+ if (CombineSPBump)
+ FPOffset += AFI->getLocalStackSize();
+
+ if (AFI->hasSwiftAsyncContext()) {
+ // Before we update the live FP we have to ensure there's a valid (or
+ // null) asynchronous context in its slot just before FP in the frame
+ // record, so store it now.
+ const auto &Attrs = MF.getFunction().getAttributes();
+ bool HaveInitialContext = Attrs.hasAttrSomewhere(Attribute::SwiftAsync);
+ if (HaveInitialContext)
+ MBB.addLiveIn(AArch64::X22);
+ Register Reg = HaveInitialContext ? AArch64::X22 : AArch64::XZR;
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::StoreSwiftAsyncContext))
+ .addUse(Reg)
+ .addUse(AArch64::SP)
+ .addImm(FPOffset - 8)
+ .setMIFlags(MachineInstr::FrameSetup);
+ if (NeedsWinCFI) {
+ // WinCFI and arm64e, where StoreSwiftAsyncContext is expanded
+ // to multiple instructions, should be mutually-exclusive.
+ assert(Subtarget.getTargetTriple().getArchName() != "arm64e");
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+ .setMIFlags(MachineInstr::FrameSetup);
+ HasWinCFI = true;
+ }
+ }
+
+ if (HomPrologEpilog) {
+ auto Prolog = MBBI;
+ --Prolog;
+ assert(Prolog->getOpcode() == AArch64::HOM_Prolog);
+ Prolog->addOperand(MachineOperand::CreateImm(FPOffset));
+ } else {
+ // Issue add fp, sp, FPOffset or
+ // mov fp,sp when FPOffset is zero.
+ // Note: All stores of callee-saved registers are marked as "FrameSetup".
+ // This code marks the instruction(s) that set the FP also.
+ emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
+ StackOffset::getFixed(FPOffset), TII,
+ MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
+ if (NeedsWinCFI && HasWinCFI) {
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
+ .setMIFlag(MachineInstr::FrameSetup);
+ // After setting up the FP, the rest of the prolog doesn't need to be
+ // included in the SEH unwind info.
+ NeedsWinCFI = false;
+ }
+ }
+ if (EmitAsyncCFI)
+ emitDefineCFAWithFP(MBBI, FixedObject);
+}
+
+// Define the current CFA rule to use the provided FP.
+void AArch64PrologueEmitter::emitDefineCFAWithFP(
+ MachineBasicBlock::iterator MBBI, unsigned FixedObject) const {
+ const AArch64RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const int OffsetToFirstCalleeSaveFromFP =
+ AFI->getCalleeSaveBaseToFrameRecordOffset() -
+ AFI->getCalleeSavedStackSize();
+ Register FramePtr = TRI->getFrameRegister(MF);
+ CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup)
+ .buildDefCFA(FramePtr, FixedObject - OffsetToFirstCalleeSaveFromFP);
+}
+
+void AArch64PrologueEmitter::emitWindowsStackProbe(
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int64_t &NumBytes,
+ int64_t RealignmentPadding) const {
+ if (AFI->getSVECalleeSavedStackSize())
+ report_fatal_error("SVE callee saves not yet supported with stack probing");
+
+ // Find an available register to spill the value of X15 to, if X15 is being
+ // used already for nest.
+ unsigned X15Scratch = AArch64::NoRegister;
+ const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
+ if (llvm::any_of(MBB.liveins(),
+ [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
+ return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
+ AArch64::X15, LiveIn.PhysReg);
+ })) {
+ X15Scratch = AFL.findScratchNonCalleeSaveRegister(&MBB, /*HasCall=*/true);
+ assert(X15Scratch != AArch64::NoRegister &&
+ (X15Scratch < AArch64::X15 || X15Scratch > AArch64::X17));
+#ifndef NDEBUG
+ LiveRegs.removeReg(AArch64::X15); // ignore X15 since we restore it
+#endif
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), X15Scratch)
+ .addReg(AArch64::XZR)
+ .addReg(AArch64::X15, RegState::Undef)
+ .addReg(AArch64::X15, RegState::Implicit)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ uint64_t NumWords = (NumBytes + RealignmentPadding) >> 4;
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
+ // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
+ // exceed this amount. We need to move at most 2^24 - 1 into x15.
+ // This is at most two instructions, MOVZ followed by MOVK.
+ // TODO: Fix to use multiple stack alloc unwind codes for stacks
+ // exceeding 256MB in size.
+ if (NumBytes >= (1 << 28))
+ report_fatal_error("Stack size cannot exceed 256MB for stack "
+ "unwinding purposes");
+
+ uint32_t LowNumWords = NumWords & 0xFFFF;
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
+ .addImm(LowNumWords)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+ .setMIFlag(MachineInstr::FrameSetup);
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+ .setMIFlag(MachineInstr::FrameSetup);
+ if ((NumWords & 0xFFFF0000) != 0) {
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
+ .addReg(AArch64::X15)
+ .addImm((NumWords & 0xFFFF0000) >> 16) // High half
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
+ .setMIFlag(MachineInstr::FrameSetup);
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ } else {
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
+ .addImm(NumWords)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
+
+ const char *ChkStk = Subtarget.getChkStkName();
+ switch (MF.getTarget().getCodeModel()) {
+ case CodeModel::Tiny:
+ case CodeModel::Small:
+ case CodeModel::Medium:
+ case CodeModel::Kernel:
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
+ .addExternalSymbol(ChkStk)
+ .addReg(AArch64::X15, RegState::Implicit)
+ .addReg(AArch64::X16,
+ RegState::Implicit | RegState::Define | RegState::Dead)
+ .addReg(AArch64::X17,
+ RegState::Implicit | RegState::Define | RegState::Dead)
+ .addReg(AArch64::NZCV,
+ RegState::Implicit | RegState::Define | RegState::Dead)
+ .setMIFlags(MachineInstr::FrameSetup);
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ break;
+ case CodeModel::Large:
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
+ .addReg(AArch64::X16, RegState::Define)
+ .addExternalSymbol(ChkStk)
+ .addExternalSymbol(ChkStk)
+ .setMIFlags(MachineInstr::FrameSetup);
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
+ .addReg(AArch64::X16, RegState::Kill)
+ .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
+ .addReg(AArch64::X16,
+ RegState::Implicit | RegState::Define | RegState::Dead)
+ .addReg(AArch64::X17,
+ RegState::Implicit | RegState::Define | RegState::Dead)
+ .addReg(AArch64::NZCV,
+ RegState::Implicit | RegState::Define | RegState::Dead)
+ .setMIFlags(MachineInstr::FrameSetup);
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ break;
+ }
+
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
+ .addReg(AArch64::SP, RegState::Kill)
+ .addReg(AArch64::X15, RegState::Kill)
+ .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
+ .setMIFlags(MachineInstr::FrameSetup);
+ if (NeedsWinCFI) {
+ HasWinCFI = true;
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
+ .addImm(NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ NumBytes = 0;
+
+ if (RealignmentPadding > 0) {
+ if (RealignmentPadding >= 4096) {
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm))
+ .addReg(AArch64::X16, RegState::Define)
+ .addImm(RealignmentPadding)
+ .setMIFlags(MachineInstr::FrameSetup);
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXrx64), AArch64::X15)
+ .addReg(AArch64::SP)
+ .addReg(AArch64::X16, RegState::Kill)
+ .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
+ .setMIFlag(MachineInstr::FrameSetup);
+ } else {
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), AArch64::X15)
+ .addReg(AArch64::SP)
+ .addImm(RealignmentPadding)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ uint64_t AndMask = ~(MFI.getMaxAlign().value() - 1);
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
+ .addReg(AArch64::X15, RegState::Kill)
+ .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64));
+ AFI->setStackRealigned(true);
+
+ // No need for SEH instructions here; if we're realigning the stack,
+ // we've set a frame pointer and already finished the SEH prologue.
+ assert(!NeedsWinCFI);
+ }
+ if (X15Scratch != AArch64::NoRegister) {
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), AArch64::X15)
+ .addReg(AArch64::XZR)
+ .addReg(X15Scratch, RegState::Undef)
+ .addReg(X15Scratch, RegState::Implicit)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+}
+
+void AArch64PrologueEmitter::emitCalleeSavedGPRLocations(
+ MachineBasicBlock::iterator MBBI) const {
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+ if (CSI.empty())
+ return;
+
+ CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
+ for (const auto &Info : CSI) {
+ unsigned FrameIdx = Info.getFrameIdx();
+ if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector)
+ continue;
+
+ assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
+ int64_t Offset = MFI.getObjectOffset(FrameIdx) - AFL.getOffsetOfLocalArea();
+ CFIBuilder.buildOffset(Info.getReg(), Offset);
+ }
+}
+
+void AArch64PrologueEmitter::emitCalleeSavedSVELocations(
+ MachineBasicBlock::iterator MBBI) const {
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // Add callee saved registers to move list.
+ const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+ if (CSI.empty())
+ return;
+
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+ AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
+ CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
+
+ std::optional<int64_t> IncomingVGOffsetFromDefCFA;
+ if (AFL.requiresSaveVG(MF)) {
+ auto IncomingVG = *find_if(
+ reverse(CSI), [](auto &Info) { return Info.getReg() == AArch64::VG; });
+ IncomingVGOffsetFromDefCFA = MFI.getObjectOffset(IncomingVG.getFrameIdx()) -
+ AFL.getOffsetOfLocalArea();
+ }
+
+ for (const auto &Info : CSI) {
+ if (MFI.getStackID(Info.getFrameIdx()) != TargetStackID::ScalableVector)
+ continue;
+
+ // Not all unwinders may know about SVE registers, so assume the lowest
+ // common denominator.
+ assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
+ MCRegister Reg = Info.getReg();
+ if (!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
+ continue;
+
+ StackOffset Offset =
+ StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
+ StackOffset::getFixed(AFI.getCalleeSavedStackSize(MFI));
+
+ CFIBuilder.insertCFIInst(
+ createCFAOffset(TRI, Reg, Offset, IncomingVGOffsetFromDefCFA));
+ }
+}
+
+} // namespace llvm
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h
new file mode 100644
index 000000000000..94029ede60c7
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h
@@ -0,0 +1,111 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the declaration of the AArch64PrologueEmitter class,
+/// which is used to emit the prologue on AArch64.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64PROLOGUEEPILOGUE_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64PROLOGUEEPILOGUE_H
+
+#include "AArch64RegisterInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+class AArch64Subtarget;
+class AArch64FunctionInfo;
+class AArch64FrameLowering;
+
+/// A helper class for emitting the prologue. Substantial new functionality
+/// should be factored into a new method. Where possible "emit*" methods should
+/// be const, and any flags that change how the prologue is emitted should be
+/// set in the constructor.
+class AArch64PrologueEmitter {
+public:
+ AArch64PrologueEmitter(MachineFunction &MF, MachineBasicBlock &MBB,
+ const AArch64FrameLowering &AFL);
+
+ /// Emit the prologue.
+ void emitPrologue();
+
+ ~AArch64PrologueEmitter() {
+ MF.setHasWinCFI(HasWinCFI);
+#ifndef NDEBUG
+ verifyPrologueClobbers();
+#endif
+ }
+
+private:
+ void emitShadowCallStackPrologue(MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL) const;
+
+ void emitSwiftAsyncContextFramePointer(MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL) const;
+
+ void emitEmptyStackFramePrologue(int64_t NumBytes,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL) const;
+
+ void emitFramePointerSetup(MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned FixedObject);
+
+ void emitDefineCFAWithFP(MachineBasicBlock::iterator MBBI,
+ unsigned FixedObject) const;
+
+ void emitWindowsStackProbe(MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, int64_t &NumBytes,
+ int64_t RealignmentPadding) const;
+
+ void emitCalleeSavedGPRLocations(MachineBasicBlock::iterator MBBI) const;
+ void emitCalleeSavedSVELocations(MachineBasicBlock::iterator MBBI) const;
+
+ void determineLocalsStackSize(uint64_t StackSize, uint64_t PrologueSaveSize);
+
+ MachineFunction &MF;
+ MachineBasicBlock &MBB;
+
+ const Function &F;
+ const MachineFrameInfo &MFI;
+ const AArch64Subtarget &Subtarget;
+ const AArch64FrameLowering &AFL;
+ const AArch64RegisterInfo &RegInfo;
+
+#ifndef NDEBUG
+ mutable LivePhysRegs LiveRegs{RegInfo};
+ MachineBasicBlock::iterator PrologueEndI;
+
+ void collectBlockLiveins();
+ void verifyPrologueClobbers() const;
+#endif
+
+ // Prologue flags. These generally should not change outside of the
+ // constructor. Two exceptions are "CombineSPBump" which is set in
+ // determineLocalsStackSize, and "NeedsWinCFI" which is set in
+ // emitFramePointerSetup.
+ bool EmitCFI = false;
+ bool EmitAsyncCFI = false;
+ bool HasFP = false;
+ bool IsFunclet = false;
+ bool CombineSPBump = false;
+ bool HomPrologEpilog = false;
+ bool NeedsWinCFI = false;
+
+ // Note: "HasWinCFI" is mutable as it can change in any "emit" function.
+ mutable bool HasWinCFI = false;
+
+ const TargetInstrInfo *TII = nullptr;
+ AArch64FunctionInfo *AFI = nullptr;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index 1a7609bfee8a..431ed6ec34e7 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -983,7 +983,7 @@ class ZPRRegOp <string Suffix, AsmOperandClass C, ElementSizeEnum Size,
// Note: This hardware mode is enabled in AArch64Subtarget::getHwModeSet()
// (without the use of the table-gen'd predicates).
-def SMEWithZPRPredicateSpills : HwMode<"", [Predicate<"false">]>;
+def SMEWithZPRPredicateSpills : HwMode<[Predicate<"false">]>;
def PPRSpillFillRI : RegInfoByHwMode<
[DefaultMode, SMEWithZPRPredicateSpills],
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 125225df1546..601dc34d74b9 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -61,10 +61,10 @@ let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in {
def : Pat<(i64 (AArch64AllocateZABuffer GPR64:$size)),
(AllocateZABuffer $size)>;
-def AArch64InitTPIDR2Obj : SDNode<"AArch64ISD::INIT_TPIDR2OBJ", SDTypeProfile<0, 1,
- [SDTCisInt<0>]>, [SDNPHasChain, SDNPMayStore]>;
+def AArch64InitTPIDR2Obj : SDNode<"AArch64ISD::INIT_TPIDR2OBJ", SDTypeProfile<0, 2,
+ [SDTCisInt<0>, SDTCisInt<1>]>, [SDNPHasChain, SDNPMayStore]>;
let usesCustomInserter = 1 in {
- def InitTPIDR2Obj : Pseudo<(outs), (ins GPR64:$buffer), [(AArch64InitTPIDR2Obj GPR64:$buffer)]>, Sched<[WriteI]> {}
+ def InitTPIDR2Obj : Pseudo<(outs), (ins GPR64:$buffer, GPR64:$save_slices), [(AArch64InitTPIDR2Obj GPR64:$buffer, GPR64:$save_slices)]>, Sched<[WriteI]> {}
}
// Nodes to allocate a save buffer for SME.
@@ -93,6 +93,8 @@ let hasSideEffects = 1, isMeta = 1 in {
def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
}
+def SMEStateAllocPseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
+
def CommitZASavePseudo
: Pseudo<(outs),
(ins GPR64:$tpidr2_el0, i1imm:$zero_za, i64imm:$commit_routine, variable_ops), []>,
@@ -108,6 +110,11 @@ def AArch64_requires_za_save
[SDNPHasChain, SDNPInGlue]>;
def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>;
+def AArch64_sme_state_alloc
+ : SDNode<"AArch64ISD::SME_STATE_ALLOC", SDTypeProfile<0, 0,[]>,
+ [SDNPHasChain]>;
+def : Pat<(AArch64_sme_state_alloc), (SMEStateAllocPseudo)>;
+
//===----------------------------------------------------------------------===//
// Instruction naming conventions.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index eeb47b4d9975..7604ffdc9f64 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -412,6 +412,7 @@ def SDT_AArch64PTest : SDTypeProfile<1, 2, [
]>;
def AArch64ptest : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>;
def AArch64ptest_any : SDNode<"AArch64ISD::PTEST_ANY", SDT_AArch64PTest>;
+def AArch64ptest_first : SDNode<"AArch64ISD::PTEST_FIRST", SDT_AArch64PTest>;
def SDT_AArch64DUP_PRED : SDTypeProfile<1, 3,
[SDTCisVec<0>, SDTCisSameAs<0, 3>, SDTCisVec<1>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0, 1>]>;
@@ -650,7 +651,7 @@ let Predicates = [HasSVE_or_SME, UseExperimentalZeroingPseudos] in {
let Predicates = [HasSVE_or_SME] in {
defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add>;
- defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub>;
+ defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub, add>;
defm SUBR_ZI : sve_int_arith_imm0<0b011, "subr", AArch64subr>;
defm SQADD_ZI : sve_int_arith_imm0_ssat<0b100, "sqadd", saddsat, ssubsat>;
defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat>;
@@ -1071,7 +1072,7 @@ let Predicates = [HasSVE_or_SME] in {
defm BRKB_PPmP : sve_int_break_m<0b101, "brkb", int_aarch64_sve_brkb>;
defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs", null_frag>;
- defm PTEST_PP : sve_int_ptest<0b010000, "ptest", AArch64ptest, AArch64ptest_any>;
+ defm PTEST_PP : sve_int_ptest<0b010000, "ptest", AArch64ptest, AArch64ptest_any, AArch64ptest_first>;
defm PFALSE : sve_int_pfalse<0b000000, "pfalse">;
defm PFIRST : sve_int_pfirst<0b00000, "pfirst", int_aarch64_sve_pfirst>;
defm PNEXT : sve_int_pnext<0b00110, "pnext", int_aarch64_sve_pnext>;
@@ -4141,8 +4142,8 @@ let Predicates = [HasSVE2_or_SME] in {
defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi", int_aarch64_sve_whilehi, get_active_lane_mask>;
// SVE2 pointer conflict compare
- defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", "int_aarch64_sve_whilewr">;
- defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">;
+ defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", loop_dependence_war_mask>;
+ defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", loop_dependence_raw_mask>;
} // End HasSVE2_or_SME
let Predicates = [HasSVEAES, HasNonStreamingSVE_or_SSVE_AES] in {
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 0f4f0129e9cd..98e0a1180510 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -91,6 +91,10 @@ static cl::opt<bool> EnableZPRPredicateSpills(
cl::desc(
"Enables spilling/reloading SVE predicates as data vectors (ZPRs)"));
+static cl::opt<unsigned>
+ VScaleForTuningOpt("sve-vscale-for-tuning", cl::Hidden,
+ cl::desc("Force a vscale for tuning factor for SVE"));
+
// Subreg liveness tracking is disabled by default for now until all issues
// are ironed out. This option allows the feature to be used in tests.
static cl::opt<bool>
@@ -364,6 +368,8 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
if (AArch64MinimumJumpTableEntries.getNumOccurrences() > 0 || !HasMinSize)
MinimumJumpTableEntries = AArch64MinimumJumpTableEntries;
+ if (VScaleForTuningOpt.getNumOccurrences() > 0)
+ VScaleForTuning = VScaleForTuningOpt;
}
AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 01c0bcc3a6a7..671df35cd379 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -246,8 +246,8 @@ public:
/// Return true if the CPU supports any kind of instruction fusion.
bool hasFusion() const {
return hasArithmeticBccFusion() || hasArithmeticCbzFusion() ||
- hasFuseAES() || hasFuseArithmeticLogic() || hasFuseCCSelect() ||
- hasFuseAdrpAdd() || hasFuseLiterals();
+ hasFuseAES() || hasFuseArithmeticLogic() || hasFuseCmpCSel() ||
+ hasFuseCmpCSet() || hasFuseAdrpAdd() || hasFuseLiterals();
}
unsigned getEpilogueVectorizationMinVF() const {
diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
index 1b0e90b0e0dc..65b752ed40c9 100644
--- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -362,7 +362,7 @@ def lookupTSBByName : SearchIndex {
let Key = ["Name"];
}
-def : TSB<"csync", 0>;
+def : TSB<"csync", 2>;
//===----------------------------------------------------------------------===//
// PRFM (prefetch) instruction options.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index e67bd5869ccd..4650b2d0c815 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -589,7 +589,8 @@ void AArch64TargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PB.registerLateLoopOptimizationsEPCallback(
[=](LoopPassManager &LPM, OptimizationLevel Level) {
- LPM.addPass(LoopIdiomVectorizePass());
+ if (Level != OptimizationLevel::O0)
+ LPM.addPass(LoopIdiomVectorizePass());
});
if (getTargetTriple().isOSWindows())
PB.registerPipelineEarlySimplificationEPCallback(
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 490f6391c15a..92321a76dbd8 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -25,6 +25,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/TargetParser/AArch64TargetParser.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <optional>
@@ -4409,6 +4410,32 @@ AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
return 1;
}
+/// Check whether Opcode1 has less throughput according to the scheduling
+/// model than Opcode2.
+bool AArch64TTIImpl::hasKnownLowerThroughputFromSchedulingModel(
+ unsigned Opcode1, unsigned Opcode2) const {
+ const MCSchedModel &Sched = ST->getSchedModel();
+ const TargetInstrInfo *TII = ST->getInstrInfo();
+ if (!Sched.hasInstrSchedModel())
+ return false;
+
+ const MCSchedClassDesc *SCD1 =
+ Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
+ const MCSchedClassDesc *SCD2 =
+ Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
+ // We cannot handle variant scheduling classes without an MI. If we need to
+ // support them for any of the instructions we query the information of we
+ // might need to add a way to resolve them without an MI or not use the
+ // scheduling info.
+ assert(!SCD1->isVariant() && !SCD2->isVariant() &&
+ "Cannot handle variant scheduling classes without an MI");
+ if (!SCD1->isValid() || !SCD2->isValid())
+ return false;
+
+ return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
+ MCSchedModel::getReciprocalThroughput(*ST, *SCD2);
+}
+
InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
@@ -4506,6 +4533,12 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
(VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
Factor = 3; // fcmxx+fcmyy+or
+ if (isa<ScalableVectorType>(ValTy) &&
+ CostKind == TTI::TCK_RecipThroughput &&
+ hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
+ AArch64::FCMEQv4f32))
+ Factor *= 2;
+
return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
}
@@ -4937,6 +4970,23 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
if (!L->getExitBlock())
return;
+ // Check if the loop contains any reductions that could be parallelized when
+ // unrolling. If so, enable partial unrolling, if the trip count is known to be
+ // a multiple of 2.
+ bool HasParellelizableReductions =
+ L->getNumBlocks() == 1 &&
+ any_of(L->getHeader()->phis(),
+ [&SE, L](PHINode &Phi) {
+ return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
+ }) &&
+ isLoopSizeWithinBudget(L, TTI, 12, nullptr);
+ if (HasParellelizableReductions &&
+ SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
+ UP.Partial = true;
+ UP.MaxCount = 4;
+ UP.AddAdditionalAccumulators = true;
+ }
+
const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
(SE.getSmallConstantMaxTripCount(L) > 0 &&
@@ -4952,6 +5002,12 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
// Limit to loops with trip counts that are cheap to expand.
UP.SCEVExpansionBudget = 1;
+ if (HasParellelizableReductions) {
+ UP.Runtime = true;
+ UP.DefaultUnrollRuntimeCount = 4;
+ UP.AddAdditionalAccumulators = true;
+ }
+
// Try to unroll small loops, of few-blocks with low budget, if they have
// load/store dependencies, to expose more parallel memory access streams,
// or if they do little work inside a block (i.e. load -> X -> store pattern).
@@ -5486,13 +5542,14 @@ InstructionCost AArch64TTIImpl::getExtendedReductionCost(
}
InstructionCost
-AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
- VectorType *VecTy,
+AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
+ Type *ResTy, VectorType *VecTy,
TTI::TargetCostKind CostKind) const {
EVT VecVT = TLI->getValueType(DL, VecTy);
EVT ResVT = TLI->getValueType(DL, ResTy);
- if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple()) {
+ if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
+ RedOpcode == Instruction::Add) {
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
// The legal cases with dotprod are
@@ -5503,7 +5560,8 @@ AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
return LT.first + 2;
}
- return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, VecTy, CostKind);
+ return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
+ CostKind);
}
InstructionCost
@@ -5750,11 +5808,14 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
- // A subvector extract can be implemented with an ext (or trivial extract, if
- // from lane 0). This currently only handles low or high extracts to prevent
- // SLP vectorizer regressions.
+ // A subvector extract can be implemented with a NEON/SVE ext (or trivial
+ // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
+ // This currently only handles low or high extracts to prevent SLP vectorizer
+ // regressions.
+ // Note that SVE's ext instruction is destructive, but it can be fused with
+ // a movprfx to act like a constructive instruction.
if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
- if (LT.second.is128BitVector() &&
+ if (LT.second.getFixedSizeInBits() >= 128 &&
cast<FixedVectorType>(SubTp)->getNumElements() ==
LT.second.getVectorNumElements() / 2) {
if (Index == 0)
@@ -6017,9 +6078,15 @@ static bool containsDecreasingPointers(Loop *TheLoop,
return false;
}
-bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost() const {
+bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
return SVEPreferFixedOverScalableIfEqualCost;
+ // For cases like post-LTO vectorization, when we eventually know the trip
+ // count, epilogue with fixed-width vectorization can be deleted if the trip
+ // count is less than the epilogue iterations. That's why we prefer
+ // fixed-width vectorization in epilogue in case of equal costs.
+ if (IsEpilogue)
+ return true;
return ST->useFixedOverScalableIfEqualCost();
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 42ae962b3b42..fe2e849258e3 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -174,6 +174,11 @@ public:
bool prefersVectorizedAddressing() const override;
+ /// Check whether Opcode1 has less throughput according to the scheduling
+ /// model than Opcode2.
+ bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1,
+ unsigned Opcode2) const;
+
InstructionCost
getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
unsigned AddressSpace,
@@ -424,7 +429,7 @@ public:
return TailFoldingStyle::DataWithoutLaneMask;
}
- bool preferFixedOverScalableIfEqualCost() const override;
+ bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override;
unsigned getEpilogueVectorizationMinVF() const override;
@@ -460,7 +465,7 @@ public:
TTI::TargetCostKind CostKind) const override;
InstructionCost getMulAccReductionCost(
- bool IsUnsigned, Type *ResTy, VectorType *Ty,
+ bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const override;
InstructionCost
diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt
index 803943fd57c4..a8185358d6df 100644
--- a/llvm/lib/Target/AArch64/CMakeLists.txt
+++ b/llvm/lib/Target/AArch64/CMakeLists.txt
@@ -7,7 +7,8 @@ tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM AArch64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv)
tablegen(LLVM AArch64GenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler
+ -ignore-non-decodable-operands)
tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel)
tablegen(LLVM AArch64GenGlobalISel.inc -gen-global-isel)
tablegen(LLVM AArch64GenO0PreLegalizeGICombiner.inc -gen-global-isel-combiner
@@ -91,6 +92,7 @@ add_llvm_target(AArch64CodeGen
SVEIntrinsicOpts.cpp
MachineSMEABIPass.cpp
AArch64SIMDInstrOpt.cpp
+ AArch64PrologueEpilogue.cpp
DEPENDS
intrinsics_gen
@@ -107,6 +109,7 @@ add_llvm_target(AArch64CodeGen
Core
GlobalISel
MC
+ Passes
Scalar
SelectionDAG
Support
diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 323db2a0728e..aa1c1c882e22 100644
--- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -35,308 +35,14 @@ using namespace llvm::MCD;
// Pull DecodeStatus and its enum values into the global namespace.
using DecodeStatus = MCDisassembler::DecodeStatus;
-// Forward declare these because the autogenerated code will reference them.
-// Definitions are further down.
-template <unsigned RegClassID, unsigned FirstReg, unsigned NumRegsInClass>
-static DecodeStatus DecodeSimpleRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeGPR64x8ClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
- const MCDisassembler *Decoder);
-template <unsigned Min, unsigned Max>
-static DecodeStatus DecodeZPRMul2_MinMax(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeZK(MCInst &Inst, unsigned RegNo, uint64_t Address,
- const MCDisassembler *Decoder);
-template <unsigned Min, unsigned Max>
-static DecodeStatus DecodeZPR2Mul2RegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const void *Decoder);
-static DecodeStatus DecodeZPR4Mul4RegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const void *Decoder);
-template <unsigned NumBitsForTile>
-static DecodeStatus DecodeMatrixTile(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeMatrixTileListRegisterClass(MCInst &Inst, unsigned RegMask,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodePPR2Mul2RegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const void *Decoder);
-
-static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeFixedPointScaleImm64(MCInst &Inst, unsigned Imm,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodePCRelLabel16(MCInst &Inst, unsigned Imm,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodePCRelLabel9(MCInst &Inst, unsigned Imm,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeMemExtend(MCInst &Inst, unsigned Imm,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeMRSSystemRegister(MCInst &Inst, unsigned Imm,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeMSRSystemRegister(MCInst &Inst, unsigned Imm,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeMoveImmInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeUnsignedLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeAuthLoadInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeLogicalImmInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeModImmInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeModImmTiedInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeAdrInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeUnconditionalBranch(MCInst &Inst, uint32_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeSystemPStateImm0_15Instruction(MCInst &Inst, uint32_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeSystemPStateImm0_1Instruction(MCInst &Inst, uint32_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeTestAndBranch(MCInst &Inst, uint32_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeFMOVLaneInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftR64Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftR64ImmNarrow(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftR32Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftR32ImmNarrow(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftR16Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftR16ImmNarrow(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftR8Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftL64Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftL32Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftL16Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftL8Imm(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeWSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeXSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeSyspXzrInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Addr,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeSVELogicalImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Address,
- const MCDisassembler *Decoder);
template <int Bits>
static DecodeStatus DecodeSImm(MCInst &Inst, uint64_t Imm, uint64_t Address,
const MCDisassembler *Decoder);
-template <int ElementWidth>
-static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm, uint64_t Addr,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeSVEIncDecImm(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeSVCROp(MCInst &Inst, unsigned Imm, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeCPYMemOpInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Addr,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeSETMemOpInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Addr,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodePRFMRegInstruction(MCInst &Inst, uint32_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-#include "AArch64GenDisassemblerTables.inc"
-#include "AArch64GenInstrInfo.inc"
#define Success MCDisassembler::Success
#define Fail MCDisassembler::Fail
#define SoftFail MCDisassembler::SoftFail
-static MCDisassembler *createAArch64Disassembler(const Target &T,
- const MCSubtargetInfo &STI,
- MCContext &Ctx) {
-
- return new AArch64Disassembler(STI, Ctx, T.createMCInstrInfo());
-}
-
-DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
- ArrayRef<uint8_t> Bytes,
- uint64_t Address,
- raw_ostream &CS) const {
- CommentStream = &CS;
-
- Size = 0;
- // We want to read exactly 4 bytes of data.
- if (Bytes.size() < 4)
- return Fail;
- Size = 4;
-
- // Encoded as a small-endian 32-bit word in the stream.
- uint32_t Insn =
- (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0);
-
- const uint8_t *Tables[] = {DecoderTable32, DecoderTableFallback32};
-
- for (const auto *Table : Tables) {
- DecodeStatus Result =
- decodeInstruction(Table, MI, Insn, Address, this, STI);
-
- const MCInstrDesc &Desc = MCII->get(MI.getOpcode());
-
- // For Scalable Matrix Extension (SME) instructions that have an implicit
- // operand for the accumulator (ZA) or implicit immediate zero which isn't
- // encoded, manually insert operand.
- for (unsigned i = 0; i < Desc.getNumOperands(); i++) {
- if (Desc.operands()[i].OperandType == MCOI::OPERAND_REGISTER) {
- switch (Desc.operands()[i].RegClass) {
- default:
- break;
- case AArch64::MPRRegClassID:
- MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZA));
- break;
- case AArch64::MPR8RegClassID:
- MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZAB0));
- break;
- case AArch64::ZTRRegClassID:
- MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZT0));
- break;
- }
- } else if (Desc.operands()[i].OperandType ==
- AArch64::OPERAND_IMPLICIT_IMM_0) {
- MI.insert(MI.begin() + i, MCOperand::createImm(0));
- }
- }
-
- if (MI.getOpcode() == AArch64::LDR_ZA ||
- MI.getOpcode() == AArch64::STR_ZA) {
- // Spill and fill instructions have a single immediate used for both
- // the vector select offset and optional memory offset. Replicate
- // the decoded immediate.
- const MCOperand &Imm4Op = MI.getOperand(2);
- assert(Imm4Op.isImm() && "Unexpected operand type!");
- MI.addOperand(Imm4Op);
- }
-
- if (Result != MCDisassembler::Fail)
- return Result;
- }
-
- return MCDisassembler::Fail;
-}
-
-uint64_t AArch64Disassembler::suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
- uint64_t Address) const {
- // AArch64 instructions are always 4 bytes wide, so there's no point
- // in skipping any smaller number of bytes if an instruction can't
- // be decoded.
- return 4;
-}
-
-static MCSymbolizer *
-createAArch64ExternalSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo,
- LLVMSymbolLookupCallback SymbolLookUp,
- void *DisInfo, MCContext *Ctx,
- std::unique_ptr<MCRelocationInfo> &&RelInfo) {
- return new AArch64ExternalSymbolizer(*Ctx, std::move(RelInfo), GetOpInfo,
- SymbolLookUp, DisInfo);
-}
-
-extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
-LLVMInitializeAArch64Disassembler() {
- TargetRegistry::RegisterMCDisassembler(getTheAArch64leTarget(),
- createAArch64Disassembler);
- TargetRegistry::RegisterMCDisassembler(getTheAArch64beTarget(),
- createAArch64Disassembler);
- TargetRegistry::RegisterMCSymbolizer(getTheAArch64leTarget(),
- createAArch64ExternalSymbolizer);
- TargetRegistry::RegisterMCSymbolizer(getTheAArch64beTarget(),
- createAArch64ExternalSymbolizer);
- TargetRegistry::RegisterMCDisassembler(getTheAArch64_32Target(),
- createAArch64Disassembler);
- TargetRegistry::RegisterMCSymbolizer(getTheAArch64_32Target(),
- createAArch64ExternalSymbolizer);
-
- TargetRegistry::RegisterMCDisassembler(getTheARM64Target(),
- createAArch64Disassembler);
- TargetRegistry::RegisterMCSymbolizer(getTheARM64Target(),
- createAArch64ExternalSymbolizer);
- TargetRegistry::RegisterMCDisassembler(getTheARM64_32Target(),
- createAArch64Disassembler);
- TargetRegistry::RegisterMCSymbolizer(getTheARM64_32Target(),
- createAArch64ExternalSymbolizer);
-}
-
template <unsigned RegClassID, unsigned FirstReg, unsigned NumRegsInClass>
static DecodeStatus DecodeSimpleRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
@@ -492,11 +198,7 @@ static DecodeStatus DecodePCRelLabel16(MCInst &Inst, unsigned Imm,
static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm,
uint64_t Addr,
const MCDisassembler *Decoder) {
- int64_t ImmVal = Imm;
-
- // Sign-extend 19-bit immediate.
- if (ImmVal & (1 << (19 - 1)))
- ImmVal |= ~((1LL << 19) - 1);
+ int64_t ImmVal = SignExtend64<19>(Imm);
if (!Decoder->tryAddingSymbolicOperand(
Inst, ImmVal * 4, Addr, Inst.getOpcode() != AArch64::LDRXl, 0, 0, 4))
@@ -506,11 +208,7 @@ static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm,
static DecodeStatus DecodePCRelLabel9(MCInst &Inst, unsigned Imm, uint64_t Addr,
const MCDisassembler *Decoder) {
- int64_t ImmVal = Imm;
-
- // Sign-extend 9-bit immediate.
- if (ImmVal & (1 << (9 - 1)))
- ImmVal |= ~((1LL << 9) - 1);
+ int64_t ImmVal = SignExtend64<9>(Imm);
if (!Decoder->tryAddingSymbolicOperand(Inst, (ImmVal * 4), Addr,
/*IsBranch=*/true, 0, 0, 4))
@@ -827,12 +525,7 @@ static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn,
const MCDisassembler *Decoder) {
unsigned Rt = fieldFromInstruction(insn, 0, 5);
unsigned Rn = fieldFromInstruction(insn, 5, 5);
- int64_t offset = fieldFromInstruction(insn, 12, 9);
-
- // offset is a 9-bit signed immediate, so sign extend it to
- // fill the unsigned.
- if (offset & (1 << (9 - 1)))
- offset |= ~((1LL << 9) - 1);
+ int64_t offset = SignExtend64<9>(fieldFromInstruction(insn, 12, 9));
// First operand is always the writeback to the address register, if needed.
switch (Inst.getOpcode()) {
@@ -1129,14 +822,9 @@ static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn,
unsigned Rt = fieldFromInstruction(insn, 0, 5);
unsigned Rn = fieldFromInstruction(insn, 5, 5);
unsigned Rt2 = fieldFromInstruction(insn, 10, 5);
- int64_t offset = fieldFromInstruction(insn, 15, 7);
+ int64_t offset = SignExtend64<7>(fieldFromInstruction(insn, 15, 7));
bool IsLoad = fieldFromInstruction(insn, 22, 1);
- // offset is a 7-bit signed immediate, so sign extend it to
- // fill the unsigned.
- if (offset & (1 << (7 - 1)))
- offset |= ~((1LL << 7) - 1);
-
unsigned Opcode = Inst.getOpcode();
bool NeedsDisjointWritebackTransfer = false;
@@ -1505,12 +1193,8 @@ static DecodeStatus DecodeAdrInstruction(MCInst &Inst, uint32_t insn,
uint64_t Addr,
const MCDisassembler *Decoder) {
unsigned Rd = fieldFromInstruction(insn, 0, 5);
- int64_t imm = fieldFromInstruction(insn, 5, 19) << 2;
- imm |= fieldFromInstruction(insn, 29, 2);
-
- // Sign-extend the 21-bit immediate.
- if (imm & (1 << (21 - 1)))
- imm |= ~((1LL << 21) - 1);
+ int64_t imm = SignExtend64<21>((fieldFromInstruction(insn, 5, 19) << 2) |
+ fieldFromInstruction(insn, 29, 2));
DecodeSimpleRegisterClass<AArch64::GPR64RegClassID, 0, 32>(Inst, Rd, Addr,
Decoder);
@@ -1564,11 +1248,7 @@ static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn,
static DecodeStatus DecodeUnconditionalBranch(MCInst &Inst, uint32_t insn,
uint64_t Addr,
const MCDisassembler *Decoder) {
- int64_t imm = fieldFromInstruction(insn, 0, 26);
-
- // Sign-extend the 26-bit immediate.
- if (imm & (1 << (26 - 1)))
- imm |= ~((1LL << 26) - 1);
+ int64_t imm = SignExtend64<26>(fieldFromInstruction(insn, 0, 26));
if (!Decoder->tryAddingSymbolicOperand(Inst, imm * 4, Addr, true, 0, 0, 4))
Inst.addOperand(MCOperand::createImm(imm));
@@ -1631,11 +1311,7 @@ static DecodeStatus DecodeTestAndBranch(MCInst &Inst, uint32_t insn,
uint64_t Rt = fieldFromInstruction(insn, 0, 5);
uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5;
bit |= fieldFromInstruction(insn, 19, 5);
- int64_t dst = fieldFromInstruction(insn, 5, 14);
-
- // Sign-extend 14-bit immediate.
- if (dst & (1 << (14 - 1)))
- dst |= ~((1LL << 14) - 1);
+ int64_t dst = SignExtend64<14>(fieldFromInstruction(insn, 5, 14));
if (fieldFromInstruction(insn, 31, 1) == 0)
DecodeSimpleRegisterClass<AArch64::GPR32RegClassID, 0, 32>(Inst, Rt, Addr,
@@ -1856,3 +1532,129 @@ static DecodeStatus DecodePRFMRegInstruction(MCInst &Inst, uint32_t insn,
return Success;
}
+
+static DecodeStatus
+DecodeSMESpillFillInstruction(MCInst &Inst, uint32_t Bits, uint64_t Addr,
+ const MCDisassembler *Decoder) {
+ unsigned RvBits = fieldFromInstruction(Bits, 13, 2);
+ unsigned RnBits = fieldFromInstruction(Bits, 5, 5);
+ unsigned Imm4Bits = fieldFromInstruction(Bits, 0, 4);
+
+ DecodeSimpleRegisterClass<AArch64::MatrixIndexGPR32_12_15RegClassID, 0, 4>(
+ Inst, RvBits, Addr, Decoder);
+ Inst.addOperand(MCOperand::createImm(Imm4Bits));
+ DecodeSimpleRegisterClass<AArch64::GPR64spRegClassID, 0, 32>(Inst, RnBits,
+ Addr, Decoder);
+ // Spill and fill instructions have a single immediate used for both
+ // the vector select offset and optional memory offset. Replicate
+ // the decoded immediate.
+ Inst.addOperand(MCOperand::createImm(Imm4Bits));
+ return Success;
+}
+
+#include "AArch64GenDisassemblerTables.inc"
+#include "AArch64GenInstrInfo.inc"
+
+static MCDisassembler *createAArch64Disassembler(const Target &T,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+
+ return new AArch64Disassembler(STI, Ctx, T.createMCInstrInfo());
+}
+
+DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &CS) const {
+ CommentStream = &CS;
+
+ Size = 0;
+ // We want to read exactly 4 bytes of data.
+ if (Bytes.size() < 4)
+ return Fail;
+ Size = 4;
+
+ // Encoded as a little-endian 32-bit word in the stream.
+ uint32_t Insn =
+ (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0);
+
+ const uint8_t *Tables[] = {DecoderTable32, DecoderTableFallback32};
+
+ for (const auto *Table : Tables) {
+ DecodeStatus Result =
+ decodeInstruction(Table, MI, Insn, Address, this, STI);
+
+ const MCInstrDesc &Desc = MCII->get(MI.getOpcode());
+
+ // For Scalable Matrix Extension (SME) instructions that have an implicit
+ // operand for the accumulator (ZA) or implicit immediate zero which isn't
+ // encoded, manually insert operand.
+ for (unsigned i = 0; i < Desc.getNumOperands(); i++) {
+ if (Desc.operands()[i].OperandType == MCOI::OPERAND_REGISTER) {
+ switch (Desc.operands()[i].RegClass) {
+ default:
+ break;
+ case AArch64::MPRRegClassID:
+ MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZA));
+ break;
+ case AArch64::MPR8RegClassID:
+ MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZAB0));
+ break;
+ case AArch64::ZTRRegClassID:
+ MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZT0));
+ break;
+ }
+ } else if (Desc.operands()[i].OperandType ==
+ AArch64::OPERAND_IMPLICIT_IMM_0) {
+ MI.insert(MI.begin() + i, MCOperand::createImm(0));
+ }
+ }
+
+ if (Result != MCDisassembler::Fail)
+ return Result;
+ }
+
+ return MCDisassembler::Fail;
+}
+
+uint64_t AArch64Disassembler::suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
+ uint64_t Address) const {
+ // AArch64 instructions are always 4 bytes wide, so there's no point
+ // in skipping any smaller number of bytes if an instruction can't
+ // be decoded.
+ return 4;
+}
+
+static MCSymbolizer *
+createAArch64ExternalSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo,
+ LLVMSymbolLookupCallback SymbolLookUp,
+ void *DisInfo, MCContext *Ctx,
+ std::unique_ptr<MCRelocationInfo> &&RelInfo) {
+ return new AArch64ExternalSymbolizer(*Ctx, std::move(RelInfo), GetOpInfo,
+ SymbolLookUp, DisInfo);
+}
+
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAArch64Disassembler() {
+ TargetRegistry::RegisterMCDisassembler(getTheAArch64leTarget(),
+ createAArch64Disassembler);
+ TargetRegistry::RegisterMCDisassembler(getTheAArch64beTarget(),
+ createAArch64Disassembler);
+ TargetRegistry::RegisterMCSymbolizer(getTheAArch64leTarget(),
+ createAArch64ExternalSymbolizer);
+ TargetRegistry::RegisterMCSymbolizer(getTheAArch64beTarget(),
+ createAArch64ExternalSymbolizer);
+ TargetRegistry::RegisterMCDisassembler(getTheAArch64_32Target(),
+ createAArch64Disassembler);
+ TargetRegistry::RegisterMCSymbolizer(getTheAArch64_32Target(),
+ createAArch64ExternalSymbolizer);
+
+ TargetRegistry::RegisterMCDisassembler(getTheARM64Target(),
+ createAArch64Disassembler);
+ TargetRegistry::RegisterMCSymbolizer(getTheARM64Target(),
+ createAArch64ExternalSymbolizer);
+ TargetRegistry::RegisterMCDisassembler(getTheARM64_32Target(),
+ createAArch64Disassembler);
+ TargetRegistry::RegisterMCSymbolizer(getTheARM64_32Target(),
+ createAArch64ExternalSymbolizer);
+}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 0bceb322726d..5748556d0728 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -6608,45 +6608,6 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
switch (IntrinID) {
default:
break;
- case Intrinsic::aarch64_crypto_sha1h: {
- Register DstReg = I.getOperand(0).getReg();
- Register SrcReg = I.getOperand(2).getReg();
-
- // FIXME: Should this be an assert?
- if (MRI.getType(DstReg).getSizeInBits() != 32 ||
- MRI.getType(SrcReg).getSizeInBits() != 32)
- return false;
-
- // The operation has to happen on FPRs. Set up some new FPR registers for
- // the source and destination if they are on GPRs.
- if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
- SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
- MIB.buildCopy({SrcReg}, {I.getOperand(2)});
-
- // Make sure the copy ends up getting constrained properly.
- RBI.constrainGenericRegister(I.getOperand(2).getReg(),
- AArch64::GPR32RegClass, MRI);
- }
-
- if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
- DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
-
- // Actually insert the instruction.
- auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
- constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);
-
- // Did we create a new register for the destination?
- if (DstReg != I.getOperand(0).getReg()) {
- // Yep. Copy the result of the instruction back into the original
- // destination.
- MIB.buildCopy({I.getOperand(0)}, {DstReg});
- RBI.constrainGenericRegister(I.getOperand(0).getReg(),
- AArch64::GPR32RegClass, MRI);
- }
-
- I.eraseFromParent();
- return true;
- }
case Intrinsic::ptrauth_resign: {
Register DstReg = I.getOperand(0).getReg();
Register ValReg = I.getOperand(2).getReg();
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 210643f6f2f4..ff09b375c310 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -222,7 +222,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.clampNumElements(0, v2s64, v2s64)
.moreElementsToNextPow2(0)
.minScalarSameAs(1, 0)
- .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
+ .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
+ .minScalarEltSameAsIf(isVector(0), 1, 0)
+ .maxScalarEltSameAsIf(isVector(0), 1, 0);
getActionDefinitionsBuilder(G_PTR_ADD)
.legalFor({{p0, s64}, {v2p0, v2s64}})
@@ -879,8 +881,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
{v2s32, v2s32},
{v4s32, v4s32},
{v2s64, v2s64}})
- .legalFor(HasFP16,
- {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
+ .legalFor(
+ HasFP16,
+ {{s16, s16}, {s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
// Handle types larger than i64 by scalarizing/lowering.
.scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
.scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
@@ -1150,7 +1153,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.clampMaxNumElements(1, s32, 4)
.clampMaxNumElements(1, s16, 8)
.clampMaxNumElements(1, s8, 16)
- .clampMaxNumElements(1, p0, 2);
+ .clampMaxNumElements(1, p0, 2)
+ .scalarizeIf(scalarOrEltWiderThan(1, 64), 1);
getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
.legalIf(
@@ -1165,7 +1169,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.clampNumElements(0, v4s16, v8s16)
.clampNumElements(0, v2s32, v4s32)
.clampMaxNumElements(0, s64, 2)
- .clampMaxNumElements(0, p0, 2);
+ .clampMaxNumElements(0, p0, 2)
+ .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
getActionDefinitionsBuilder(G_BUILD_VECTOR)
.legalFor({{v8s8, s8},
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 1b919abd222e..62de86bf87f5 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -482,6 +482,10 @@ static bool isFPIntrinsic(const MachineRegisterInfo &MRI,
case Intrinsic::aarch64_neon_sqrdmulh:
case Intrinsic::aarch64_neon_sqadd:
case Intrinsic::aarch64_neon_sqsub:
+ case Intrinsic::aarch64_crypto_sha1h:
+ case Intrinsic::aarch64_crypto_sha1c:
+ case Intrinsic::aarch64_crypto_sha1p:
+ case Intrinsic::aarch64_crypto_sha1m:
return true;
case Intrinsic::aarch64_neon_saddlv: {
const LLT SrcTy = MRI.getType(MI.getOperand(2).getReg());
@@ -848,10 +852,20 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR};
break;
}
+ case TargetOpcode::G_FPTOSI_SAT:
+ case TargetOpcode::G_FPTOUI_SAT: {
+ LLT DstType = MRI.getType(MI.getOperand(0).getReg());
+ if (DstType.isVector())
+ break;
+ if (DstType == LLT::scalar(16)) {
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
+ break;
+ }
+ OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR};
+ break;
+ }
case TargetOpcode::G_FPTOSI:
case TargetOpcode::G_FPTOUI:
- case TargetOpcode::G_FPTOSI_SAT:
- case TargetOpcode::G_FPTOUI_SAT:
case TargetOpcode::G_INTRINSIC_LRINT:
case TargetOpcode::G_INTRINSIC_LLRINT:
if (MRI.getType(MI.getOperand(0).getReg()).isVector())
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index 54b58e948daf..2552ee300933 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -365,13 +365,6 @@ void AArch64InstPrinter::printInst(const MCInst *MI, uint64_t Address,
return;
}
- // Instruction TSB is specified as a one operand instruction, but 'csync' is
- // not encoded, so for printing it is treated as a special case here:
- if (Opcode == AArch64::TSB) {
- O << "\ttsb\tcsync";
- return;
- }
-
if (!PrintAliases || !printAliasInstr(MI, Address, STI, O))
printInstruction(MI, Address, STI, O);
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 828c5c546240..2b5cf3484ffc 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -53,9 +53,9 @@ const MCAsmInfo::AtSpecifier MachOAtSpecifiers[] = {
{AArch64::S_MACHO_TLVPPAGEOFF, "TLVPPAGEOFF"},
};
-StringRef AArch64::getSpecifierName(const MCSpecifierExpr &Expr) {
+StringRef AArch64::getSpecifierName(AArch64::Specifier S) {
// clang-format off
- switch (static_cast<uint32_t>(Expr.getSpecifier())) {
+ switch (static_cast<uint32_t>(S)) {
case AArch64::S_CALL: return "";
case AArch64::S_LO12: return ":lo12:";
case AArch64::S_ABS_G3: return ":abs_g3:";
@@ -124,7 +124,7 @@ static bool evaluate(const MCSpecifierExpr &Expr, MCValue &Res,
if (!Expr.getSubExpr()->evaluateAsRelocatable(Res, Asm))
return false;
Res.setSpecifier(Expr.getSpecifier());
- return true;
+ return !Res.getSubSym();
}
AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin(bool IsILP32) {
@@ -183,7 +183,7 @@ void AArch64MCAsmInfoDarwin::printSpecifierExpr(
raw_ostream &OS, const MCSpecifierExpr &Expr) const {
if (auto *AE = dyn_cast<AArch64AuthMCExpr>(&Expr))
return AE->print(OS, this);
- OS << AArch64::getSpecifierName(Expr);
+ OS << AArch64::getSpecifierName(Expr.getSpecifier());
printExpr(OS, *Expr.getSubExpr());
}
@@ -232,7 +232,7 @@ void AArch64MCAsmInfoELF::printSpecifierExpr(
raw_ostream &OS, const MCSpecifierExpr &Expr) const {
if (auto *AE = dyn_cast<AArch64AuthMCExpr>(&Expr))
return AE->print(OS, this);
- OS << AArch64::getSpecifierName(Expr);
+ OS << AArch64::getSpecifierName(Expr.getSpecifier());
printExpr(OS, *Expr.getSubExpr());
}
@@ -262,7 +262,7 @@ AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
void AArch64MCAsmInfoMicrosoftCOFF::printSpecifierExpr(
raw_ostream &OS, const MCSpecifierExpr &Expr) const {
- OS << AArch64::getSpecifierName(Expr);
+ OS << AArch64::getSpecifierName(Expr.getSpecifier());
printExpr(OS, *Expr.getSubExpr());
}
@@ -292,7 +292,7 @@ AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
void AArch64MCAsmInfoGNUCOFF::printSpecifierExpr(
raw_ostream &OS, const MCSpecifierExpr &Expr) const {
- OS << AArch64::getSpecifierName(Expr);
+ OS << AArch64::getSpecifierName(Expr.getSpecifier());
printExpr(OS, *Expr.getSubExpr());
}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index c28e925d77e2..0dfa61b1dc60 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -181,7 +181,7 @@ enum {
/// Return the string representation of the ELF relocation specifier
/// (e.g. ":got:", ":lo12:").
-StringRef getSpecifierName(const MCSpecifierExpr &Expr);
+StringRef getSpecifierName(Specifier S);
inline Specifier getSymbolLoc(Specifier S) {
return static_cast<Specifier>(S & AArch64::S_SymLocBits);
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index 91bdc880998b..7774d07a214b 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -65,15 +65,16 @@ void initLLVMToCVRegMapping(MCRegisterInfo *MRI);
bool isHForm(const MCInst &MI, const MCInstrInfo *MCII);
bool isQForm(const MCInst &MI, const MCInstrInfo *MCII);
bool isFpOrNEON(const MCInst &MI, const MCInstrInfo *MCII);
-}
+} // namespace AArch64_MC
namespace AArch64 {
enum OperandType {
OPERAND_IMPLICIT_IMM_0 = MCOI::OPERAND_FIRST_TARGET,
+ OPERAND_SHIFT_MSL,
};
} // namespace AArch64
-} // End llvm namespace
+} // namespace llvm
// Defines symbolic names for AArch64 registers. This defines a mapping from
// register name to register number.
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
index a53b676142a0..5fe999389ce7 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
@@ -73,9 +73,10 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
// Supported
break;
default:
- Ctx.reportError(Fixup.getLoc(), "relocation specifier " +
- AArch64::getSpecifierName(*A64E) +
- " unsupported on COFF targets");
+ Ctx.reportError(Fixup.getLoc(),
+ "relocation specifier " +
+ AArch64::getSpecifierName(A64E->getSpecifier()) +
+ " unsupported on COFF targets");
return COFF::IMAGE_REL_ARM64_ABSOLUTE; // Dummy return value
}
}
@@ -83,9 +84,10 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
switch (FixupKind) {
default: {
if (auto *A64E = dyn_cast<MCSpecifierExpr>(Expr)) {
- Ctx.reportError(Fixup.getLoc(), "relocation specifier " +
- AArch64::getSpecifierName(*A64E) +
- " unsupported on COFF targets");
+ Ctx.reportError(Fixup.getLoc(),
+ "relocation specifier " +
+ AArch64::getSpecifierName(A64E->getSpecifier()) +
+ " unsupported on COFF targets");
} else {
MCFixupKindInfo Info = MAB.getFixupKindInfo(Fixup.getKind());
Ctx.reportError(Fixup.getLoc(), Twine("relocation type ") + Info.Name +
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index b58dfdf32e4a..c39a5cc2fcb1 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
//
// This pass implements the SME ABI requirements for ZA state. This includes
-// implementing the lazy ZA state save schemes around calls.
+// implementing the lazy (and agnostic) ZA state save schemes around calls.
//
//===----------------------------------------------------------------------===//
//
@@ -139,8 +139,8 @@ StringRef getZAStateString(ZAState State) {
#undef MAKE_CASE
}
-static bool isZAorZT0RegOp(const TargetRegisterInfo &TRI,
- const MachineOperand &MO) {
+static bool isZAorZTRegOp(const TargetRegisterInfo &TRI,
+ const MachineOperand &MO) {
if (!MO.isReg() || !MO.getReg().isPhysical())
return false;
return any_of(TRI.subregs_inclusive(MO.getReg()), [](const MCPhysReg &SR) {
@@ -166,7 +166,7 @@ getZAStateBeforeInst(const TargetRegisterInfo &TRI, MachineInstr &MI,
return {ZAOffAtReturn ? ZAState::OFF : ZAState::ACTIVE, InsertPt};
for (auto &MO : MI.operands()) {
- if (isZAorZT0RegOp(TRI, MO))
+ if (isZAorZTRegOp(TRI, MO))
return {ZAState::ACTIVE, InsertPt};
}
@@ -215,9 +215,44 @@ struct MachineSMEABI : public MachineFunctionPass {
void emitZAOff(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
bool ClearTPIDR2);
+ // Emission routines for agnostic ZA functions.
+ void emitSetupFullZASave(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs);
+ // Emit a "full" ZA save or restore. It is "full" in the sense that this
+ // function will emit a call to __arm_sme_save or __arm_sme_restore, which
+ // handles saving and restoring both ZA and ZT0.
+ void emitFullZASaveRestore(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs, bool IsSave);
+ void emitAllocateFullZASaveBuffer(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs);
+
void emitStateChange(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
ZAState From, ZAState To, LiveRegs PhysLiveRegs);
+ // Helpers for switching between lazy/full ZA save/restore routines.
+ void emitZASave(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs) {
+ if (AFI->getSMEFnAttrs().hasAgnosticZAInterface())
+ return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/true);
+ return emitSetupLazySave(MBB, MBBI);
+ }
+ void emitZARestore(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs) {
+ if (AFI->getSMEFnAttrs().hasAgnosticZAInterface())
+ return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/false);
+ return emitRestoreLazySave(MBB, MBBI, PhysLiveRegs);
+ }
+ void emitAllocateZASaveBuffer(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs) {
+ if (AFI->getSMEFnAttrs().hasAgnosticZAInterface())
+ return emitAllocateFullZASaveBuffer(MBB, MBBI, PhysLiveRegs);
+ return emitAllocateLazySaveBuffer(MBB, MBBI);
+ }
+
/// Save live physical registers to virtual registers.
PhysRegSave createPhysRegSave(LiveRegs PhysLiveRegs, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, DebugLoc DL);
@@ -228,6 +263,8 @@ struct MachineSMEABI : public MachineFunctionPass {
/// Get or create a TPIDR2 block in this function.
TPIDR2State getTPIDR2Block();
+ Register getAgnosticZABufferPtr();
+
private:
/// Contains the needed ZA state (and live registers) at an instruction.
struct InstInfo {
@@ -241,6 +278,7 @@ private:
struct BlockInfo {
ZAState FixedEntryState{ZAState::ANY};
SmallVector<InstInfo> Insts;
+ LiveRegs PhysLiveRegsAtEntry = LiveRegs::None;
LiveRegs PhysLiveRegsAtExit = LiveRegs::None;
};
@@ -249,24 +287,29 @@ private:
SmallVector<BlockInfo> Blocks;
SmallVector<ZAState> BundleStates;
std::optional<TPIDR2State> TPIDR2Block;
+ std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
+ Register AgnosticZABufferPtr = AArch64::NoRegister;
+ LiveRegs PhysLiveRegsAfterSMEPrologue = LiveRegs::None;
} State;
MachineFunction *MF = nullptr;
EdgeBundles *Bundles = nullptr;
const AArch64Subtarget *Subtarget = nullptr;
const AArch64RegisterInfo *TRI = nullptr;
+ const AArch64FunctionInfo *AFI = nullptr;
const TargetInstrInfo *TII = nullptr;
MachineRegisterInfo *MRI = nullptr;
};
void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
- assert((SMEFnAttrs.hasZT0State() || SMEFnAttrs.hasZAState()) &&
+ assert((SMEFnAttrs.hasAgnosticZAInterface() || SMEFnAttrs.hasZT0State() ||
+ SMEFnAttrs.hasZAState()) &&
"Expected function to have ZA/ZT0 state!");
State.Blocks.resize(MF->getNumBlockIDs());
for (MachineBasicBlock &MBB : *MF) {
BlockInfo &Block = State.Blocks[MBB.getNumber()];
- if (&MBB == &MF->front()) {
+ if (MBB.isEntryBlock()) {
// Entry block:
Block.FixedEntryState = SMEFnAttrs.hasPrivateZAInterface()
? ZAState::CALLER_DORMANT
@@ -294,10 +337,20 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
Block.PhysLiveRegsAtExit = GetPhysLiveRegs();
auto FirstTerminatorInsertPt = MBB.getFirstTerminator();
+ auto FirstNonPhiInsertPt = MBB.getFirstNonPHI();
for (MachineInstr &MI : reverse(MBB)) {
MachineBasicBlock::iterator MBBI(MI);
LiveUnits.stepBackward(MI);
LiveRegs PhysLiveRegs = GetPhysLiveRegs();
+ // The SMEStateAllocPseudo marker is added to a function if the save
+ // buffer was allocated in SelectionDAG. It marks the end of the
+ // allocation -- which is a safe point for this pass to insert any TPIDR2
+ // block setup.
+ if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) {
+ State.AfterSMEProloguePt = MBBI;
+ State.PhysLiveRegsAfterSMEPrologue = PhysLiveRegs;
+ }
+ // Note: We treat Agnostic ZA as inout_za with an alternate save/restore.
auto [NeededState, InsertPt] = getZAStateBeforeInst(
*TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface());
assert((InsertPt == MBBI ||
@@ -306,6 +359,8 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
// TODO: Do something to avoid state changes where NZCV is live.
if (MBBI == FirstTerminatorInsertPt)
Block.PhysLiveRegsAtExit = PhysLiveRegs;
+ if (MBBI == FirstNonPhiInsertPt)
+ Block.PhysLiveRegsAtEntry = PhysLiveRegs;
if (NeededState != ZAState::ANY)
Block.Insts.push_back({NeededState, InsertPt, PhysLiveRegs});
}
@@ -529,23 +584,25 @@ void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB,
void MachineSMEABI::emitAllocateLazySaveBuffer(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
MachineFrameInfo &MFI = MF->getFrameInfo();
-
DebugLoc DL = getDebugLoc(MBB, MBBI);
Register SP = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
Register SVL = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
- Register Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+ Register Buffer = AFI->getEarlyAllocSMESaveBuffer();
// Calculate SVL.
BuildMI(MBB, MBBI, DL, TII->get(AArch64::RDSVLI_XI), SVL).addImm(1);
// 1. Allocate the lazy save buffer.
- {
- // TODO This function grows the stack with a subtraction, which doesn't work
- // on Windows. Some refactoring to share the functionality in
- // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
- // supports SME
+ if (Buffer == AArch64::NoRegister) {
+ // TODO: On Windows, we allocate the lazy save buffer in SelectionDAG (so
+ // Buffer != AArch64::NoRegister). This is done to reuse the existing
+ // expansions (which can insert stack checks). This works, but it means we
+ // will always allocate the lazy save buffer (even if the function contains
+ // no lazy saves). If we want to handle Windows here, we'll need to
+ // implement something similar to LowerWindowsDYNAMIC_STACKALLOC.
assert(!Subtarget->isTargetWindows() &&
"Lazy ZA save is not yet supported on Windows");
+ Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
// Get original stack pointer.
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), SP)
.addReg(AArch64::SP);
@@ -590,8 +647,7 @@ void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB,
.addImm(AArch64SysReg::TPIDR2_EL0);
// If TPIDR2_EL0 is non-zero, commit the lazy save.
// NOTE: Functions that only use ZT0 don't need to zero ZA.
- bool ZeroZA =
- MF->getInfo<AArch64FunctionInfo>()->getSMEFnAttrs().hasZAState();
+ bool ZeroZA = AFI->getSMEFnAttrs().hasZAState();
auto CommitZASave =
BuildMI(MBB, MBBI, DL, TII->get(AArch64::CommitZASavePseudo))
.addReg(TPIDR2EL0)
@@ -606,6 +662,86 @@ void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB,
.addImm(1);
}
+Register MachineSMEABI::getAgnosticZABufferPtr() {
+ if (State.AgnosticZABufferPtr != AArch64::NoRegister)
+ return State.AgnosticZABufferPtr;
+ Register BufferPtr = AFI->getEarlyAllocSMESaveBuffer();
+ State.AgnosticZABufferPtr =
+ BufferPtr != AArch64::NoRegister
+ ? BufferPtr
+ : MF->getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+ return State.AgnosticZABufferPtr;
+}
+
+void MachineSMEABI::emitFullZASaveRestore(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs, bool IsSave) {
+ auto *TLI = Subtarget->getTargetLowering();
+ DebugLoc DL = getDebugLoc(MBB, MBBI);
+ Register BufferPtr = AArch64::X0;
+
+ PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);
+
+ // Copy the buffer pointer into X0.
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferPtr)
+ .addReg(getAgnosticZABufferPtr());
+
+ // Call __arm_sme_save/__arm_sme_restore.
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
+ .addReg(BufferPtr, RegState::Implicit)
+ .addExternalSymbol(TLI->getLibcallName(
+ IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE))
+ .addRegMask(TRI->getCallPreservedMask(
+ *MF,
+ CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));
+
+ restorePhyRegSave(RegSave, MBB, MBBI, DL);
+}
+
+void MachineSMEABI::emitAllocateFullZASaveBuffer(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs) {
+ // Buffer already allocated in SelectionDAG.
+ if (AFI->getEarlyAllocSMESaveBuffer())
+ return;
+
+ DebugLoc DL = getDebugLoc(MBB, MBBI);
+ Register BufferPtr = getAgnosticZABufferPtr();
+ Register BufferSize = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+
+ PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);
+
+ // Calculate the SME state size.
+ {
+ auto *TLI = Subtarget->getTargetLowering();
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
+ .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_SME_STATE_SIZE))
+ .addReg(AArch64::X0, RegState::ImplicitDefine)
+ .addRegMask(TRI->getCallPreservedMask(
+ *MF, CallingConv::
+ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferSize)
+ .addReg(AArch64::X0);
+ }
+
+ // Allocate a buffer object of the size given by __arm_sme_state_size.
+ {
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
+ .addReg(AArch64::SP)
+ .addReg(BufferSize)
+ .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferPtr)
+ .addReg(AArch64::SP);
+
+ // We have just allocated a variable-sized object; tell PEI about it.
+ MFI.CreateVariableSizedObject(Align(16), nullptr);
+ }
+
+ restorePhyRegSave(RegSave, MBB, MBBI, DL);
+}
+
void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertPt,
ZAState From, ZAState To,
@@ -623,10 +759,7 @@ void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
// TODO: Avoid setting up the save buffer if there's no transition to
// LOCAL_SAVED.
if (From == ZAState::CALLER_DORMANT) {
- assert(MBB.getParent()
- ->getInfo<AArch64FunctionInfo>()
- ->getSMEFnAttrs()
- .hasPrivateZAInterface() &&
+ assert(AFI->getSMEFnAttrs().hasPrivateZAInterface() &&
"CALLER_DORMANT state requires private ZA interface");
assert(&MBB == &MBB.getParent()->front() &&
"CALLER_DORMANT state only valid in entry block");
@@ -641,12 +774,14 @@ void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
}
if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED)
- emitSetupLazySave(MBB, InsertPt);
+ emitZASave(MBB, InsertPt, PhysLiveRegs);
else if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE)
- emitRestoreLazySave(MBB, InsertPt, PhysLiveRegs);
+ emitZARestore(MBB, InsertPt, PhysLiveRegs);
else if (To == ZAState::OFF) {
assert(From != ZAState::CALLER_DORMANT &&
"CALLER_DORMANT to OFF should have already been handled");
+ assert(!AFI->getSMEFnAttrs().hasAgnosticZAInterface() &&
+ "Should not turn ZA off in agnostic ZA function");
emitZAOff(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED);
} else {
dbgs() << "Error: Transition from " << getZAStateString(From) << " to "
@@ -664,9 +799,10 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
if (!MF.getSubtarget<AArch64Subtarget>().hasSME())
return false;
- auto *AFI = MF.getInfo<AArch64FunctionInfo>();
+ AFI = MF.getInfo<AArch64FunctionInfo>();
SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs();
- if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State())
+ if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State() &&
+ !SMEFnAttrs.hasAgnosticZAInterface())
return false;
assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!");
@@ -685,9 +821,19 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
insertStateChanges();
// Allocate save buffer (if needed).
- if (State.TPIDR2Block) {
- MachineBasicBlock &EntryBlock = MF.front();
- emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
+ if (State.AgnosticZABufferPtr != AArch64::NoRegister || State.TPIDR2Block) {
+ if (State.AfterSMEProloguePt) {
+ // Note: With inline stack probes the AfterSMEProloguePt may not be in the
+ // entry block (due to the probing loop).
+ emitAllocateZASaveBuffer(*(*State.AfterSMEProloguePt)->getParent(),
+ *State.AfterSMEProloguePt,
+ State.PhysLiveRegsAfterSMEPrologue);
+ } else {
+ MachineBasicBlock &EntryBlock = MF.front();
+ emitAllocateZASaveBuffer(
+ EntryBlock, EntryBlock.getFirstNonPHI(),
+ State.Blocks[EntryBlock.getNumber()].PhysLiveRegsAtEntry);
+ }
}
return true;
diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp
index 2008516885c3..79ceb2ababc7 100644
--- a/llvm/lib/Target/AArch64/SMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp
@@ -50,8 +50,7 @@ private:
char SMEABI::ID = 0;
static const char *name = "SME ABI Pass";
-INITIALIZE_PASS_BEGIN(SMEABI, DEBUG_TYPE, name, false, false)
-INITIALIZE_PASS_END(SMEABI, DEBUG_TYPE, name, false, false)
+INITIALIZE_PASS(SMEABI, DEBUG_TYPE, name, false, false)
FunctionPass *llvm::createSMEABIPass() { return new SMEABI(); }
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index b3005d512022..40ec371fe79d 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -1108,6 +1108,10 @@ class sme_spill_fill_base<bit isStore, dag outs, dag ins, string opcodestr>
: I<outs, ins, opcodestr, "\t$ZAt[$Rv, $imm4], [$Rn, $offset, mul vl]", "",
[]>,
Sched<[]> {
+ // 'offset' operand is encoded in the same bits as 'imm4'. There is currently
+ // no way to tell TableGen about this.
+ let DecoderMethod = "DecodeSMESpillFillInstruction";
+ bits<0> ZAt;
bits<2> Rv;
bits<5> Rn;
bits<4> imm4;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index a3a7d0f74e1b..f8c1fe81c678 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -315,10 +315,16 @@ def addsub_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "uint16_t", SVEAddSubImmOperand16
def addsub_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "uint32_t", SVEAddSubImmOperand32>;
def addsub_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "uint64_t", SVEAddSubImmOperand64>;
-def SVEAddSubImm8Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i8>", []>;
-def SVEAddSubImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i16>", []>;
-def SVEAddSubImm32Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i32>", []>;
-def SVEAddSubImm64Pat : ComplexPattern<i64, 2, "SelectSVEAddSubImm<MVT::i64>", []>;
+let Complexity = 1 in {
+def SVEAddSubImm8Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i8, false>", []>;
+def SVEAddSubImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i16, false>", []>;
+def SVEAddSubImm32Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i32, false>", []>;
+def SVEAddSubImm64Pat : ComplexPattern<i64, 2, "SelectSVEAddSubImm<MVT::i64, false>", []>;
+
+def SVEAddSubNegImm8Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i8, true>", []>;
+def SVEAddSubNegImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i16, true>", []>;
+def SVEAddSubNegImm32Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i32, true>", []>;
+def SVEAddSubNegImm64Pat : ComplexPattern<i64, 2, "SelectSVEAddSubImm<MVT::i64, true>", []>;
def SVEAddSubSSatNegImm8Pat : ComplexPattern<i32, 2, "SelectSVEAddSubSSatImm<MVT::i8, true>", []>;
def SVEAddSubSSatNegImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubSSatImm<MVT::i16, true>", []>;
@@ -329,6 +335,7 @@ def SVEAddSubSSatPosImm8Pat : ComplexPattern<i32, 2, "SelectSVEAddSubSSatImm<MV
def SVEAddSubSSatPosImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubSSatImm<MVT::i16, false>", []>;
def SVEAddSubSSatPosImm32Pat : ComplexPattern<i32, 2, "SelectSVEAddSubSSatImm<MVT::i32, false>", []>;
def SVEAddSubSSatPosImm64Pat : ComplexPattern<i64, 2, "SelectSVEAddSubSSatImm<MVT::i64, false>", []>;
+} // Complexity = 1
def SVECpyDupImm8Pat : ComplexPattern<i32, 2, "SelectSVECpyDupImm<MVT::i8>", []>;
def SVECpyDupImm16Pat : ComplexPattern<i32, 2, "SelectSVECpyDupImm<MVT::i16>", []>;
@@ -886,13 +893,17 @@ class sve_int_ptest<bits<6> opc, string asm, SDPatternOperator op>
}
multiclass sve_int_ptest<bits<6> opc, string asm, SDPatternOperator op,
- SDPatternOperator op_any> {
+ SDPatternOperator op_any, SDPatternOperator op_first> {
def NAME : sve_int_ptest<opc, asm, op>;
let hasNoSchedulingInfo = 1, isCompare = 1, Defs = [NZCV] in {
def _ANY : Pseudo<(outs), (ins PPRAny:$Pg, PPR8:$Pn),
[(set NZCV, (op_any (nxv16i1 PPRAny:$Pg), (nxv16i1 PPR8:$Pn)))]>,
PseudoInstExpansion<(!cast<Instruction>(NAME) PPRAny:$Pg, PPR8:$Pn)>;
+
+ def _FIRST : Pseudo<(outs), (ins PPRAny:$Pg, PPR8:$Pn),
+ [(set NZCV, (op_first (nxv16i1 PPRAny:$Pg), (nxv16i1 PPR8:$Pn)))]>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME) PPRAny:$Pg, PPR8:$Pn)>;
}
}
@@ -5154,11 +5165,14 @@ multiclass sve_int_dup_imm<string asm> {
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, cpy_imm8_opt_lsl_i64:$imm), 1>;
def : InstAlias<"fmov $Zd, #0.0",
- (!cast<Instruction>(NAME # _H) ZPR16:$Zd, 0, 0), 1>;
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd,
+ (cpy_imm8_opt_lsl_i16 0, 0)), 1>;
def : InstAlias<"fmov $Zd, #0.0",
- (!cast<Instruction>(NAME # _S) ZPR32:$Zd, 0, 0), 1>;
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd,
+ (cpy_imm8_opt_lsl_i32 0, 0)), 1>;
def : InstAlias<"fmov $Zd, #0.0",
- (!cast<Instruction>(NAME # _D) ZPR64:$Zd, 0, 0), 1>;
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd,
+ (cpy_imm8_opt_lsl_i64 0, 0)), 1>;
}
class sve_int_dup_fpimm<bits<2> sz8_64, Operand fpimmtype,
@@ -5218,7 +5232,8 @@ class sve_int_arith_imm0<bits<2> sz8_64, bits<3> opc, string asm,
let hasSideEffects = 0;
}
-multiclass sve_int_arith_imm0<bits<3> opc, string asm, SDPatternOperator op> {
+multiclass sve_int_arith_imm0<bits<3> opc, string asm, SDPatternOperator op,
+ SDPatternOperator inv_op = null_frag> {
def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>;
def _H : sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>;
def _S : sve_int_arith_imm0<0b10, opc, asm, ZPR32, addsub_imm8_opt_lsl_i32>;
@@ -5228,6 +5243,12 @@ multiclass sve_int_arith_imm0<bits<3> opc, string asm, SDPatternOperator op> {
def : SVE_1_Op_Imm_OptLsl_Pat<nxv8i16, op, ZPR16, i32, SVEAddSubImm16Pat, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv4i32, op, ZPR32, i32, SVEAddSubImm32Pat, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv2i64, op, ZPR64, i64, SVEAddSubImm64Pat, !cast<Instruction>(NAME # _D)>;
+
+ // Extra patterns for add(x, splat(-ve)) -> sub(x, +ve). There is no i8
+ // pattern as all i8 constants can be handled by an add.
+ def : SVE_1_Op_Imm_OptLsl_Pat<nxv8i16, inv_op, ZPR16, i32, SVEAddSubNegImm16Pat, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Imm_OptLsl_Pat<nxv4i32, inv_op, ZPR32, i32, SVEAddSubNegImm32Pat, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Imm_OptLsl_Pat<nxv2i64, inv_op, ZPR64, i64, SVEAddSubNegImm64Pat, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_arith_imm0_ssat<bits<3> opc, string asm, SDPatternOperator op,
@@ -5549,11 +5570,14 @@ multiclass sve_int_dup_imm_pred_merge<string asm, SDPatternOperator op> {
nxv2i64, nxv2i1, i64, SVECpyDupImm64Pat>;
def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
- (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, 0, 0), 0>;
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg,
+ (cpy_imm8_opt_lsl_i16 0, 0)), 0>;
def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
- (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, 0, 0), 0>;
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg,
+ (cpy_imm8_opt_lsl_i32 0, 0)), 0>;
def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
- (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, 0, 0), 0>;
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg,
+ (cpy_imm8_opt_lsl_i64 0, 0)), 0>;
def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv8f16 ZPR:$Zd)),
(!cast<Instruction>(NAME # _H) $Zd, $Pg, 0, 0)>;
@@ -5946,16 +5970,20 @@ class sve2_int_while_rr<bits<2> sz8_64, bits<1> rw, string asm,
let isWhile = 1;
}
-multiclass sve2_int_while_rr<bits<1> rw, string asm, string op> {
+multiclass sve2_int_while_rr<bits<1> rw, string asm, SDPatternOperator op> {
def _B : sve2_int_while_rr<0b00, rw, asm, PPR8>;
def _H : sve2_int_while_rr<0b01, rw, asm, PPR16>;
def _S : sve2_int_while_rr<0b10, rw, asm, PPR32>;
def _D : sve2_int_while_rr<0b11, rw, asm, PPR64>;
- def : SVE_2_Op_Pat<nxv16i1, !cast<SDPatternOperator>(op # _b), i64, i64, !cast<Instruction>(NAME # _B)>;
- def : SVE_2_Op_Pat<nxv8i1, !cast<SDPatternOperator>(op # _h), i64, i64, !cast<Instruction>(NAME # _H)>;
- def : SVE_2_Op_Pat<nxv4i1, !cast<SDPatternOperator>(op # _s), i64, i64, !cast<Instruction>(NAME # _S)>;
- def : SVE_2_Op_Pat<nxv2i1, !cast<SDPatternOperator>(op # _d), i64, i64, !cast<Instruction>(NAME # _D)>;
+ def : Pat<(nxv16i1 (op i64:$Op1, i64:$Op2, (i64 1))),
+ (!cast<Instruction>(NAME # _B) $Op1, $Op2)>;
+ def : Pat<(nxv8i1 (op i64:$Op1, i64:$Op2, (i64 2))),
+ (!cast<Instruction>(NAME # _H) $Op1, $Op2)>;
+ def : Pat<(nxv4i1 (op i64:$Op1, i64:$Op2, (i64 4))),
+ (!cast<Instruction>(NAME # _S) $Op1, $Op2)>;
+ def : Pat<(nxv2i1 (op i64:$Op1, i64:$Op2, (i64 8))),
+ (!cast<Instruction>(NAME # _D) $Op1, $Op2)>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 0059a862ba9b..0f2c33585884 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -62,6 +62,7 @@ FunctionPass *createAMDGPURewriteOutArgumentsPass();
ModulePass *
createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
ModulePass *createAMDGPULowerBufferFatPointersPass();
+ModulePass *createAMDGPULowerIntrinsicsLegacyPass();
FunctionPass *createSIModeRegisterPass();
FunctionPass *createGCNPreRAOptimizationsLegacyPass();
FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass();
@@ -153,6 +154,16 @@ private:
const TargetMachine &TM;
};
+void initializeAMDGPULowerIntrinsicsLegacyPass(PassRegistry &);
+
+struct AMDGPULowerIntrinsicsPass : PassInfoMixin<AMDGPULowerIntrinsicsPass> {
+ AMDGPULowerIntrinsicsPass(const AMDGPUTargetMachine &TM) : TM(TM) {}
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
+
+private:
+ const AMDGPUTargetMachine &TM;
+};
+
void initializeAMDGPUPrepareAGPRAllocLegacyPass(PassRegistry &);
extern char &AMDGPUPrepareAGPRAllocLegacyID;
@@ -490,6 +501,9 @@ extern char &SIModeRegisterID;
void initializeAMDGPUInsertDelayAluLegacyPass(PassRegistry &);
extern char &AMDGPUInsertDelayAluID;
+void initializeAMDGPULowerVGPREncodingLegacyPass(PassRegistry &);
+extern char &AMDGPULowerVGPREncodingLegacyID;
+
void initializeSIInsertHardClausesLegacyPass(PassRegistry &);
extern char &SIInsertHardClausesID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 8e4b6365dc06..ffbda14dcd84 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -68,13 +68,15 @@ def FeatureFlatInstOffsets : SubtargetFeature<"flat-inst-offsets",
def FeatureFlatGlobalInsts : SubtargetFeature<"flat-global-insts",
"FlatGlobalInsts",
"true",
- "Have global_* flat memory instructions"
+ "Have global_* flat memory instructions",
+ [FeatureFlatAddressSpace]
>;
def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts",
"FlatScratchInsts",
"true",
- "Have scratch_* flat memory instructions"
+ "Have scratch_* flat memory instructions",
+ [FeatureFlatAddressSpace]
>;
def FeatureScalarFlatScratchInsts : SubtargetFeature<"scalar-flat-scratch-insts",
@@ -92,7 +94,8 @@ def FeatureEnableFlatScratch : SubtargetFeature<"enable-flat-scratch",
def FeatureFlatGVSMode : SubtargetFeature<"flat-gvs-mode",
"FlatGVSMode",
"true",
- "Have GVS addressing mode with flat_* instructions"
+ "Have GVS addressing mode with flat_* instructions",
+ [FeatureFlatAddressSpace]
>;
def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts",
@@ -286,12 +289,6 @@ def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch",
"VMEM CU scope prefetches do not fail on illegal address"
>;
-def FeatureCUStores : SubtargetFeature<"cu-stores",
- "HasCUStores",
- "true",
- "Whether SCOPE_CU stores can be used on GFX12.5"
->;
-
def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard",
"HasVcmpxExecWARHazard",
"true",
@@ -419,6 +416,12 @@ def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts",
"Additional instructions for GFX9+"
>;
+def FeatureRequiresAlignedVGPRs : SubtargetFeature<"vgpr-align2",
+ "RequiresAlignVGPR",
+ "true",
+ "VGPR and AGPR tuple operands require even alignment"
+>;
+
def FeatureGFX90AInsts : SubtargetFeature<"gfx90a-insts",
"GFX90AInsts",
"true",
@@ -928,13 +931,15 @@ def FeatureAtomicFMinFMaxF64GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-glo
def FeatureAtomicFMinFMaxF32FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f32",
"HasAtomicFMinFMaxF32FlatInsts",
"true",
- "Has flat memory instructions for atomicrmw fmin/fmax for float"
+ "Has flat memory instructions for atomicrmw fmin/fmax for float",
+ [FeatureFlatAddressSpace]
>;
def FeatureAtomicFMinFMaxF64FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f64",
"HasAtomicFMinFMaxF64FlatInsts",
"true",
- "Has flat memory instructions for atomicrmw fmin/fmax for double"
+ "Has flat memory instructions for atomicrmw fmin/fmax for double",
+ [FeatureFlatAddressSpace]
>;
def FeatureAtomicFaddNoRtnInsts : SubtargetFeature<"atomic-fadd-no-rtn-insts",
@@ -986,7 +991,8 @@ def FeatureFlatAtomicFaddF32Inst
: SubtargetFeature<"flat-atomic-fadd-f32-inst",
"HasFlatAtomicFaddF32Inst",
"true",
- "Has flat_atomic_add_f32 instruction"
+ "Has flat_atomic_add_f32 instruction",
+ [FeatureFlatAddressSpace]
>;
def FeatureFlatBufferGlobalAtomicFaddF64Inst
@@ -1204,6 +1210,12 @@ def Feature64BitLiterals : SubtargetFeature<"64-bit-literals",
"Can use 64-bit literals with single DWORD instructions"
>;
+def Feature1024AddressableVGPRs : SubtargetFeature<"1024-addressable-vgprs",
+ "Has1024AddressableVGPRs",
+ "true",
+ "Has 1024 addressable VGPRs"
+>;
+
def FeatureWaitXcnt : SubtargetFeature<"wait-xcnt",
"HasWaitXcnt",
"true",
@@ -1721,6 +1733,7 @@ def FeatureISAVersion9_0_9 : FeatureSet<
def FeatureISAVersion9_0_A : FeatureSet<
!listconcat(FeatureISAVersion9_0_MI_Common.Features,
[FeatureGFX90AInsts,
+ FeatureRequiresAlignedVGPRs,
FeatureFmacF64Inst,
FeatureDPALU_DPP,
FeaturePackedFP32Ops,
@@ -1743,6 +1756,7 @@ def FeatureISAVersion9_4_Common : FeatureSet<
[FeatureGFX9,
FeatureGFX90AInsts,
FeatureGFX940Insts,
+ FeatureRequiresAlignedVGPRs,
FeatureFmaMixInsts,
FeatureLDSBankCount32,
FeatureDLInsts,
@@ -1894,6 +1908,7 @@ def FeatureISAVersion10_3_Generic: FeatureSet<
def FeatureISAVersion11_Common : FeatureSet<
[FeatureGFX11,
+ FeatureBackOffBarrier,
FeatureLDSBankCount32,
FeatureDLInsts,
FeatureDot5Insts,
@@ -1977,6 +1992,7 @@ def FeatureISAVersion11_5_3 : FeatureSet<
def FeatureISAVersion12 : FeatureSet<
[FeatureGFX12,
+ FeatureBackOffBarrier,
FeatureAddressableLocalMemorySize65536,
FeatureLDSBankCount32,
FeatureDLInsts,
@@ -2019,9 +2035,10 @@ def FeatureISAVersion12 : FeatureSet<
def FeatureISAVersion12_50 : FeatureSet<
[FeatureGFX12,
FeatureGFX1250Insts,
- FeatureCUStores,
+ FeatureRequiresAlignedVGPRs,
FeatureAddressableLocalMemorySize327680,
FeatureCuMode,
+ Feature1024AddressableVGPRs,
Feature64BitLiterals,
FeatureLDSBankCount32,
FeatureDLInsts,
@@ -2830,6 +2847,9 @@ def HasBVHDualAndBVH8Insts : Predicate<"Subtarget->hasBVHDualAndBVH8Insts()">,
def Has64BitLiterals : Predicate<"Subtarget->has64BitLiterals()">,
AssemblerPredicate<(all_of Feature64BitLiterals)>;
+def Has1024AddressableVGPRs : Predicate<"Subtarget->has1024AddressableVGPRs()">,
+ AssemblerPredicate<(all_of Feature1024AddressableVGPRs)>;
+
def HasWaitXcnt : Predicate<"Subtarget->hasWaitXcnt()">,
AssemblerPredicate<(all_of FeatureWaitXcnt)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 36c0d1cbcea2..29f8f9bc8b54 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -557,7 +557,6 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
MCContext &Ctx = MF.getContext();
uint16_t KernelCodeProperties = 0;
const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
KernelCodeProperties |=
@@ -587,13 +586,10 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
}
- if (ST.isWave32()) {
+ if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
}
- if (isGFX1250(ST) && ST.hasCUStores()) {
- KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES;
- }
// CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
// un-evaluatable at this point so it cannot be conditionally checked here.
@@ -638,7 +634,7 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
(void)PGRM_Rsrc3;
(void)EvaluatableRsrc3;
assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
- STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
+ STM.hasGFX90AInsts() || AMDGPU::isGFX1250(STM) || !EvaluatableRsrc3 ||
static_cast<uint64_t>(PGRM_Rsrc3) == 0);
KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
@@ -845,7 +841,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
[[maybe_unused]] int64_t PGMRSrc3;
assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
- STM.hasGFX90AInsts() ||
+ STM.hasGFX90AInsts() || AMDGPU::isGFX1250(STM) ||
(CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
static_cast<uint64_t>(PGMRSrc3) == 0));
if (STM.hasGFX90AInsts()) {
@@ -1143,9 +1139,13 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
return SubGPR;
};
-
- ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
- IsaInfo::getSGPREncodingGranule(&STM));
+ // GFX10+ will always allocate 128 SGPRs and this field must be 0
+ if (STM.getGeneration() >= AMDGPUSubtarget::GFX10) {
+ ProgInfo.SGPRBlocks = CreateExpr(0ul);
+ } else {
+ ProgInfo.SGPRBlocks = GetNumGPRBlocks(
+ ProgInfo.NumSGPRsForWavesPerEU, IsaInfo::getSGPREncodingGranule(&STM));
+ }
ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
IsaInfo::getVGPREncodingGranule(&STM));
@@ -1440,9 +1440,10 @@ static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
MD->setComputeRegisters(".dynamic_vgpr_en", true);
}
- MD->setHwStage(CC, ".lds_size",
- (unsigned)(CurrentProgramInfo.LdsSize *
- getLdsDwGranularity(ST) * sizeof(uint32_t)));
+ MD->updateHwStageMaximum(
+ CC, ".lds_size",
+ (unsigned)(CurrentProgramInfo.LdsSize * getLdsDwGranularity(ST) *
+ sizeof(uint32_t)));
}
// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 59cc1df292f4..f646457f9d76 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1296,74 +1296,6 @@ struct AAAMDGPUNoAGPR
const char AAAMDGPUNoAGPR::ID = 0;
-/// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
-/// based on the finalized 'amdgpu-flat-work-group-size' attribute.
-/// Both attributes start with narrow ranges that expand during iteration.
-/// However, a narrower flat-workgroup-size leads to a wider waves-per-eu range,
-/// preventing optimal updates later. Therefore, waves-per-eu can't be updated
-/// with intermediate values during the attributor run. We defer the
-/// finalization of waves-per-eu until after the flat-workgroup-size is
-/// finalized.
-/// TODO: Remove this and move similar logic back into the attributor run once
-/// we have a better representation for waves-per-eu.
-static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
- bool Changed = false;
-
- LLVMContext &Ctx = M.getContext();
-
- for (Function &F : M) {
- if (F.isDeclaration())
- continue;
-
- const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
-
- std::optional<std::pair<unsigned, std::optional<unsigned>>>
- FlatWgrpSizeAttr =
- AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");
-
- unsigned MinWavesPerEU = ST.getMinWavesPerEU();
- unsigned MaxWavesPerEU = ST.getMaxWavesPerEU();
-
- unsigned MinFlatWgrpSize = ST.getMinFlatWorkGroupSize();
- unsigned MaxFlatWgrpSize = ST.getMaxFlatWorkGroupSize();
- if (FlatWgrpSizeAttr.has_value()) {
- MinFlatWgrpSize = FlatWgrpSizeAttr->first;
- MaxFlatWgrpSize = *(FlatWgrpSizeAttr->second);
- }
-
- // Start with the "best" range.
- unsigned Min = MinWavesPerEU;
- unsigned Max = MinWavesPerEU;
-
- // Compute the range from flat workgroup size. `getWavesPerEU` will also
- // account for the 'amdgpu-waves-er-eu' attribute.
- auto [MinFromFlatWgrpSize, MaxFromFlatWgrpSize] =
- ST.getWavesPerEU(F, {MinFlatWgrpSize, MaxFlatWgrpSize});
-
- // For the lower bound, we have to "tighten" it.
- Min = std::max(Min, MinFromFlatWgrpSize);
- // For the upper bound, we have to "extend" it.
- Max = std::max(Max, MaxFromFlatWgrpSize);
-
- // Clamp the range to the max range.
- Min = std::max(Min, MinWavesPerEU);
- Max = std::min(Max, MaxWavesPerEU);
-
- // Update the attribute if it is not the max.
- if (Min != MinWavesPerEU || Max != MaxWavesPerEU) {
- SmallString<10> Buffer;
- raw_svector_ostream OS(Buffer);
- OS << Min << ',' << Max;
- Attribute OldAttr = F.getFnAttribute("amdgpu-waves-per-eu");
- Attribute NewAttr = Attribute::get(Ctx, "amdgpu-waves-per-eu", OS.str());
- F.addFnAttr(NewAttr);
- Changed |= OldAttr == NewAttr;
- }
- }
-
- return Changed;
-}
-
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
AMDGPUAttributorOptions Options,
ThinOrFullLTOPhase LTOPhase) {
@@ -1438,11 +1370,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
}
}
- bool Changed = A.run() == ChangeStatus::CHANGED;
-
- Changed |= updateWavesPerEU(M, TM);
-
- return Changed;
+ return A.run() == ChangeStatus::CHANGED;
}
} // namespace
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index d1a5b4e85da4..21255f691e4a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -1004,8 +1004,14 @@ static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
return IsWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64;
}
- return CC == CallingConv::AMDGPU_Gfx ? AMDGPU::SI_TCRETURN_GFX :
- AMDGPU::SI_TCRETURN;
+ if (CallerF.getFunction().getCallingConv() ==
+ CallingConv::AMDGPU_Gfx_WholeWave)
+ return AMDGPU::SI_TCRETURN_GFX_WholeWave;
+
+ if (CC == CallingConv::AMDGPU_Gfx || CC == CallingConv::AMDGPU_Gfx_WholeWave)
+ return AMDGPU::SI_TCRETURN_GFX;
+
+ return AMDGPU::SI_TCRETURN;
}
// Add operands to call instruction to track the callee.
@@ -1284,6 +1290,13 @@ bool AMDGPUCallLowering::lowerTailCall(
unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), /*IsTailCall*/ true,
ST.isWave32(), CalleeCC, IsDynamicVGPRChainCall);
auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
+
+ if (FuncInfo->isWholeWaveFunction())
+ addOriginalExecToReturn(MF, MIB);
+
+ // Keep track of the index of the next operand to be added to the call
+ unsigned CalleeIdx = MIB->getNumOperands();
+
if (!addCallTargetOperands(MIB, MIRBuilder, Info, IsDynamicVGPRChainCall))
return false;
@@ -1401,7 +1414,7 @@ bool AMDGPUCallLowering::lowerTailCall(
// If we have -tailcallopt, we need to adjust the stack. We'll do the call
// sequence start and end here.
if (!IsSibCall) {
- MIB->getOperand(1).setImm(FPDiff);
+ MIB->getOperand(CalleeIdx + 1).setImm(FPDiff);
CallSeqStart.addImm(NumBytes).addImm(0);
// End the call sequence *before* emitting the call. Normally, we would
// tidy the frame up after the call. However, here, we've laid out the
@@ -1413,16 +1426,24 @@ bool AMDGPUCallLowering::lowerTailCall(
// Now we can add the actual call instruction to the correct basic block.
MIRBuilder.insertInstr(MIB);
+ // If this is a whole wave tail call, we need to constrain the register for
+ // the original EXEC.
+ if (MIB->getOpcode() == AMDGPU::SI_TCRETURN_GFX_WholeWave) {
+ MIB->getOperand(0).setReg(
+ constrainOperandRegClass(MF, *TRI, MRI, *TII, *ST.getRegBankInfo(),
+ *MIB, MIB->getDesc(), MIB->getOperand(0), 0));
+ }
+
// If Callee is a reg, since it is used by a target specific
// instruction, it must have a register class matching the
// constraint of that instruction.
// FIXME: We should define regbankselectable call instructions to handle
// divergent call targets.
- if (MIB->getOperand(0).isReg()) {
- MIB->getOperand(0).setReg(
- constrainOperandRegClass(MF, *TRI, MRI, *TII, *ST.getRegBankInfo(),
- *MIB, MIB->getDesc(), MIB->getOperand(0), 0));
+ if (MIB->getOperand(CalleeIdx).isReg()) {
+ MIB->getOperand(CalleeIdx).setReg(constrainOperandRegClass(
+ MF, *TRI, MRI, *TII, *ST.getRegBankInfo(), *MIB, MIB->getDesc(),
+ MIB->getOperand(CalleeIdx), CalleeIdx));
}
MF.getFrameInfo().setHasTailCall();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 394a143dd308..0c112d1787c1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -128,12 +128,18 @@ def gi_global_saddr :
def gi_global_saddr_cpol :
GIComplexOperandMatcher<s64, "selectGlobalSAddrCPol">,
GIComplexPatternEquiv<GlobalSAddrCPol>;
+def gi_global_saddr_cpol_m0 :
+ GIComplexOperandMatcher<s64, "selectGlobalSAddrCPolM0">,
+ GIComplexPatternEquiv<GlobalSAddrCPolM0>;
def gi_global_saddr_glc :
GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">,
GIComplexPatternEquiv<GlobalSAddrGLC>;
def gi_global_saddr_no_ioffset :
GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffset">,
GIComplexPatternEquiv<GlobalSAddrNoIOffset>;
+def gi_global_saddr_no_ioffset_m0 :
+ GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffsetM0">,
+ GIComplexPatternEquiv<GlobalSAddrNoIOffsetM0>;
def gi_mubuf_scratch_offset :
GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b80e43b27129..3785d0f7f268 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2089,6 +2089,23 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr,
+ SDValue &SAddr,
+ SDValue &VOffset,
+ SDValue &Offset,
+ SDValue &CPol) const {
+ bool ScaleOffset;
+ if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
+ return false;
+
+ // We are assuming CPol is second from last operand of the intrinsic.
+ auto PassedCPol =
+ N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
+ CPol = CurDAG->getTargetConstant(
+ (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
+ return true;
+}
+
bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
SDValue &SAddr, SDValue &VOffset,
SDValue &Offset,
@@ -2120,6 +2137,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr,
+ SDValue &SAddr,
+ SDValue &VOffset,
+ SDValue &CPol) const {
+ bool ScaleOffset;
+ SDValue DummyOffset;
+ if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
+ false))
+ return false;
+
+ // We are assuming CPol is second from last operand of the intrinsic.
+ auto PassedCPol =
+ N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
+ CPol = CurDAG->getTargetConstant(
+ (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
+ return true;
+}
+
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 16388e750026..4fa0d3f72e1c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -171,11 +171,16 @@ private:
bool SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
+ bool SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr, SDValue &SAddr,
+ SDValue &VOffset, SDValue &Offset,
+ SDValue &CPol) const;
bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
bool SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &CPol) const;
+ bool SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr, SDValue &SAddr,
+ SDValue &VOffset, SDValue &CPol) const;
bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &Offset) const;
bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index c048371b11d7..5c9b616e9bc2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -367,6 +367,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
+ setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand);
+ setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand);
+ setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand);
+
+ setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand);
+ setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand);
+ setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand);
+
+ setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand);
+ setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand);
+ setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand);
+
setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
@@ -411,7 +423,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
Expand);
- setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
+ setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
if (Subtarget->has16BitInsts()) {
setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
@@ -1427,8 +1439,8 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
- case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
- case ISD::FREM: return LowerFREM(Op, DAG);
+ case ISD::SDIVREM:
+ return LowerSDIVREM(Op, DAG);
case ISD::FCEIL: return LowerFCEIL(Op, DAG);
case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
case ISD::FRINT: return LowerFRINT(Op, DAG);
@@ -2423,21 +2435,6 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
return DAG.getMergeValues(Res, DL);
}
-// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
-SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
- SDLoc SL(Op);
- EVT VT = Op.getValueType();
- auto Flags = Op->getFlags();
- SDValue X = Op.getOperand(0);
- SDValue Y = Op.getOperand(1);
-
- SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
- SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
- SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
- // TODO: For f32 use FMAD instead if !hasFastFMA32?
- return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
-}
-
SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
@@ -2650,10 +2647,7 @@ static bool valueIsKnownNeverF32Denorm(SDValue Src) {
bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
SDNodeFlags Flags) {
- if (Flags.hasApproximateFuncs())
- return true;
- auto &Options = DAG.getTarget().Options;
- return Options.ApproxFuncFPMath;
+ return Flags.hasApproximateFuncs();
}
bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
@@ -2775,8 +2769,7 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
const auto &Options = getTargetMachine().Options;
- if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
- Options.ApproxFuncFPMath) {
+ if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
// Log and multiply in f32 is good enough for f16.
@@ -5674,6 +5667,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CALL)
NODE_NAME_CASE(TC_RETURN)
NODE_NAME_CASE(TC_RETURN_GFX)
+ NODE_NAME_CASE(TC_RETURN_GFX_WholeWave)
NODE_NAME_CASE(TC_RETURN_CHAIN)
NODE_NAME_CASE(TC_RETURN_CHAIN_DVGPR)
NODE_NAME_CASE(TRAP)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 78394ac9cd2d..bdaf48652d10 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -418,6 +418,7 @@ enum NodeType : unsigned {
CALL,
TC_RETURN,
TC_RETURN_GFX,
+ TC_RETURN_GFX_WholeWave,
TC_RETURN_CHAIN,
TC_RETURN_CHAIN_DVGPR,
TRAP,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index e305f08925cc..b8fa6f3fc686 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -94,6 +94,10 @@ def AMDGPUtc_return_gfx: SDNode<"AMDGPUISD::TC_RETURN_GFX", AMDGPUTCReturnTP,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;
+def AMDGPUtc_return_gfx_ww: SDNode<"AMDGPUISD::TC_RETURN_GFX_WholeWave", AMDGPUTCReturnTP,
+[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
+
def AMDGPUtc_return_chain: SDNode<"AMDGPUISD::TC_RETURN_CHAIN",
SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 5d31eed8fe7d..12915c734442 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1989,39 +1989,6 @@ bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
return selectImpl(MI, *CoverageInfo);
}
-bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
- Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
- if (TM.getOptLevel() > CodeGenOptLevel::None) {
- unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
- if (WGSize <= STI.getWavefrontSize()) {
- // If the workgroup fits in a wave, remove s_barrier_signal and lower
- // s_barrier/s_barrier_wait to wave_barrier.
- if (IntrinsicID == Intrinsic::amdgcn_s_barrier ||
- IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) {
- MachineBasicBlock *MBB = MI.getParent();
- const DebugLoc &DL = MI.getDebugLoc();
- BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
- }
- MI.eraseFromParent();
- return true;
- }
- }
-
- if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
- // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
- MachineBasicBlock *MBB = MI.getParent();
- const DebugLoc &DL = MI.getDebugLoc();
- BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
- .addImm(AMDGPU::Barrier::WORKGROUP);
- BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
- .addImm(AMDGPU::Barrier::WORKGROUP);
- MI.eraseFromParent();
- return true;
- }
-
- return selectImpl(MI, *CoverageInfo);
-}
-
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
bool &IsTexFail) {
if (TexFailCtrl)
@@ -2338,10 +2305,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
return selectDSAppendConsume(I, false);
case Intrinsic::amdgcn_init_whole_wave:
return selectInitWholeWave(I);
- case Intrinsic::amdgcn_s_barrier:
- case Intrinsic::amdgcn_s_barrier_signal:
- case Intrinsic::amdgcn_s_barrier_wait:
- return selectSBarrier(I);
case Intrinsic::amdgcn_raw_buffer_load_lds:
case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
case Intrinsic::amdgcn_struct_buffer_load_lds:
@@ -5746,6 +5709,16 @@ AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
+ const MachineInstr &I = *Root.getParent();
+
+ // We are assuming CPol is second from last operand of the intrinsic.
+ auto PassedCPol =
+ I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
+ return selectGlobalSAddr(Root, PassedCPol);
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
}
@@ -5762,6 +5735,17 @@ AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
+ MachineOperand &Root) const {
+ const MachineInstr &I = *Root.getParent();
+
+ // We are assuming CPol is second from last operand of the intrinsic.
+ auto PassedCPol =
+ I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
+ return selectGlobalSAddr(Root, PassedCPol, false);
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
Register Addr = Root.getReg();
Register PtrBase;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 092439693f39..c760fe7ef99d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -124,7 +124,6 @@ private:
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
bool selectInitWholeWave(MachineInstr &MI) const;
- bool selectSBarrier(MachineInstr &MI) const;
bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;
bool selectImageIntrinsic(MachineInstr &MI,
@@ -257,9 +256,13 @@ private:
InstructionSelector::ComplexRendererFns
selectGlobalSAddrCPol(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
+ selectGlobalSAddrCPolM0(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
selectGlobalSAddrGLC(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectGlobalSAddrNoIOffset(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectGlobalSAddrNoIOffsetM0(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectScratchSAddr(MachineOperand &Root) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index efcd87e46620..bd443b5b6f1e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -509,6 +509,10 @@ def atomic_load_nonext_64_#as : PatFrag<(ops node:$ptr), (atomic_load_nonext_64
let IsAtomic = 1;
}
+def atomic_load_nonext_128_#as : PatFrag<(ops node:$ptr), (atomic_load_nonext_128 node:$ptr)> {
+ let IsAtomic = 1;
+}
+
def atomic_load_zext_8_#as : PatFrag<(ops node:$ptr), (atomic_load_zext_8 node:$ptr)> {
let IsAtomic = 1;
}
@@ -573,6 +577,8 @@ def atomic_store_32_#as : PatFrag<(ops node:$val, node:$ptr),
(atomic_store_32 node:$val, node:$ptr)>;
def atomic_store_64_#as : PatFrag<(ops node:$val, node:$ptr),
(atomic_store_64 node:$val, node:$ptr)>;
+def atomic_store_128_#as : PatFrag<(ops node:$val, node:$ptr),
+ (atomic_store_128 node:$val, node:$ptr)>;
} // End let IsAtomic = 1, AddressSpaces = ...
} // End foreach as
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 600a13096f55..f18536cd4ab9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2082,13 +2082,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0)
.lower();
- // TODO: Only Try to form v2s16 with legal packed instructions.
- getActionDefinitionsBuilder(G_FSHR)
- .legalFor({{S32, S32}})
- .lowerFor({{V2S16, V2S16}})
- .clampMaxNumElementsStrict(0, S16, 2)
- .scalarize(0)
- .lower();
+ auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
+ FSHRActionDefs.legalFor({{S32, S32}})
+ .clampMaxNumElementsStrict(0, S16, 2);
+ if (ST.hasVOP3PInsts())
+ FSHRActionDefs.lowerFor({{V2S16, V2S16}});
+ FSHRActionDefs.scalarize(0).lower();
if (ST.hasVOP3PInsts()) {
getActionDefinitionsBuilder(G_FSHL)
@@ -3414,10 +3413,7 @@ static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
}
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
- if (Flags & MachineInstr::FmAfn)
- return true;
- const auto &Options = MF.getTarget().Options;
- return Options.ApproxFuncFPMath;
+ return Flags & MachineInstr::FmAfn;
}
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
@@ -3522,8 +3518,7 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
const AMDGPUTargetMachine &TM =
static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
- if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
- TM.Options.ApproxFuncFPMath) {
+ if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) {
if (Ty == F16 && !ST.has16BitInsts()) {
Register LogVal = MRI.createGenericVirtualRegister(F32);
auto PromoteSrc = B.buildFPExt(F32, X);
@@ -7823,6 +7818,20 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MI.eraseFromParent();
return true;
}
+ case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
+ case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
+ case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
+ assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
+ B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
+ MI.eraseFromParent();
+ return true;
+ case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
+ assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
+ B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
+ MI.eraseFromParent();
+ return true;
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
new file mode 100644
index 000000000000..a30d9cb0412a
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -0,0 +1,161 @@
+//===-- AMDGPULowerIntrinsics.cpp -------------------------------------------=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower intrinsics that would otherwise require separate handling in both
+// SelectionDAG and GlobalISel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/InitializePasses.h"
+
+#define DEBUG_TYPE "amdgpu-lower-intrinsics"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPULowerIntrinsicsImpl {
+public:
+ Module &M;
+ const AMDGPUTargetMachine &TM;
+
+ AMDGPULowerIntrinsicsImpl(Module &M, const AMDGPUTargetMachine &TM)
+ : M(M), TM(TM) {}
+
+ bool run();
+
+private:
+ bool visitBarrier(IntrinsicInst &I);
+};
+
+class AMDGPULowerIntrinsicsLegacy : public ModulePass {
+public:
+ static char ID;
+
+ AMDGPULowerIntrinsicsLegacy() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
+ }
+};
+
+template <class T> static void forEachCall(Function &Intrin, T Callback) {
+ for (User *U : make_early_inc_range(Intrin.users())) {
+ if (auto *CI = dyn_cast<IntrinsicInst>(U))
+ Callback(CI);
+ }
+}
+
+} // anonymous namespace
+
+bool AMDGPULowerIntrinsicsImpl::run() {
+ bool Changed = false;
+
+ for (Function &F : M) {
+ switch (F.getIntrinsicID()) {
+ default:
+ continue;
+ case Intrinsic::amdgcn_s_barrier:
+ case Intrinsic::amdgcn_s_barrier_signal:
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst:
+ case Intrinsic::amdgcn_s_barrier_wait:
+ forEachCall(F, [&](IntrinsicInst *II) { Changed |= visitBarrier(*II); });
+ break;
+ }
+ }
+
+ return Changed;
+}
+
+// Optimize barriers and lower s_barrier to a sequence of split barrier
+// intrinsics.
+bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) {
+ assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
+ I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal ||
+ I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst ||
+ I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait);
+
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*I.getFunction());
+ bool IsSingleWaveWG = false;
+
+ if (TM.getOptLevel() > CodeGenOptLevel::None) {
+ unsigned WGMaxSize = ST.getFlatWorkGroupSizes(*I.getFunction()).second;
+ IsSingleWaveWG = WGMaxSize <= ST.getWavefrontSize();
+ }
+
+ IRBuilder<> B(&I);
+
+ if (IsSingleWaveWG) {
+ // Down-grade waits, remove split signals.
+ if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
+ I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait) {
+ B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_wave_barrier, {});
+ } else if (I.getIntrinsicID() ==
+ Intrinsic::amdgcn_s_barrier_signal_isfirst) {
+ // If we're the only wave of the workgroup, we're always first.
+ I.replaceAllUsesWith(B.getInt1(true));
+ }
+ I.eraseFromParent();
+ return true;
+ }
+
+ if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier &&
+ ST.hasSplitBarriers()) {
+ // Lower to split barriers.
+ Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::WORKGROUP);
+ Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::WORKGROUP);
+ B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_signal,
+ {BarrierID_32});
+ B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
+ {BarrierID_16});
+ I.eraseFromParent();
+ return true;
+ }
+
+ return false;
+}
+
+PreservedAnalyses AMDGPULowerIntrinsicsPass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ AMDGPULowerIntrinsicsImpl Impl(M, TM);
+ if (!Impl.run())
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+bool AMDGPULowerIntrinsicsLegacy::runOnModule(Module &M) {
+ auto &TPC = getAnalysis<TargetPassConfig>();
+ const AMDGPUTargetMachine &TM = TPC.getTM<AMDGPUTargetMachine>();
+
+ AMDGPULowerIntrinsicsImpl Impl(M, TM);
+ return Impl.run();
+}
+
+#define PASS_DESC "AMDGPU lower intrinsics"
+INITIALIZE_PASS_BEGIN(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
+ false)
+
+char AMDGPULowerIntrinsicsLegacy::ID = 0;
+
+ModulePass *llvm::createAMDGPULowerIntrinsicsLegacyPass() {
+ return new AMDGPULowerIntrinsicsLegacy;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
new file mode 100644
index 000000000000..1e6589eb42c1
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
@@ -0,0 +1,373 @@
+//===- AMDGPULowerVGPREncoding.cpp - lower VGPRs above v255 ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Lower VGPRs above first 256 on gfx1250.
+///
+/// The pass scans used VGPRs and inserts S_SET_VGPR_MSB instructions to switch
+/// VGPR addressing mode. The mode change is effective until the next change.
+/// This instruction provides high bits of a VGPR address for four of the
+/// operands: vdst, src0, src1, and src2, or other 4 operands depending on the
+/// instruction encoding. If bits are set they are added as MSB to the
+/// corresponding operand VGPR number.
+///
+/// There is no need to replace actual register operands because encoding of the
+/// high and low VGPRs is the same. I.e. v0 has the encoding 0x100, so does
+/// v256. v1 has the encoding 0x101 and v257 has the same encoding. So high
+/// VGPRs will survive until actual encoding and will result in the same actual
+/// bit encoding.
+///
+/// As a result the pass only inserts S_SET_VGPR_MSB to provide an actual offset
+/// to a VGPR address of the subsequent instructions. The InstPrinter will take
+/// care of printing a low VGPR instead of a high one. In principle it would
+/// be viable to print actual high VGPR numbers, but that would disagree
+/// with the disassembler output and create a situation where asm text is not
+/// deterministic.
+///
+/// This pass creates a convention where non-fall through basic blocks shall
+/// start with all 4 MSBs zero. Otherwise a disassembly would not be readable.
+/// An optimization here is possible but deemed not desirable because of the
+/// readability concerns.
+///
+/// Consequently the ABI is set to expect all 4 MSBs to be zero on entry.
+/// The pass must run very late in the pipeline to make sure no changes to VGPR
+/// operands will be made after it.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPULowerVGPREncoding.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/PackedVector.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-lower-vgpr-encoding"
+
+namespace {
+
+class AMDGPULowerVGPREncoding {
+ static constexpr unsigned OpNum = 4;
+ static constexpr unsigned BitsPerField = 2;
+ static constexpr unsigned NumFields = 4;
+ static constexpr unsigned FieldMask = (1 << BitsPerField) - 1;
+ using ModeType = PackedVector<unsigned, BitsPerField,
+ std::bitset<BitsPerField * NumFields>>;
+
+ class ModeTy : public ModeType {
+ public:
+ // bitset constructor will set all bits to zero
+ ModeTy() : ModeType(0) {}
+
+ operator int64_t() const { return raw_bits().to_ulong(); }
+
+ static ModeTy fullMask() {
+ ModeTy M;
+ M.raw_bits().flip();
+ return M;
+ }
+ };
+
+public:
+ bool run(MachineFunction &MF);
+
+private:
+ const SIInstrInfo *TII;
+ const SIRegisterInfo *TRI;
+
+ /// Most recent s_set_* instruction.
+ MachineInstr *MostRecentModeSet;
+
+ /// Whether the current mode is known.
+ bool CurrentModeKnown;
+
+ /// Current mode bits.
+ ModeTy CurrentMode;
+
+ /// Current mask of mode bits that instructions since MostRecentModeSet care
+ /// about.
+ ModeTy CurrentMask;
+
+ /// Number of current hard clause instructions.
+ unsigned ClauseLen;
+
+ /// Number of hard clause instructions remaining.
+ unsigned ClauseRemaining;
+
+ /// Clause group breaks.
+ unsigned ClauseBreaks;
+
+ /// Last hard clause instruction.
+ MachineInstr *Clause;
+
+ /// Insert mode change before \p I. \returns true if mode was changed.
+ bool setMode(ModeTy NewMode, ModeTy Mask, MachineInstr *I);
+
+ /// Reset mode to default.
+ void resetMode(MachineInstr *I) { setMode(ModeTy(), ModeTy::fullMask(), I); }
+
+ /// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt.
+ std::optional<unsigned> getMSBs(const MachineOperand &MO) const;
+
+ /// Handle single \p MI. \return true if changed.
+ bool runOnMachineInstr(MachineInstr &MI);
+
+ /// Compute the mode and mode mask for a single \p MI given \p Ops operands
+ /// bit mapping. Optionally takes second array \p Ops2 for VOPD.
+ /// If provided and an operand from \p Ops is not a VGPR, then \p Ops2
+ /// is checked.
+ void computeMode(ModeTy &NewMode, ModeTy &Mask, MachineInstr &MI,
+ const AMDGPU::OpName Ops[OpNum],
+ const AMDGPU::OpName *Ops2 = nullptr);
+
+ /// Check if an instruction \p I is within a clause and return a suitable
+ /// insertion point for the mode change. It may also modify the S_CLAUSE
+ /// instruction to extend it or drop the clause if it cannot be adjusted.
+ MachineInstr *handleClause(MachineInstr *I);
+};
+
+bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask,
+ MachineInstr *I) {
+ assert((NewMode.raw_bits() & ~Mask.raw_bits()).none());
+
+ if (CurrentModeKnown) {
+ auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits();
+
+ if ((Delta & Mask.raw_bits()).none()) {
+ CurrentMask |= Mask;
+ return false;
+ }
+
+ if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) {
+ CurrentMode |= NewMode;
+ CurrentMask |= Mask;
+
+ MostRecentModeSet->getOperand(0).setImm(CurrentMode);
+ return true;
+ }
+ }
+
+ I = handleClause(I);
+ MostRecentModeSet =
+ BuildMI(*I->getParent(), I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
+ .addImm(NewMode);
+
+ CurrentMode = NewMode;
+ CurrentMask = Mask;
+ CurrentModeKnown = true;
+ return true;
+}
+
+std::optional<unsigned>
+AMDGPULowerVGPREncoding::getMSBs(const MachineOperand &MO) const {
+ if (!MO.isReg())
+ return std::nullopt;
+
+ MCRegister Reg = MO.getReg();
+ const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
+ if (!RC || !TRI->isVGPRClass(RC))
+ return std::nullopt;
+
+ unsigned Idx = TRI->getHWRegIndex(Reg);
+ return Idx >> 8;
+}
+
+void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, ModeTy &Mask,
+ MachineInstr &MI,
+ const AMDGPU::OpName Ops[OpNum],
+ const AMDGPU::OpName *Ops2) {
+ NewMode = {};
+ Mask = {};
+
+ for (unsigned I = 0; I < OpNum; ++I) {
+ MachineOperand *Op = TII->getNamedOperand(MI, Ops[I]);
+
+ std::optional<unsigned> MSBits;
+ if (Op)
+ MSBits = getMSBs(*Op);
+
+#if !defined(NDEBUG)
+ if (MSBits.has_value() && Ops2) {
+ auto Op2 = TII->getNamedOperand(MI, Ops2[I]);
+ if (Op2) {
+ std::optional<unsigned> MSBits2;
+ MSBits2 = getMSBs(*Op2);
+ if (MSBits2.has_value() && MSBits != MSBits2)
+ llvm_unreachable("Invalid VOPD pair was created");
+ }
+ }
+#endif
+
+ if (!MSBits.has_value() && Ops2) {
+ Op = TII->getNamedOperand(MI, Ops2[I]);
+ if (Op)
+ MSBits = getMSBs(*Op);
+ }
+
+ if (!MSBits.has_value())
+ continue;
+
+ // Skip tied uses of src2 of VOP2, these will be handled along with defs and
+ // only vdst bit affects these operands. We cannot skip tied uses of VOP3,
+ // these uses are real even if they must match the vdst.
+ if (Ops[I] == AMDGPU::OpName::src2 && !Op->isDef() && Op->isTied() &&
+ (SIInstrInfo::isVOP2(MI) ||
+ (SIInstrInfo::isVOP3(MI) &&
+ TII->hasVALU32BitEncoding(MI.getOpcode()))))
+ continue;
+
+ NewMode[I] = MSBits.value();
+ Mask[I] = FieldMask;
+ }
+}
+
+bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
+ auto Ops = AMDGPU::getVGPRLoweringOperandTables(MI.getDesc());
+ if (Ops.first) {
+ ModeTy NewMode, Mask;
+ computeMode(NewMode, Mask, MI, Ops.first, Ops.second);
+ return setMode(NewMode, Mask, &MI);
+ }
+ assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo());
+
+ return false;
+}
+
+MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) {
+ if (!ClauseRemaining)
+ return I;
+
+ // A clause cannot start with a special instruction, place it right before
+ // the clause.
+ if (ClauseRemaining == ClauseLen) {
+ I = Clause->getPrevNode();
+ assert(I->isBundle());
+ return I;
+ }
+
+ // If a clause defines breaks, each group cannot start with a mode change,
+ // so just drop the clause.
+ if (ClauseBreaks) {
+ Clause->eraseFromBundle();
+ ClauseRemaining = 0;
+ return I;
+ }
+
+ // Otherwise adjust the number of instructions in the clause if it fits.
+ // If it does not, the clause will just become shorter. Since the length
+ // recorded in the clause is one less, increment the length after the
+ // update. Note that SIMM16[5:0] must be 1-62, not 0 or 63.
+ if (ClauseLen < 63)
+ Clause->getOperand(0).setImm(ClauseLen | (ClauseBreaks << 8));
+
+ ++ClauseLen;
+
+ return I;
+}
+
+bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (!ST.has1024AddressableVGPRs())
+ return false;
+
+ TII = ST.getInstrInfo();
+ TRI = ST.getRegisterInfo();
+
+ bool Changed = false;
+ ClauseLen = ClauseRemaining = 0;
+ CurrentMode.reset();
+ CurrentMask.reset();
+ CurrentModeKnown = true;
+ for (auto &MBB : MF) {
+ MostRecentModeSet = nullptr;
+
+ for (auto &MI : llvm::make_early_inc_range(MBB.instrs())) {
+ if (MI.isMetaInstruction())
+ continue;
+
+ if (MI.isTerminator() || MI.isCall()) {
+ if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
+ MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
+ CurrentMode.reset();
+ CurrentModeKnown = true;
+ } else
+ resetMode(&MI);
+ continue;
+ }
+
+ if (MI.isInlineAsm()) {
+ if (TII->hasVGPRUses(MI))
+ resetMode(&MI);
+ continue;
+ }
+
+ if (MI.getOpcode() == AMDGPU::S_CLAUSE) {
+ assert(!ClauseRemaining && "Nested clauses are not supported");
+ ClauseLen = MI.getOperand(0).getImm();
+ ClauseBreaks = (ClauseLen >> 8) & 15;
+ ClauseLen = ClauseRemaining = (ClauseLen & 63) + 1;
+ Clause = &MI;
+ continue;
+ }
+
+ Changed |= runOnMachineInstr(MI);
+
+ if (ClauseRemaining)
+ --ClauseRemaining;
+ }
+
+ // If we're falling through to a block that has at least one other
+ // predecessor, we no longer know the mode.
+ MachineBasicBlock *Next = MBB.getNextNode();
+ if (Next && Next->pred_size() >= 2 &&
+ llvm::is_contained(Next->predecessors(), &MBB)) {
+ if (CurrentMode.raw_bits().any())
+ CurrentModeKnown = false;
+ }
+ }
+
+ return Changed;
+}
+
+class AMDGPULowerVGPREncodingLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPULowerVGPREncodingLegacy() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ return AMDGPULowerVGPREncoding().run(MF);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // namespace
+
+char AMDGPULowerVGPREncodingLegacy::ID = 0;
+
+char &llvm::AMDGPULowerVGPREncodingLegacyID = AMDGPULowerVGPREncodingLegacy::ID;
+
+INITIALIZE_PASS(AMDGPULowerVGPREncodingLegacy, DEBUG_TYPE,
+ "AMDGPU Lower VGPR Encoding", false, false)
+
+PreservedAnalyses
+AMDGPULowerVGPREncodingPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ if (!AMDGPULowerVGPREncoding().run(MF))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.h b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.h
new file mode 100644
index 000000000000..c8c2051c9fdd
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.h
@@ -0,0 +1,25 @@
+//===--- AMDGPULowerVGPREncoding.h ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPULOWERVGPRENCODING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPULOWERVGPRENCODING_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class AMDGPULowerVGPREncodingPass
+ : public PassInfoMixin<AMDGPULowerVGPREncodingPass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPULOWERVGPRENCODING_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index c84a0f6e3138..6acbf52b97de 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -373,6 +373,13 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
MF->getInfo<SIMachineFunctionInfo>(),
*OutStreamer);
+ if (isVerbose() && MI->getOpcode() == AMDGPU::S_SET_VGPR_MSB) {
+ unsigned V = MI->getOperand(0).getImm();
+ OutStreamer->AddComment(
+ " msbs: dst=" + Twine(V >> 6) + " src0=" + Twine(V & 3) +
+ " src1=" + Twine((V >> 2) & 3) + " src2=" + Twine((V >> 4) & 3));
+ }
+
MCInst TmpInst;
MCInstLowering.lower(MI, TmpInst);
EmitToStreamer(*OutStreamer, TmpInst);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
index eda479064d7b..d09b7cffe9f2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
@@ -23,6 +23,7 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI)
AgentSSID = CTX.getOrInsertSyncScopeID("agent");
WorkgroupSSID = CTX.getOrInsertSyncScopeID("workgroup");
WavefrontSSID = CTX.getOrInsertSyncScopeID("wavefront");
+ ClusterSSID = CTX.getOrInsertSyncScopeID("cluster");
SystemOneAddressSpaceSSID =
CTX.getOrInsertSyncScopeID("one-as");
AgentOneAddressSpaceSSID =
@@ -33,4 +34,5 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI)
CTX.getOrInsertSyncScopeID("wavefront-one-as");
SingleThreadOneAddressSpaceSSID =
CTX.getOrInsertSyncScopeID("singlethread-one-as");
+ ClusterOneAddressSpaceSSID = CTX.getOrInsertSyncScopeID("cluster-one-as");
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
index 5c2ecaa65714..bf852bb38376 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
@@ -32,6 +32,8 @@ private:
SyncScope::ID WorkgroupSSID;
/// Wavefront synchronization scope ID (cross address space).
SyncScope::ID WavefrontSSID;
+ /// Cluster synchronization scope ID (cross address space).
+ SyncScope::ID ClusterSSID;
/// System synchronization scope ID (single address space).
SyncScope::ID SystemOneAddressSpaceSSID;
/// Agent synchronization scope ID (single address space).
@@ -42,6 +44,8 @@ private:
SyncScope::ID WavefrontOneAddressSpaceSSID;
/// Single thread synchronization scope ID (single address space).
SyncScope::ID SingleThreadOneAddressSpaceSSID;
+ /// Cluster synchronization scope ID (single address space).
+ SyncScope::ID ClusterOneAddressSpaceSSID;
/// In AMDGPU target synchronization scopes are inclusive, meaning a
/// larger synchronization scope is inclusive of a smaller synchronization
@@ -60,12 +64,15 @@ private:
else if (SSID == getWorkgroupSSID() ||
SSID == getWorkgroupOneAddressSpaceSSID())
return 2;
+ else if (SSID == getClusterSSID() ||
+ SSID == getClusterOneAddressSpaceSSID())
+ return 3;
else if (SSID == getAgentSSID() ||
SSID == getAgentOneAddressSpaceSSID())
- return 3;
+ return 4;
else if (SSID == SyncScope::System ||
SSID == getSystemOneAddressSpaceSSID())
- return 4;
+ return 5;
return std::nullopt;
}
@@ -73,11 +80,12 @@ private:
/// \returns True if \p SSID is restricted to single address space, false
/// otherwise
bool isOneAddressSpace(SyncScope::ID SSID) const {
- return SSID == getSingleThreadOneAddressSpaceSSID() ||
- SSID == getWavefrontOneAddressSpaceSSID() ||
- SSID == getWorkgroupOneAddressSpaceSSID() ||
- SSID == getAgentOneAddressSpaceSSID() ||
- SSID == getSystemOneAddressSpaceSSID();
+ return SSID == getClusterOneAddressSpaceSSID() ||
+ SSID == getSingleThreadOneAddressSpaceSSID() ||
+ SSID == getWavefrontOneAddressSpaceSSID() ||
+ SSID == getWorkgroupOneAddressSpaceSSID() ||
+ SSID == getAgentOneAddressSpaceSSID() ||
+ SSID == getSystemOneAddressSpaceSSID();
}
public:
@@ -95,6 +103,8 @@ public:
SyncScope::ID getWavefrontSSID() const {
return WavefrontSSID;
}
+ /// \returns Cluster synchronization scope ID (cross address space).
+ SyncScope::ID getClusterSSID() const { return ClusterSSID; }
/// \returns System synchronization scope ID (single address space).
SyncScope::ID getSystemOneAddressSpaceSSID() const {
return SystemOneAddressSpaceSSID;
@@ -115,6 +125,10 @@ public:
SyncScope::ID getSingleThreadOneAddressSpaceSSID() const {
return SingleThreadOneAddressSpaceSSID;
}
+ /// \returns Cluster synchronization scope ID (single address space).
+ SyncScope::ID getClusterOneAddressSpaceSSID() const {
+ return ClusterOneAddressSpaceSSID;
+ }
/// In AMDGPU target synchronization scopes are inclusive, meaning a
/// larger synchronization scope is inclusive of a smaller synchronization
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 6ddfa386e8ac..9449e7093091 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -20,6 +20,7 @@ MODULE_PASS("amdgpu-always-inline", AMDGPUAlwaysInlinePass())
MODULE_PASS("amdgpu-export-kernel-runtime-handles", AMDGPUExportKernelRuntimeHandlesPass())
MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
AMDGPULowerBufferFatPointersPass(*this))
+MODULE_PASS("amdgpu-lower-intrinsics", AMDGPULowerIntrinsicsPass(*this))
MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
MODULE_PASS("amdgpu-perf-hint",
@@ -105,6 +106,7 @@ MACHINE_FUNCTION_ANALYSIS("amdgpu-resource-usage", AMDGPUResourceUsageAnalysis(*
#endif
MACHINE_FUNCTION_PASS("amdgpu-insert-delay-alu", AMDGPUInsertDelayAluPass())
MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this))
+MACHINE_FUNCTION_PASS("amdgpu-lower-vgpr-encoding", AMDGPULowerVGPREncodingPass())
MACHINE_FUNCTION_PASS("amdgpu-mark-last-scratch-load", AMDGPUMarkLastScratchLoadPass())
MACHINE_FUNCTION_PASS("amdgpu-pre-ra-long-branch-reg", GCNPreRALongBranchRegPass())
MACHINE_FUNCTION_PASS("amdgpu-reserve-wwm-regs", AMDGPUReserveWWMRegsPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index f226c7f381aa..7dbe1235a98b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -70,7 +70,7 @@ static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
"amdgpu-promote-alloca-to-vector-max-regs",
cl::desc(
"Maximum vector size (in 32b registers) to use when promoting alloca"),
- cl::init(16));
+ cl::init(32));
// Use up to 1/4 of available register budget for vectorization.
// FIXME: Increase the limit for whole function budgets? Perhaps x2?
@@ -287,8 +287,12 @@ void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
// Load per function limits, overriding with global options where appropriate.
+ // R600 register tuples/aliasing are fragile with large vector promotions so
+ // apply architecture specific limit here.
+ const int R600MaxVectorRegs = 16;
MaxVectorRegs = F.getFnAttributeAsParsedInteger(
- "amdgpu-promote-alloca-to-vector-max-regs", PromoteAllocaToVectorMaxRegs);
+ "amdgpu-promote-alloca-to-vector-max-regs",
+ IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
@@ -439,9 +443,10 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
return nullptr;
APInt IndexQuot;
- uint64_t Rem;
- APInt::udivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
- if (Rem != 0)
+ APInt Rem;
+ APInt::sdivrem(ConstOffset, APInt(ConstOffset.getBitWidth(), VecElemSize),
+ IndexQuot, Rem);
+ if (!Rem.isZero())
return nullptr;
if (VarOffsets.size() == 0)
return ConstantInt::get(GEP->getContext(), IndexQuot);
@@ -450,8 +455,10 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
const auto &VarOffset = VarOffsets.front();
APInt OffsetQuot;
- APInt::udivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
- if (Rem != 0 || OffsetQuot.isZero())
+ APInt::sdivrem(VarOffset.second,
+ APInt(VarOffset.second.getBitWidth(), VecElemSize), OffsetQuot,
+ Rem);
+ if (!Rem.isZero() || OffsetQuot.isZero())
return nullptr;
Value *Offset = VarOffset.first;
@@ -461,7 +468,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
if (!OffsetQuot.isOne()) {
ConstantInt *ConstMul =
- ConstantInt::get(OffsetType, OffsetQuot.getZExtValue());
+ ConstantInt::get(OffsetType, OffsetQuot.getSExtValue());
Offset = Builder.CreateMul(Offset, ConstMul);
if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
NewInsts.push_back(NewInst);
@@ -470,8 +477,8 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
return Offset;
ConstantInt *ConstIndex =
- ConstantInt::get(OffsetType, IndexQuot.getZExtValue());
- Value *IndexAdd = Builder.CreateAdd(ConstIndex, Offset);
+ ConstantInt::get(OffsetType, IndexQuot.getSExtValue());
+ Value *IndexAdd = Builder.CreateAdd(Offset, ConstIndex);
if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
NewInsts.push_back(NewInst);
return IndexAdd;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 237929699dd9..36b27bef350e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3322,6 +3322,14 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(B, MI, 6); // soffset
return;
}
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
+ applyDefaultMapping(OpdMapper);
+ constrainOpWithReadfirstlane(B, MI, 5);
+ return;
+ }
case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
applyDefaultMapping(OpdMapper);
@@ -3338,6 +3346,13 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 8); // M0
return;
+ case Intrinsic::amdgcn_cluster_load_b32:
+ case Intrinsic::amdgcn_cluster_load_b64:
+ case Intrinsic::amdgcn_cluster_load_b128: {
+ applyDefaultMapping(OpdMapper);
+ constrainOpWithReadfirstlane(B, MI, 4); // M0
+ return;
+ }
case Intrinsic::amdgcn_s_sleep_var:
assert(OpdMapper.getVRegs(1).empty());
constrainOpWithReadfirstlane(B, MI, 1);
@@ -5466,6 +5481,27 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
break;
}
+ case Intrinsic::amdgcn_cluster_load_b32:
+ case Intrinsic::amdgcn_cluster_load_b64:
+ case Intrinsic::amdgcn_cluster_load_b128: {
+ OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ unsigned M0Bank =
+ getRegBankID(MI.getOperand(4).getReg(), MRI, AMDGPU::SGPRRegBankID);
+ OpdsMapping[4] = AMDGPU::getValueMapping(M0Bank, 32);
+ break;
+ }
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
+ OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ unsigned M0Bank =
+ getRegBankID(MI.getOperand(5).getReg(), MRI, AMDGPU::SGPRRegBankID);
+ OpdsMapping[5] = AMDGPU::getValueMapping(M0Bank, 32);
+ break;
+ }
case Intrinsic::amdgcn_global_store_async_from_lds_b8:
case Intrinsic::amdgcn_global_store_async_from_lds_b32:
case Intrinsic::amdgcn_global_store_async_from_lds_b64:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index 8b1d4ba68a44..21cf9cc6878f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -14,6 +14,10 @@
/// MFMA opcode.
///
/// TODO:
+/// - Handle rewrites of phis. This must be more careful than normal about the
+/// reassignment. We do not want to introduce an AGPR-to-AGPR copy inside of a
+/// loop, so it depends on the exact assignment of the copy.
+///
/// - Update LiveIntervals incrementally instead of recomputing from scratch
///
//===----------------------------------------------------------------------===//
@@ -22,6 +26,7 @@
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -34,6 +39,9 @@ using namespace llvm;
namespace {
+STATISTIC(NumMFMAsRewrittenToAGPR,
+ "Number of MFMA instructions rewritten to use AGPR form");
+
class AMDGPURewriteAGPRCopyMFMAImpl {
MachineFunction &MF;
const GCNSubtarget &ST;
@@ -60,6 +68,25 @@ public:
return TII.isMAI(MI) && AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1;
}
+ /// Find AV_* registers assigned to AGPRs (or virtual registers which were
+ /// already required to be AGPR).
+ ///
+ /// \return the assigned physical register that \p VReg is assigned to if it
+ /// is an AGPR, otherwise MCRegister().
+ MCRegister getAssignedAGPR(Register VReg) const {
+ MCRegister PhysReg = VRM.getPhys(VReg);
+ if (!PhysReg)
+ return MCRegister();
+
+ // If this is an AV register, we have to check if the actual assignment is
+ // to an AGPR
+ const TargetRegisterClass *AssignedRC = TRI.getPhysRegBaseClass(PhysReg);
+ return TRI.isAGPRClass(AssignedRC) ? PhysReg : MCRegister();
+ }
+
+ bool tryReassigningMFMAChain(MachineInstr &MFMA, Register MFMAHintReg,
+ MCPhysReg PhysRegHint) const;
+
/// Compute the register class constraints based on the uses of \p Reg,
/// excluding MFMA uses from which can be rewritten to change the register
/// class constraint. This should be nearly identical to
@@ -74,6 +101,8 @@ public:
Register Reg, SmallVectorImpl<MachineInstr *> &RewriteCandidates,
SmallSetVector<Register, 4> &RewriteRegs) const;
+ bool tryFoldCopiesToAGPR(Register VReg, MCRegister AssignedAGPR) const;
+ bool tryFoldCopiesFromAGPR(Register VReg, MCRegister AssignedAGPR) const;
bool run(MachineFunction &MF) const;
};
@@ -154,6 +183,88 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
return true;
}
+bool AMDGPURewriteAGPRCopyMFMAImpl::tryReassigningMFMAChain(
+ MachineInstr &MFMA, Register MFMAHintReg, MCPhysReg PhysRegHint) const {
+ // src2 and dst have the same physical class constraint; try to preserve
+ // the original src2 subclass if one were to exist.
+ SmallVector<MachineInstr *, 4> RewriteCandidates = {&MFMA};
+ SmallSetVector<Register, 4> RewriteRegs;
+
+ // Make sure we reassign the MFMA we found the copy from first. We want
+ // to ensure dst ends up in the physreg we were originally copying to.
+ RewriteRegs.insert(MFMAHintReg);
+
+ // We've found av = COPY (MFMA) (or MFMA (v = COPY av)) and need to verify
+ // that we can trivially rewrite src2 to use the new AGPR. If we can't
+ // trivially replace it, we're going to induce as many copies as we would have
+ // emitted in the first place, as well as need to assign another register, and
+ // need to figure out where to put them. The live range splitting is smarter
+ // than anything we're doing here, so trust it did something reasonable.
+ //
+ // Note recomputeRegClassExceptRewritable will consider the constraints of
+ // this MFMA's src2 as well as the src2/dst of any transitive MFMA users.
+ if (!recomputeRegClassExceptRewritable(MFMAHintReg, RewriteCandidates,
+ RewriteRegs)) {
+ LLVM_DEBUG(dbgs() << "Could not recompute the regclass of dst reg "
+ << printReg(MFMAHintReg, &TRI) << '\n');
+ return false;
+ }
+
+ // If src2 and dst are different registers, we need to also reassign the
+ // input to an available AGPR if it is compatible with all other uses.
+ //
+ // If we can't reassign it, we'd need to introduce a different copy
+ // which is likely worse than the copy we'd be saving.
+ //
+ // It's likely that the MFMA is used in sequence with other MFMAs; if we
+ // cannot migrate the full use/def chain of MFMAs, we would need to
+ // introduce intermediate copies somewhere. So we only make the
+ // transform if all the interfering MFMAs can also be migrated. Collect
+ // the set of rewritable MFMAs and check if we can assign an AGPR at
+ // that point.
+ //
+ // If any of the MFMAs aren't reassignable, we give up and rollback to
+ // the original register assignments.
+
+ using RecoloringStack =
+ SmallVector<std::pair<const LiveInterval *, MCRegister>, 8>;
+ RecoloringStack TentativeReassignments;
+
+ for (Register RewriteReg : RewriteRegs) {
+ LiveInterval &LI = LIS.getInterval(RewriteReg);
+ TentativeReassignments.push_back({&LI, VRM.getPhys(RewriteReg)});
+ LRM.unassign(LI);
+ }
+
+ if (!attemptReassignmentsToAGPR(RewriteRegs, PhysRegHint)) {
+ // Roll back the register assignments to the original state.
+ for (auto [LI, OldAssign] : TentativeReassignments) {
+ if (VRM.hasPhys(LI->reg()))
+ LRM.unassign(*LI);
+ LRM.assign(*LI, OldAssign);
+ }
+
+ return false;
+ }
+
+ // Fixup the register classes of the virtual registers now that we've
+ // committed to the reassignments.
+ for (Register InterferingReg : RewriteRegs) {
+ const TargetRegisterClass *EquivalentAGPRRegClass =
+ TRI.getEquivalentAGPRClass(MRI.getRegClass(InterferingReg));
+ MRI.setRegClass(InterferingReg, EquivalentAGPRRegClass);
+ }
+
+ for (MachineInstr *RewriteCandidate : RewriteCandidates) {
+ int NewMFMAOp =
+ AMDGPU::getMFMASrcCVDstAGPROp(RewriteCandidate->getOpcode());
+ RewriteCandidate->setDesc(TII.get(NewMFMAOp));
+ ++NumMFMAsRewrittenToAGPR;
+ }
+
+ return true;
+}
+
/// Attempt to reassign the registers in \p InterferingRegs to be AGPRs, with a
/// preference to use \p PhysReg first. Returns false if the reassignments
/// cannot be trivially performed.
@@ -206,140 +317,104 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::attemptReassignmentsToAGPR(
return true;
}
-bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
- // This only applies on subtargets that have a configurable AGPR vs. VGPR
- // allocation.
- if (!ST.hasGFX90AInsts())
- return false;
-
- // Early exit if no AGPRs were assigned.
- if (!LRM.isPhysRegUsed(AMDGPU::AGPR0)) {
- LLVM_DEBUG(dbgs() << "skipping function that did not allocate AGPRs\n");
- return false;
- }
-
+/// Identify copies that look like:
+/// %vdst:vgpr = V_MFMA_.. %src0:av, %src1:av, %src2:vgpr
+/// %agpr = COPY %vgpr
+///
+/// Then try to replace the transitive uses of %src2 and %vdst with the AGPR
+/// versions of the MFMA. This should cover the common case.
+bool AMDGPURewriteAGPRCopyMFMAImpl::tryFoldCopiesToAGPR(
+ Register VReg, MCRegister AssignedAGPR) const {
bool MadeChange = false;
-
- for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
- Register VReg = Register::index2VirtReg(I);
- Register PhysReg = VRM.getPhys(VReg);
- if (!PhysReg)
+ for (MachineInstr &UseMI : MRI.def_instructions(VReg)) {
+ if (!UseMI.isCopy())
continue;
- // Find AV_* registers assigned to AGPRs.
- const TargetRegisterClass *VirtRegRC = MRI.getRegClass(VReg);
- if (!TRI.hasAGPRs(VirtRegRC))
+ Register CopySrcReg = UseMI.getOperand(1).getReg();
+ if (!CopySrcReg.isVirtual())
continue;
- const TargetRegisterClass *AssignedRC = VirtRegRC;
- if (TRI.hasVGPRs(VirtRegRC)) {
- // If this is an AV register, we have to check if the actual assignment is
- // to an AGPR
- AssignedRC = TRI.getPhysRegBaseClass(PhysReg);
- if (!TRI.isAGPRClass(AssignedRC))
- continue;
+ // TODO: Handle loop phis copied to AGPR. e.g.
+ //
+ // loop:
+ // %phi:vgpr = COPY %mfma:vgpr
+ // %mfma:vgpr = V_MFMA_xxx_vgprcd_e64 %a, %b, %phi
+ // s_cbranch_vccnz loop
+ //
+ // endloop:
+ // %agpr = mfma
+ //
+ // We need to be sure that %phi is assigned to the same physical register as
+ // %mfma, or else we will just be moving copies into the loop.
+
+ for (MachineInstr &CopySrcDefMI : MRI.def_instructions(CopySrcReg)) {
+ if (isRewriteCandidate(CopySrcDefMI) &&
+ tryReassigningMFMAChain(
+ CopySrcDefMI, CopySrcDefMI.getOperand(0).getReg(), AssignedAGPR))
+ MadeChange = true;
}
+ }
- LiveInterval &LI = LIS.getInterval(VReg);
-
- for (VNInfo *VNI : LI.vnis()) {
- if (VNI->isPHIDef() || VNI->isUnused())
- continue;
-
- MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def);
- if (!DefMI || !DefMI->isCopy())
- continue;
+ return MadeChange;
+}
- Register MFMADstReg = DefMI->getOperand(1).getReg();
- if (!MFMADstReg.isVirtual())
- continue;
+/// Identify copies that look like:
+/// %src:vgpr = COPY %src:agpr
+/// %vdst:vgpr = V_MFMA_... %src0:av, %src1:av, %src:vgpr
+///
+/// Then try to replace the transitive uses of %src2 and %vdst with the AGPR
+/// versions of the MFMA. This should cover rarer cases, and will generally be
+/// redundant with tryFoldCopiesToAGPR.
+bool AMDGPURewriteAGPRCopyMFMAImpl::tryFoldCopiesFromAGPR(
+ Register VReg, MCRegister AssignedAGPR) const {
+ bool MadeChange = false;
+ for (MachineInstr &UseMI : MRI.use_instructions(VReg)) {
+ if (!UseMI.isCopy())
+ continue;
- LiveInterval &CopySrcLI = LIS.getInterval(MFMADstReg);
- LiveQueryResult LRQ = CopySrcLI.Query(VNI->def.getRegSlot());
- MachineInstr *MFMA = LIS.getInstructionFromIndex(LRQ.valueIn()->def);
- if (!MFMA || !isRewriteCandidate(*MFMA))
+ Register CopyDstReg = UseMI.getOperand(0).getReg();
+ if (!CopyDstReg.isVirtual())
+ continue;
+ for (MachineOperand &CopyUseMO : MRI.reg_nodbg_operands(CopyDstReg)) {
+ if (!CopyUseMO.readsReg())
continue;
- // src2 and dst have the same physical class constraint; try to preserve
- // the original src2 subclass if one were to exist.
- SmallVector<MachineInstr *, 4> RewriteCandidates = {MFMA};
- SmallSetVector<Register, 4> RewriteRegs;
-
- // Make sure we reassign the MFMA we found the copy from first. We want
- // to ensure dst ends up in the physreg we were originally copying to.
- RewriteRegs.insert(MFMADstReg);
-
- // We've found av = COPY (MFMA), and need to verify that we can trivially
- // rewrite src2 to use the new AGPR. If we can't trivially replace it,
- // we're going to induce as many copies as we would have emitted in the
- // first place, as well as need to assign another register, and need to
- // figure out where to put them. The live range splitting is smarter than
- // anything we're doing here, so trust it did something reasonable.
- //
- // Note recomputeRegClassExceptRewritable will consider the constraints of
- // this MFMA's src2 as well as the src2/dst of any transitive MFMA users.
- if (!recomputeRegClassExceptRewritable(MFMADstReg, RewriteCandidates,
- RewriteRegs)) {
- LLVM_DEBUG(dbgs() << "Could not recompute the regclass of dst reg "
- << printReg(MFMADstReg, &TRI) << '\n');
- continue;
+ MachineInstr &CopyUseMI = *CopyUseMO.getParent();
+ if (isRewriteCandidate(CopyUseMI)) {
+ if (tryReassigningMFMAChain(CopyUseMI, CopyDstReg,
+ VRM.getPhys(CopyDstReg)))
+ MadeChange = true;
}
+ }
+ }
- // If src2 and dst are different registers, we need to also reassign the
- // input to an available AGPR if it is compatible with all other uses.
- //
- // If we can't reassign it, we'd need to introduce a different copy
- // which is likely worse than the copy we'd be saving.
- //
- // It's likely that the MFMA is used in sequence with other MFMAs; if we
- // cannot migrate the full use/def chain of MFMAs, we would need to
- // introduce intermediate copies somewhere. So we only make the
- // transform if all the interfering MFMAs can also be migrated. Collect
- // the set of rewritable MFMAs and check if we can assign an AGPR at
- // that point.
- //
- // If any of the MFMAs aren't reassignable, we give up and rollback to
- // the original register assignments.
-
- using RecoloringStack =
- SmallVector<std::pair<const LiveInterval *, MCRegister>, 8>;
- RecoloringStack TentativeReassignments;
-
- for (Register RewriteReg : RewriteRegs) {
- LiveInterval &LI = LIS.getInterval(RewriteReg);
- TentativeReassignments.push_back({&LI, VRM.getPhys(RewriteReg)});
- LRM.unassign(LI);
- }
+ return MadeChange;
+}
- if (!attemptReassignmentsToAGPR(RewriteRegs, PhysReg)) {
- // Roll back the register assignments to the original state.
- for (auto [LI, OldAssign] : TentativeReassignments) {
- if (VRM.hasPhys(LI->reg()))
- LRM.unassign(*LI);
- LRM.assign(*LI, OldAssign);
- }
+bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
+ // This only applies on subtargets that have a configurable AGPR vs. VGPR
+ // allocation.
+ if (!ST.hasGFX90AInsts())
+ return false;
- continue;
- }
+ // Early exit if no AGPRs were assigned.
+ if (!LRM.isPhysRegUsed(AMDGPU::AGPR0)) {
+ LLVM_DEBUG(dbgs() << "skipping function that did not allocate AGPRs\n");
+ return false;
+ }
- // Fixup the register classes of the virtual registers now that we've
- // committed to the reassignments.
- for (Register InterferingReg : RewriteRegs) {
- const TargetRegisterClass *EquivalentAGPRRegClass =
- TRI.getEquivalentAGPRClass(MRI.getRegClass(InterferingReg));
- MRI.setRegClass(InterferingReg, EquivalentAGPRRegClass);
- }
+ bool MadeChange = false;
- for (MachineInstr *RewriteCandidate : RewriteCandidates) {
- int NewMFMAOp =
- AMDGPU::getMFMASrcCVDstAGPROp(RewriteCandidate->getOpcode());
- RewriteCandidate->setDesc(TII.get(NewMFMAOp));
- }
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ Register VReg = Register::index2VirtReg(I);
+ MCRegister AssignedAGPR = getAssignedAGPR(VReg);
+ if (!AssignedAGPR)
+ continue;
- // We likely left an identity copy behind after assignment; let
- // VirtRegRewriter deal with it later.
+ if (tryFoldCopiesToAGPR(VReg, AssignedAGPR))
+ MadeChange = true;
+ if (tryFoldCopiesFromAGPR(VReg, AssignedAGPR))
MadeChange = true;
- }
}
return MadeChange;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index d095fc6cf954..73acb1ddbd2a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -210,18 +210,10 @@ AMDGPUSubtarget::getWavesPerEU(const Function &F) const {
// Default/requested minimum/maximum flat work group sizes.
std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
// Minimum number of bytes allocated in the LDS.
- unsigned LDSBytes = AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
- {0, UINT32_MAX}, true)
- .first;
- return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F);
-}
-
-std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
- const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
- // Minimum number of bytes allocated in the LDS.
- unsigned LDSBytes = AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
- {0, UINT32_MAX}, true)
- .first;
+ unsigned LDSBytes =
+ AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size", {0, UINT32_MAX},
+ /*OnlyFirstRequired=*/true)
+ .first;
return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F);
}
@@ -237,11 +229,31 @@ AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes, LDSBytes);
}
-static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
+std::optional<unsigned>
+AMDGPUSubtarget::getReqdWorkGroupSize(const Function &Kernel,
+ unsigned Dim) const {
auto *Node = Kernel.getMetadata("reqd_work_group_size");
if (Node && Node->getNumOperands() == 3)
return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
- return std::numeric_limits<unsigned>::max();
+ return std::nullopt;
+}
+
+bool AMDGPUSubtarget::hasWavefrontsEvenlySplittingXDim(
+ const Function &F, bool RequiresUniformYZ) const {
+ auto *Node = F.getMetadata("reqd_work_group_size");
+ if (!Node || Node->getNumOperands() != 3)
+ return false;
+ unsigned XLen =
+ mdconst::extract<ConstantInt>(Node->getOperand(0))->getZExtValue();
+ unsigned YLen =
+ mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue();
+ unsigned ZLen =
+ mdconst::extract<ConstantInt>(Node->getOperand(2))->getZExtValue();
+
+ bool Is1D = YLen <= 1 && ZLen <= 1;
+ bool IsXLargeEnough =
+ isPowerOf2_32(XLen) && (!RequiresUniformYZ || XLen >= getWavefrontSize());
+ return Is1D || IsXLargeEnough;
}
bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
@@ -250,9 +262,9 @@ bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
unsigned Dimension) const {
- unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
- if (ReqdSize != std::numeric_limits<unsigned>::max())
- return ReqdSize - 1;
+ std::optional<unsigned> ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
+ if (ReqdSize)
+ return *ReqdSize - 1;
return getFlatWorkGroupSizes(Kernel).second - 1;
}
@@ -303,9 +315,9 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
}
if (Dim <= 3) {
- unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
- if (ReqdSize != std::numeric_limits<unsigned>::max())
- MinSize = MaxSize = ReqdSize;
+ std::optional<unsigned> ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
+ if (ReqdSize)
+ MinSize = MaxSize = *ReqdSize;
}
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 6878744496cf..57b757c990e1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -100,6 +100,26 @@ public:
/// be converted to integer, or violate subtarget's specifications.
std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
+ /// \returns The required size of workgroups that will be used to execute \p F
+ /// in the \p Dim dimension, if it is known (from `!reqd_work_group_size`
+ /// metadata). Otherwise, returns std::nullopt.
+ std::optional<unsigned> getReqdWorkGroupSize(const Function &F,
+ unsigned Dim) const;
+
+ /// \returns true if \p F will execute in a manner that leaves the X
+ /// dimensions of the workitem ID evenly tiling wavefronts - that is, if X /
+ /// wavefrontsize is uniform. This is true if either the Y and Z block
+ /// dimensions are known to always be 1 or if the X dimension will always be a
+ /// power of 2. If \p RequiresUniformYZ is true, it also ensures that the Y and
+ /// Z workitem IDs will be uniform (so, while a (32, 2, 1) launch with
+ /// wavesize64 would ordinarily pass this test, it won't with
+ /// \p RequiresUniformYZ).
+ ///
+ /// This information is currently only gathered from the !reqd_work_group_size
+ /// metadata on \p F, but this may be improved in the future.
+ bool hasWavefrontsEvenlySplittingXDim(const Function &F,
+ bool RequiresUniformYZ = false) const;
+
/// \returns Subtarget's default pair of minimum/maximum number of waves per
/// execution unit for function \p F, or minimum/maximum number of waves per
/// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e969f9ec8889..9afe7590fe4e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -22,6 +22,7 @@
#include "AMDGPUExportKernelRuntimeHandles.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUISelDAGToDAG.h"
+#include "AMDGPULowerVGPREncoding.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPUPreloadKernArgProlog.h"
@@ -577,12 +578,14 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);
initializeAMDGPULowerModuleLDSLegacyPass(*PR);
initializeAMDGPULowerBufferFatPointersPass(*PR);
+ initializeAMDGPULowerIntrinsicsLegacyPass(*PR);
initializeAMDGPUReserveWWMRegsLegacyPass(*PR);
initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
initializeSIAnnotateControlFlowLegacyPass(*PR);
initializeAMDGPUInsertDelayAluLegacyPass(*PR);
+ initializeAMDGPULowerVGPREncodingLegacyPass(*PR);
initializeSIInsertHardClausesLegacyPass(*PR);
initializeSIInsertWaitcntsLegacyPass(*PR);
initializeSIModeRegisterLegacyPass(*PR);
@@ -1418,6 +1421,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
// nodes out of the graph, which leads to function-level passes not
// being run on them, which causes crashes in the resource usage analysis).
addPass(createAMDGPULowerBufferFatPointersPass());
+ addPass(createAMDGPULowerIntrinsicsLegacyPass());
// In accordance with the above FIXME, manually force all the
// function-level passes into a CGSCCPassManager.
addPass(new DummyCGSCCPass());
@@ -1797,6 +1801,8 @@ void GCNPassConfig::addPreEmitPass() {
addPass(&AMDGPUWaitSGPRHazardsLegacyID);
+ addPass(&AMDGPULowerVGPREncodingLegacyID);
+
if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
addPass(&AMDGPUInsertDelayAluID);
@@ -2155,9 +2161,10 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
// nodes out of the graph, which leads to function-level passes not
// being run on them, which causes crashes in the resource usage analysis).
addPass(AMDGPULowerBufferFatPointersPass(TM));
-
addPass.requireCGSCCOrder();
+ addPass(AMDGPULowerIntrinsicsPass(TM));
+
Base::addCodeGenPrepare(addPass);
if (isPassEnabled(EnableLoadStoreVectorizer))
@@ -2383,6 +2390,7 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
// cases.
addPass(PostRAHazardRecognizerPass());
addPass(AMDGPUWaitSGPRHazardsPass());
+ addPass(AMDGPULowerVGPREncodingPass());
if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) {
addPass(AMDGPUInsertDelayAluPass());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 846a0b6280f1..3e2b2c351056 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -22,6 +22,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
@@ -1003,6 +1004,15 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
DstAS == AMDGPUAS::FLAT_ADDRESS &&
ST->hasGloballyAddressableScratch();
}
+ case Intrinsic::amdgcn_workitem_id_y:
+ case Intrinsic::amdgcn_workitem_id_z: {
+ const Function *F = Intrinsic->getFunction();
+ bool HasUniformYZ =
+ ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequiresUniformYZ=*/true);
+ std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
+ *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
+ return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
+ }
default:
return AMDGPU::isIntrinsicSourceOfDivergence(IID);
}
@@ -1049,28 +1059,31 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
// packed into a same wave which gives 1 and 0 after the division by 64
// respectively.
//
- // FIXME: limit it to 1D kernels only, although that shall be possible
- // to perform this optimization is the size of the X dimension is a power
- // of 2, we just do not currently have infrastructure to query it.
+ // The X dimension doesn't reset within a wave if either both the Y
+ // and Z dimensions are of length 1, or if the X dimension's required
+ // size is a power of 2. Note, however, if the X dimension's maximum
+ // size is a power of 2 < the wavefront size, division by the wavefront
+ // size is guaranteed to yield 0, so this is also a no-reset case.
+ bool XDimDoesntResetWithinWaves = false;
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ const Function *F = I->getFunction();
+ XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);
+ }
using namespace llvm::PatternMatch;
uint64_t C;
if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
m_ConstantInt(C))) ||
match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
m_ConstantInt(C)))) {
- const Function *F = cast<Instruction>(V)->getFunction();
- return C >= ST->getWavefrontSizeLog2() &&
- ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
+ return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
}
Value *Mask;
if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
m_Value(Mask)))) {
- const Function *F = cast<Instruction>(V)->getFunction();
- const DataLayout &DL = F->getDataLayout();
return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
ST->getWavefrontSizeLog2() &&
- ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
+ XDimDoesntResetWithinWaves;
}
const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 2e21ba4c30b5..e420f2ad676f 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -564,6 +564,14 @@ public:
return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32);
}
+ bool isVCSrc_b32_Lo256() const {
+ return isRegOrInlineNoMods(AMDGPU::VS_32_Lo256RegClassID, MVT::i32);
+ }
+
+ bool isVCSrc_b64_Lo256() const {
+ return isRegOrInlineNoMods(AMDGPU::VS_64_Lo256RegClassID, MVT::i64);
+ }
+
bool isVCSrc_b64() const {
return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::i64);
}
@@ -1007,7 +1015,7 @@ public:
bool isEndpgm() const;
auto getPredicate(std::function<bool(const AMDGPUOperand &Op)> P) const {
- return [=](){ return P(*this); };
+ return [this, P]() { return P(*this); };
}
StringRef getToken() const {
@@ -1886,6 +1894,7 @@ private:
bool validateTHAndScopeBits(const MCInst &Inst, const OperandVector &Operands,
const unsigned CPol);
bool validateTFE(const MCInst &Inst, const OperandVector &Operands);
+ bool validateSetVgprMSB(const MCInst &Inst, const OperandVector &Operands);
std::optional<StringRef> validateLdsDirect(const MCInst &Inst);
bool validateWMMA(const MCInst &Inst, const OperandVector &Operands);
unsigned getConstantBusLimit(unsigned Opcode) const;
@@ -2985,7 +2994,12 @@ MCRegister AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, unsigned RegNum,
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
const MCRegisterClass RC = TRI->getRegClass(RCID);
- if (RegIdx >= RC.getNumRegs()) {
+ if (RegIdx >= RC.getNumRegs() || (RegKind == IS_VGPR && RegIdx > 255)) {
+ Error(Loc, "register index is out of range");
+ return AMDGPU::NoRegister;
+ }
+
+ if (RegKind == IS_VGPR && !isGFX1250() && RegIdx + RegWidth / 32 > 256) {
Error(Loc, "register index is out of range");
return MCRegister();
}
@@ -4768,12 +4782,14 @@ bool AMDGPUAsmParser::validateOffset(const MCInst &Inst,
return validateSMEMOffset(Inst, Operands);
const auto &Op = Inst.getOperand(OpNum);
+ // GFX12+ buffer ops: InstOffset is signed 24-bit, but must not be negative.
if (isGFX12Plus() &&
(TSFlags & (SIInstrFlags::MUBUF | SIInstrFlags::MTBUF))) {
const unsigned OffsetSize = 24;
- if (!isIntN(OffsetSize, Op.getImm())) {
+ if (!isUIntN(OffsetSize - 1, Op.getImm())) {
Error(getFlatOffsetLoc(Operands),
- Twine("expected a ") + Twine(OffsetSize) + "-bit signed offset");
+ Twine("expected a ") + Twine(OffsetSize - 1) +
+ "-bit unsigned offset for buffer ops");
return false;
}
} else {
@@ -4856,7 +4872,9 @@ bool AMDGPUAsmParser::validateSMEMOffset(const MCInst &Inst,
return true;
Error(getSMEMOffsetLoc(Operands),
- isGFX12Plus() ? "expected a 24-bit signed offset"
+ isGFX12Plus() && IsBuffer
+ ? "expected a 23-bit unsigned offset for buffer ops"
+ : isGFX12Plus() ? "expected a 24-bit signed offset"
: (isVI() || IsBuffer) ? "expected a 20-bit unsigned offset"
: "expected a 21-bit signed offset");
@@ -5216,7 +5234,7 @@ bool AMDGPUAsmParser::validateAGPRLdSt(const MCInst &Inst) const {
bool AMDGPUAsmParser::validateVGPRAlign(const MCInst &Inst) const {
auto FB = getFeatureBits();
- if (!FB[AMDGPU::FeatureGFX90AInsts] && !FB[AMDGPU::FeatureGFX1250Insts])
+ if (!FB[AMDGPU::FeatureRequiresAlignedVGPRs])
return true;
unsigned Opc = Inst.getOpcode();
@@ -5542,6 +5560,22 @@ bool AMDGPUAsmParser::validateTFE(const MCInst &Inst,
return true;
}
+bool AMDGPUAsmParser::validateSetVgprMSB(const MCInst &Inst,
+ const OperandVector &Operands) {
+ if (Inst.getOpcode() != AMDGPU::S_SET_VGPR_MSB_gfx12)
+ return true;
+
+ int Simm16Pos =
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::simm16);
+ if ((unsigned)Inst.getOperand(Simm16Pos).getImm() > 255) {
+ SMLoc Loc = Operands[1]->getStartLoc();
+ Error(Loc, "s_set_vgpr_msb accepts values in range [0..255]");
+ return false;
+ }
+
+ return true;
+}
+
bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst,
const OperandVector &Operands) {
unsigned Opc = Inst.getOpcode();
@@ -5706,6 +5740,9 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
if (!validateTFE(Inst, Operands)) {
return false;
}
+ if (!validateSetVgprMSB(Inst, Operands)) {
+ return false;
+ }
if (!validateWMMA(Inst, Operands)) {
return false;
}
@@ -5799,6 +5836,7 @@ bool AMDGPUAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
uint64_t &ErrorInfo,
bool MatchingInlineAsm) {
MCInst Inst;
+ Inst.setLoc(IDLoc);
unsigned Result = Match_Success;
for (auto Variant : getMatchedVariants()) {
uint64_t EI;
@@ -5822,7 +5860,6 @@ bool AMDGPUAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
if (!validateInstruction(Inst, IDLoc, Operands)) {
return true;
}
- Inst.setLoc(IDLoc);
Out.emitInstruction(Inst, getSTI());
return false;
}
@@ -6144,12 +6181,6 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
ExprVal, ValRange);
if (Val)
ImpliedUserSGPRCount += 1;
- } else if (ID == ".amdhsa_uses_cu_stores") {
- if (!isGFX1250())
- return Error(IDRange.Start, "directive requires gfx12.5", IDRange);
-
- PARSE_BITS_ENTRY(KD.kernel_code_properties,
- KERNEL_CODE_PROPERTY_USES_CU_STORES, ExprVal, ValRange);
} else if (ID == ".amdhsa_wavefront_size32") {
EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
if (IVersion.Major < 10)
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 1956a15c57d6..f229298ba516 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -132,7 +132,6 @@ class MTBUF_Real <MTBUF_Pseudo ps, string real_name = ps.Mnemonic> :
let OtherPredicates = ps.OtherPredicates;
let AsmMatchConverter = ps.AsmMatchConverter;
let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
let SchedRW = ps.SchedRW;
let mayLoad = ps.mayLoad;
@@ -159,11 +158,10 @@ class MTBUF_Real <MTBUF_Pseudo ps, string real_name = ps.Mnemonic> :
bits<1> acc = !if(ps.has_vdata, vdata{9}, 0);
}
-class getMTBUFInsDA<list<RegisterClass> vdataList,
+class getMTBUFInsDA<list<RegisterOperand> vdataList,
list<RegisterClass> vaddrList=[], bit hasRestrictedSOffset> {
- RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
+ RegisterOperand vdata_op = !if(!empty(vdataList), ?, !head(vdataList));
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
- RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret;
dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset),
(ins SCSrc_b32:$soffset));
@@ -179,7 +177,7 @@ class getMTBUFInsDA<list<RegisterClass> vdataList,
!con((ins vdata_op:$vdata), Inputs));
}
-class getMTBUFIns<int addrKind, list<RegisterClass> vdataList=[], bit hasRestrictedSOffset> {
+class getMTBUFIns<int addrKind, list<RegisterOperand> vdataList=[], bit hasRestrictedSOffset> {
dag ret =
!if(!eq(addrKind, BUFAddrKind.Offset), getMTBUFInsDA<vdataList, [], hasRestrictedSOffset>.ret,
!if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA<vdataList, [VGPR_32], hasRestrictedSOffset>.ret,
@@ -218,25 +216,23 @@ class MTBUF_SetupAddr<int addrKind> {
class MTBUF_Load_Pseudo <string opName,
int addrKind,
- RegisterClass vdataClass,
+ RegisterOperand vdataClass,
int elems,
bit hasRestrictedSOffset = 0,
- list<dag> pattern=[],
- // Workaround bug bz30254
- int addrKindCopy = addrKind>
+ list<dag> pattern=[]>
: MTBUF_Pseudo<opName,
- (outs getLdStRegisterOperand<vdataClass>.ret:$vdata),
- getMTBUFIns<addrKindCopy, [], hasRestrictedSOffset>.ret,
- getMTBUFAsmOps<addrKindCopy>.ret,
+ (outs vdataClass:$vdata),
+ getMTBUFIns<addrKind, [], hasRestrictedSOffset>.ret,
+ getMTBUFAsmOps<addrKind>.ret,
pattern>,
- MTBUF_SetupAddr<addrKindCopy> {
- let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
+ MTBUF_SetupAddr<addrKind> {
+ let PseudoInstr = opName # "_" # getAddrName<addrKind>.ret;
let mayLoad = 1;
let mayStore = 0;
let elements = elems;
}
-multiclass MTBUF_Pseudo_Loads_Helper<string opName, RegisterClass vdataClass,
+multiclass MTBUF_Pseudo_Loads_Helper<string opName, RegisterOperand vdataClass,
int elems, bit hasRestrictedSOffset> {
def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems, hasRestrictedSOffset>,
@@ -257,7 +253,7 @@ multiclass MTBUF_Pseudo_Loads_Helper<string opName, RegisterClass vdataClass,
}
}
-multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
+multiclass MTBUF_Pseudo_Loads<string opName, RegisterOperand vdataClass,
int elems> {
defm NAME : MTBUF_Pseudo_Loads_Helper<opName, vdataClass, elems, 0>;
defm _VBUFFER : MTBUF_Pseudo_Loads_Helper<opName, vdataClass, elems, 1>;
@@ -265,26 +261,23 @@ multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
class MTBUF_Store_Pseudo <string opName,
int addrKind,
- RegisterClass vdataClass,
+ RegisterOperand vdataClass,
int elems,
bit hasRestrictedSOffset = 0,
- list<dag> pattern=[],
- // Workaround bug bz30254
- int addrKindCopy = addrKind,
- RegisterClass vdataClassCopy = vdataClass>
+ list<dag> pattern=[]>
: MTBUF_Pseudo<opName,
(outs),
- getMTBUFIns<addrKindCopy, [vdataClassCopy], hasRestrictedSOffset>.ret,
- getMTBUFAsmOps<addrKindCopy>.ret,
+ getMTBUFIns<addrKind, [vdataClass], hasRestrictedSOffset>.ret,
+ getMTBUFAsmOps<addrKind>.ret,
pattern>,
- MTBUF_SetupAddr<addrKindCopy> {
- let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
+ MTBUF_SetupAddr<addrKind> {
+ let PseudoInstr = opName # "_" # getAddrName<addrKind>.ret;
let mayLoad = 0;
let mayStore = 1;
let elements = elems;
}
-multiclass MTBUF_Pseudo_Stores_Helper<string opName, RegisterClass vdataClass,
+multiclass MTBUF_Pseudo_Stores_Helper<string opName, RegisterOperand vdataClass,
int elems, bit hasRestrictedSOffset> {
def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems, hasRestrictedSOffset>,
@@ -305,7 +298,7 @@ multiclass MTBUF_Pseudo_Stores_Helper<string opName, RegisterClass vdataClass,
}
}
-multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
+multiclass MTBUF_Pseudo_Stores<string opName, RegisterOperand vdataClass,
int elems> {
defm NAME : MTBUF_Pseudo_Stores_Helper<opName, vdataClass, elems, 0>;
defm _VBUFFER : MTBUF_Pseudo_Stores_Helper<opName, vdataClass, elems, 1>;
@@ -346,7 +339,6 @@ class MUBUF_Real <MUBUF_Pseudo ps, string real_name = ps.Mnemonic> :
let AsmMatchConverter = ps.AsmMatchConverter;
let OtherPredicates = ps.OtherPredicates;
let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let SchedRW = ps.SchedRW;
@@ -401,21 +393,29 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> :
let sccb_value = 0;
}
-class getLdStVDataRegisterOperand<RegisterClass RC, bit isTFE> {
- RegisterOperand tfeVDataOp =
- !cond(!eq(RC.Size, 32) : AVLdSt_64,
- !eq(RC.Size, 64) : AVLdSt_96,
- !eq(RC.Size, 96) : AVLdSt_128,
- !eq(RC.Size, 128) : AVLdSt_160);
+class getBUFVDataRegisterOperand<int Size, bit isTFE> {
+ defvar tfeVDataOp =
+ !cond(!eq(Size, 16) : AVLdSt_64,
+ !eq(Size, 32) : AVLdSt_64,
+ !eq(Size, 64) : AVLdSt_96,
+ !eq(Size, 96) : AVLdSt_128,
+ !eq(Size, 128) : AVLdSt_160);
+
+ defvar VDataOp =
+ !cond(!eq(Size, 16) : AVLdSt_32,
+ !eq(Size, 32) : AVLdSt_32,
+ !eq(Size, 64) : AVLdSt_64,
+ !eq(Size, 96) : AVLdSt_96,
+ !eq(Size, 128) : AVLdSt_128);
- RegisterOperand ret = !if(isTFE, tfeVDataOp, getLdStRegisterOperand<RC>.ret);
+ RegisterOperand ret = !if(isTFE, tfeVDataOp, VDataOp);
}
-class getMUBUFInsDA<list<RegisterClass> vdataList,
+class getMUBUFInsDA<list<RegisterOperand> vdataList,
list<RegisterClass> vaddrList, bit isTFE, bit hasRestrictedSOffset> {
- RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
+ RegisterOperand vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
- RegisterOperand vdata_op = getLdStVDataRegisterOperand<vdataClass, isTFE>.ret;
+ RegisterOperand vdata_op = getBUFVDataRegisterOperand<vdataClass.RegClass.Size, isTFE>.ret;
dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset));
dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz));
@@ -443,7 +443,7 @@ class getMUBUFElements<ValueType vt> {
);
}
-class getMUBUFIns<int addrKind, list<RegisterClass> vdataList, bit isTFE, bit hasRestrictedSOffset> {
+class getMUBUFIns<int addrKind, list<RegisterOperand> vdataList, bit isTFE, bit hasRestrictedSOffset> {
dag ret =
!if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isTFE, hasRestrictedSOffset>.ret,
!if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset>.ret,
@@ -491,19 +491,16 @@ class MUBUF_Load_Pseudo <string opName,
bit isTFE = 0,
bit hasRestrictedSOffset = 0,
list<dag> pattern=[],
- // Workaround bug bz30254
- int addrKindCopy = addrKind,
- RegisterClass vdata_rc = getVregSrcForVT<vdata_vt>.ret.RegClass,
- RegisterOperand vdata_op = getLdStVDataRegisterOperand<vdata_rc, isTFE>.ret>
+ RegisterOperand vdata_op = getBUFVDataRegisterOperand<vdata_vt.Size, isTFE>.ret>
: MUBUF_Pseudo<opName,
!if(!or(isLds, isLdsOpc), (outs), (outs vdata_op:$vdata)),
- !con(getMUBUFIns<addrKindCopy, [], isTFE, hasRestrictedSOffset>.ret,
+ !con(getMUBUFIns<addrKind, [], isTFE, hasRestrictedSOffset>.ret,
!if(HasTiedDest, (ins vdata_op:$vdata_in), (ins))),
- getMUBUFAsmOps<addrKindCopy, !or(isLds, isLdsOpc), isLds, isTFE>.ret,
+ getMUBUFAsmOps<addrKind, !or(isLds, isLdsOpc), isLds, isTFE>.ret,
pattern>,
- MUBUF_SetupAddr<addrKindCopy> {
+ MUBUF_SetupAddr<addrKind> {
let PseudoInstr = opName # !if(isLds, "_lds", "") # !if(isTFE, "_tfe", "") #
- "_" # getAddrName<addrKindCopy>.ret;
+ "_" # getAddrName<addrKind>.ret;
let AsmMatchConverter = "cvtMubuf";
let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", "");
@@ -593,17 +590,15 @@ class MUBUF_Store_Pseudo <string opName,
ValueType store_vt,
bit isTFE = 0,
bit hasRestrictedSOffset = 0,
- list<dag> pattern=[],
- // Workaround bug bz30254
- int addrKindCopy = addrKind>
+ list<dag> pattern=[]>
: MUBUF_Pseudo<opName,
(outs),
- getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret.RegClass], isTFE, hasRestrictedSOffset>.ret,
- getMUBUFAsmOps<addrKindCopy, 0, 0, isTFE>.ret,
+ getMUBUFIns<addrKind, [getVregSrcForVT<store_vt>.ret], isTFE, hasRestrictedSOffset>.ret,
+ getMUBUFAsmOps<addrKind, 0, 0, isTFE>.ret,
pattern>,
- MUBUF_SetupAddr<addrKindCopy> {
+ MUBUF_SetupAddr<addrKind> {
let PseudoInstr = opName # "_" # !if(isTFE, "_tfe", "") #
- getAddrName<addrKindCopy>.ret;
+ getAddrName<addrKind>.ret;
let mayLoad = 0;
let mayStore = 1;
let elements = getMUBUFElements<store_vt>.ret;
@@ -676,10 +671,9 @@ class MUBUF_Pseudo_Store_Lds<string opName>
let AsmMatchConverter = "cvtMubuf";
}
-class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in, bit hasRestrictedSOffset,
+class getMUBUFAtomicInsDA<RegisterOperand vdata_op, bit vdata_in, bit hasRestrictedSOffset,
list<RegisterClass> vaddrList=[]> {
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
- RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret;
dag VData = !if(vdata_in, (ins vdata_op:$vdata_in), (ins vdata_op:$vdata));
dag Data = !if(!empty(vaddrList), VData, !con(VData, (ins vaddrClass:$vaddr)));
@@ -692,22 +686,20 @@ class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in, bit hasRestric
}
class getMUBUFAtomicIns<int addrKind,
- RegisterClass vdataClass,
+ RegisterOperand vdataClass,
bit vdata_in,
- bit hasRestrictedSOffset,
- // Workaround bug bz30254
- RegisterClass vdataClassCopy=vdataClass> {
+ bit hasRestrictedSOffset> {
dag ret =
!if(!eq(addrKind, BUFAddrKind.Offset),
- getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasRestrictedSOffset>.ret,
+ getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset>.ret,
!if(!eq(addrKind, BUFAddrKind.OffEn),
- getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret,
+ getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret,
!if(!eq(addrKind, BUFAddrKind.IdxEn),
- getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret,
+ getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret,
!if(!eq(addrKind, BUFAddrKind.BothEn),
- getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasRestrictedSOffset, [VReg_64]>.ret,
+ getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64]>.ret,
!if(!eq(addrKind, BUFAddrKind.Addr64),
- getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasRestrictedSOffset, [VReg_64]>.ret,
+ getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64]>.ret,
(ins))))));
}
@@ -716,11 +708,9 @@ class MUBUF_Atomic_Pseudo<string opName,
dag outs,
dag ins,
string asmOps,
- list<dag> pattern=[],
- // Workaround bug bz30254
- int addrKindCopy = addrKind>
+ list<dag> pattern=[]>
: MUBUF_Pseudo<opName, outs, ins, asmOps, pattern>,
- MUBUF_SetupAddr<addrKindCopy> {
+ MUBUF_SetupAddr<addrKind> {
let mayStore = 1;
let mayLoad = 1;
let hasSideEffects = 1;
@@ -732,18 +722,15 @@ class MUBUF_Atomic_Pseudo<string opName,
}
class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind,
- RegisterClass vdataClass,
+ RegisterOperand vdataClass,
bit hasRestrictedSOffset = 0,
- list<dag> pattern=[],
- // Workaround bug bz30254
- int addrKindCopy = addrKind,
- RegisterClass vdataClassCopy = vdataClass>
- : MUBUF_Atomic_Pseudo<opName, addrKindCopy,
+ list<dag> pattern=[]>
+ : MUBUF_Atomic_Pseudo<opName, addrKind,
(outs),
- getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 0, hasRestrictedSOffset>.ret,
- getMUBUFAsmOps<addrKindCopy>.ret,
+ getMUBUFAtomicIns<addrKind, vdataClass, 0, hasRestrictedSOffset>.ret,
+ getMUBUFAsmOps<addrKind>.ret,
pattern> {
- let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
+ let PseudoInstr = opName # "_" # getAddrName<addrKind>.ret;
let glc_value = 0;
let dlc_value = 0;
let sccb_value = 0;
@@ -751,29 +738,24 @@ class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind,
}
class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
- RegisterClass vdataClass,
+ RegisterOperand vdata_op,
bit hasRestrictedSOffset = 0,
- list<dag> pattern=[],
- // Workaround bug bz30254
- int addrKindCopy = addrKind,
- RegisterClass vdataClassCopy = vdataClass,
- RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret>
- : MUBUF_Atomic_Pseudo<opName, addrKindCopy,
+ list<dag> pattern=[]>
+ : MUBUF_Atomic_Pseudo<opName, addrKind,
(outs vdata_op:$vdata),
- getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 1, hasRestrictedSOffset>.ret,
- getMUBUFAsmOps<addrKindCopy>.ret,
+ getMUBUFAtomicIns<addrKind, vdata_op, 1, hasRestrictedSOffset>.ret,
+ getMUBUFAsmOps<addrKind>.ret,
pattern> {
- let PseudoInstr = opName # "_rtn_" # getAddrName<addrKindCopy>.ret;
+ let PseudoInstr = opName # "_rtn_" # getAddrName<addrKind>.ret;
let glc_value = 1;
let dlc_value = 0;
let sccb_value = 0;
let IsAtomicRet = 1;
let Constraints = "$vdata = $vdata_in";
- let DisableEncoding = "$vdata_in";
}
multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
- RegisterClass vdataClass,
+ RegisterOperand vdataClass,
ValueType vdataType> {
let FPAtomic = vdataType.isFP in {
def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0>,
@@ -795,7 +777,7 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
}
multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
- RegisterClass vdataClass,
+ RegisterOperand vdataClass,
ValueType vdataType,
SDPatternOperator atomic> {
let FPAtomic = vdataType.isFP in {
@@ -834,7 +816,7 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
}
multiclass MUBUF_Pseudo_Atomics <string opName,
- RegisterClass vdataClass,
+ RegisterOperand vdataClass,
ValueType vdataType,
SDPatternOperator atomic = null_frag> :
MUBUF_Pseudo_Atomics_NO_RTN<opName, vdataClass, vdataType>,
@@ -1029,87 +1011,87 @@ defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX4", vt, store_global>;
}
defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics <
- "buffer_atomic_swap", VGPR_32, i32
+ "buffer_atomic_swap", AVLdSt_32, i32
>;
defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Pseudo_Atomics <
- "buffer_atomic_cmpswap", VReg_64, v2i32
+ "buffer_atomic_cmpswap", AVLdSt_64, v2i32
>;
defm BUFFER_ATOMIC_ADD : MUBUF_Pseudo_Atomics <
- "buffer_atomic_add", VGPR_32, i32
+ "buffer_atomic_add", AVLdSt_32, i32
>;
defm BUFFER_ATOMIC_SUB : MUBUF_Pseudo_Atomics <
- "buffer_atomic_sub", VGPR_32, i32
+ "buffer_atomic_sub", AVLdSt_32, i32
>;
defm BUFFER_ATOMIC_SMIN : MUBUF_Pseudo_Atomics <
- "buffer_atomic_smin", VGPR_32, i32
+ "buffer_atomic_smin", AVLdSt_32, i32
>;
defm BUFFER_ATOMIC_UMIN : MUBUF_Pseudo_Atomics <
- "buffer_atomic_umin", VGPR_32, i32
+ "buffer_atomic_umin", AVLdSt_32, i32
>;
defm BUFFER_ATOMIC_SMAX : MUBUF_Pseudo_Atomics <
- "buffer_atomic_smax", VGPR_32, i32
+ "buffer_atomic_smax", AVLdSt_32, i32
>;
defm BUFFER_ATOMIC_UMAX : MUBUF_Pseudo_Atomics <
- "buffer_atomic_umax", VGPR_32, i32
+ "buffer_atomic_umax", AVLdSt_32, i32
>;
defm BUFFER_ATOMIC_AND : MUBUF_Pseudo_Atomics <
- "buffer_atomic_and", VGPR_32, i32
+ "buffer_atomic_and", AVLdSt_32, i32
>;
defm BUFFER_ATOMIC_OR : MUBUF_Pseudo_Atomics <
- "buffer_atomic_or", VGPR_32, i32
+ "buffer_atomic_or", AVLdSt_32, i32
>;
defm BUFFER_ATOMIC_XOR : MUBUF_Pseudo_Atomics <
- "buffer_atomic_xor", VGPR_32, i32
+ "buffer_atomic_xor", AVLdSt_32, i32
>;
defm BUFFER_ATOMIC_INC : MUBUF_Pseudo_Atomics <
- "buffer_atomic_inc", VGPR_32, i32
+ "buffer_atomic_inc", AVLdSt_32, i32
>;
defm BUFFER_ATOMIC_DEC : MUBUF_Pseudo_Atomics <
- "buffer_atomic_dec", VGPR_32, i32
+ "buffer_atomic_dec", AVLdSt_32, i32
>;
defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_swap_x2", VReg_64, i64
+ "buffer_atomic_swap_x2", AVLdSt_64, i64
>;
defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_cmpswap_x2", VReg_128, v2i64
+ "buffer_atomic_cmpswap_x2", AVLdSt_128, v2i64
>;
defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_add_x2", VReg_64, i64
+ "buffer_atomic_add_x2", AVLdSt_64, i64
>;
defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_sub_x2", VReg_64, i64
+ "buffer_atomic_sub_x2", AVLdSt_64, i64
>;
defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_smin_x2", VReg_64, i64
+ "buffer_atomic_smin_x2", AVLdSt_64, i64
>;
defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_umin_x2", VReg_64, i64
+ "buffer_atomic_umin_x2", AVLdSt_64, i64
>;
defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_smax_x2", VReg_64, i64
+ "buffer_atomic_smax_x2", AVLdSt_64, i64
>;
defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_umax_x2", VReg_64, i64
+ "buffer_atomic_umax_x2", AVLdSt_64, i64
>;
defm BUFFER_ATOMIC_AND_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_and_x2", VReg_64, i64
+ "buffer_atomic_and_x2", AVLdSt_64, i64
>;
defm BUFFER_ATOMIC_OR_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_or_x2", VReg_64, i64
+ "buffer_atomic_or_x2", AVLdSt_64, i64
>;
defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_xor_x2", VReg_64, i64
+ "buffer_atomic_xor_x2", AVLdSt_64, i64
>;
defm BUFFER_ATOMIC_INC_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_inc_x2", VReg_64, i64
+ "buffer_atomic_inc_x2", AVLdSt_64, i64
>;
defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_dec_x2", VReg_64, i64
+ "buffer_atomic_dec_x2", AVLdSt_64, i64
>;
let OtherPredicates = [HasGFX10_BEncoding] in {
defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics <
- "buffer_atomic_csub", VGPR_32, i32, int_amdgcn_global_atomic_csub
+ "buffer_atomic_csub", VGPROp_32, i32, int_amdgcn_global_atomic_csub
>;
}
@@ -1130,22 +1112,22 @@ def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc",
let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <
- "buffer_atomic_fcmpswap", VReg_64, v2f32, null_frag
+ "buffer_atomic_fcmpswap", AVLdSt_64, v2f32, null_frag
>;
}
let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in {
defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics <
- "buffer_atomic_fmin", VGPR_32, f32, null_frag
+ "buffer_atomic_fmin", AVLdSt_32, f32, null_frag
>;
defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics <
- "buffer_atomic_fmax", VGPR_32, f32, null_frag
+ "buffer_atomic_fmax", AVLdSt_32, f32, null_frag
>;
}
let SubtargetPredicate = isGFX6GFX7GFX10 in {
defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_fcmpswap_x2", VReg_128, v2f64, null_frag
+ "buffer_atomic_fcmpswap_x2", VGPROp_128, v2f64, null_frag
>;
}
@@ -1204,34 +1186,34 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <
let SubtargetPredicate = HasAtomicFaddNoRtnInsts in
defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN<
- "buffer_atomic_add_f32", VGPR_32, f32
+ "buffer_atomic_add_f32", AVLdSt_32, f32
>;
let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
- "buffer_atomic_pk_add_f16", VGPR_32, v2f16
+ "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16
>;
let SubtargetPredicate = HasAtomicFaddRtnInsts in
defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN<
- "buffer_atomic_add_f32", VGPR_32, f32, null_frag
+ "buffer_atomic_add_f32", AVLdSt_32, f32, null_frag
>;
let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN <
- "buffer_atomic_pk_add_f16", VGPR_32, v2f16, null_frag
+ "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16, null_frag
>;
let SubtargetPredicate = isGFX12Plus in {
defm BUFFER_ATOMIC_COND_SUB_U32 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_cond_sub_u32", VGPR_32, i32
+ "buffer_atomic_cond_sub_u32", VGPROp_32, i32
>;
}
let SubtargetPredicate = HasAtomicBufferPkAddBF16Inst in {
let FPAtomic = 1 in
defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_pk_add_bf16", VGPR_32, v2bf16
+ "buffer_atomic_pk_add_bf16", AVLdSt_32, v2bf16
>;
}
@@ -1239,39 +1221,39 @@ defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Pseudo_Atomics <
// MTBUF Instructions
//===----------------------------------------------------------------------===//
let OtherPredicates = [HasMTBUFInsts] in {
-defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32, 1>;
-defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64, 2>;
-defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96, 3>;
-defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128, 4>;
-defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32, 1>;
-defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64, 2>;
-defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96, 3>;
-defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128, 4>;
+defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", AVLdSt_32, 1>;
+defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", AVLdSt_64, 2>;
+defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", AVLdSt_96, 3>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", AVLdSt_128, 4>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", AVLdSt_32, 1>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", AVLdSt_64, 2>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", AVLdSt_96, 3>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", AVLdSt_128, 4>;
let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in {
let TiedSourceNotRead = 1 in {
- defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>;
- defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64, 2>;
- defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96, 3>;
- defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128, 4>;
-}
- defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>;
- defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64, 2>;
- defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96, 3>;
- defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128, 4>;
+ defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", AVLdSt_32, 1>;
+ defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", AVLdSt_64, 2>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", AVLdSt_96, 3>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", AVLdSt_128, 4>;
+}
+ defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", AVLdSt_32, 1>;
+ defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", AVLdSt_64, 2>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", AVLdSt_96, 3>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", AVLdSt_128, 4>;
} // End HasUnpackedD16VMem.
let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in {
let TiedSourceNotRead = 1 in {
- defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>;
- defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32, 2>;
- defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64, 3>;
- defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64, 4>;
-}
- defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>;
- defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32, 2>;
- defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64, 3>;
- defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64, 4>;
+ defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", AVLdSt_32, 1>;
+ defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", AVLdSt_32, 2>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", AVLdSt_64, 3>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", AVLdSt_64, 4>;
+}
+ defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", AVLdSt_32, 1>;
+ defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", AVLdSt_32, 2>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", AVLdSt_64, 3>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", AVLdSt_64, 4>;
} // End HasPackedD16VMem.
} // End HasMTBUFInsts.
@@ -1300,14 +1282,14 @@ let SubtargetPredicate = isGFX90APlus in {
} // End SubtargetPredicate = isGFX90APlus
let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in {
- defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64>;
+ defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", AVLdSt_64, f64>;
} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
// Note the names can be buffer_atomic_fmin_x2/buffer_atomic_fmax_x2
// depending on some subtargets.
- defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64>;
- defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64>;
+ defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", AVLdSt_64, f64>;
+ defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", AVLdSt_64, f64>;
}
def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> {
@@ -2414,7 +2396,6 @@ class VBUFFER_Real <bits<8> op, BUF_Pseudo ps, string real_name> :
let AsmMatchConverter = ps.AsmMatchConverter;
let OtherPredicates = ps.OtherPredicates;
let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let SchedRW = ps.SchedRW;
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index dc9dd220130e..aae56eef73ed 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -6,7 +6,10 @@ tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher)
tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv)
tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler
+ --specialize-decoders-per-bitwidth
+ -ignore-non-decodable-operands
+ -ignore-fully-defined-operands)
tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info)
tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM AMDGPUGenMCPseudoLowering.inc -gen-pseudo-lowering)
@@ -71,6 +74,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUImageIntrinsicOptimizer.cpp
AMDGPULibFunc.cpp
AMDGPULowerBufferFatPointers.cpp
+ AMDGPULowerIntrinsics.cpp
AMDGPULowerKernelArguments.cpp
AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp
@@ -82,6 +86,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUMCInstLower.cpp
AMDGPUMemoryUtils.cpp
AMDGPUIGroupLP.cpp
+ AMDGPULowerVGPREncoding.cpp
AMDGPUMCResourceInfo.cpp
AMDGPUMarkLastScratchLoad.cpp
AMDGPUMIRFormatter.cpp
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 3ff675d6e5e9..f2e432fa8d7f 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -8,7 +8,7 @@
class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> :
InstSI <outs, ins, "", pattern>,
- SIMCInstr <opName, SIEncodingFamily.NONE> {
+ SIMCInstr <NAME, SIEncodingFamily.NONE> {
let LGKM_CNT = 1;
let DS = 1;
@@ -19,6 +19,7 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
// Most instruction load and store data, so set this as the default.
let mayLoad = 1;
let mayStore = 1;
+ let FixedSize = true;
let hasSideEffects = 0;
let SchedRW = [WriteLDS];
@@ -76,7 +77,6 @@ class DS_Real <DS_Pseudo ps, string opName = ps.Mnemonic> :
let isConvergent = ps.isConvergent;
let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
// encoding fields
bits<10> vdst;
@@ -91,16 +91,33 @@ class DS_Real <DS_Pseudo ps, string opName = ps.Mnemonic> :
let offset0 = !if(ps.has_offset, offset{7-0}, ?);
let offset1 = !if(ps.has_offset, offset{15-8}, ?);
- bits<1> acc = !if(ps.has_vdst, vdst{9},
- !if(!or(ps.has_data0, ps.has_gws_data0), data0{9}, 0));
+ // Figure out if we should set the acc bit. Simple load and store
+ // instructions with a single data operand can use AV_* classes, in
+ // which case the encoding comes from the assigned register field.
+
+ // For more complicated cases with multiple data operands, the
+ // register fields are only 8-bit, so data operands must all be AGPR
+ // or VGPR.
+ defvar DstOpIsAV = !if(ps.has_vdst,
+ VDstOperandIsAV<ps.OutOperandList>.ret, 0);
+ defvar DstOpIsAGPR = !if(ps.has_vdst,
+ VDstOperandIsAGPR<ps.OutOperandList>.ret, 0);
+ defvar DataOpIsAV = !if(!or(ps.has_data0, ps.has_gws_data0),
+ Data0OperandIsAV<ps.InOperandList>.ret, 0);
+ defvar DataOpIsAGPR = !if(!or(ps.has_data0, ps.has_gws_data0),
+ Data0OperandIsAGPR<ps.InOperandList>.ret, 0);
+
+ bits<1> acc = !if(ps.has_vdst,
+ !if(DstOpIsAV, vdst{9}, DstOpIsAGPR),
+ !if(DataOpIsAV, data0{9}, DataOpIsAGPR));
}
// DS Pseudo instructions
-class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32>
+class DS_0A1D_NORET<string opName, RegisterOperand rc = AVLdSt_32>
: DS_Pseudo<opName,
(outs),
- (ins getLdStRegisterOperand<rc>.ret:$data0, Offset:$offset, gds:$gds),
+ (ins rc:$data0, Offset:$offset, gds:$gds),
" $data0$offset$gds"> {
let has_addr = 0;
@@ -108,10 +125,10 @@ class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32>
let has_vdst = 0;
}
-class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32>
+class DS_1A1D_NORET<string opName, RegisterOperand rc = AVLdSt_32>
: DS_Pseudo<opName,
(outs),
- (ins VGPR_32:$addr, getLdStRegisterOperand<rc>.ret:$data0, Offset:$offset, gds:$gds),
+ (ins VGPR_32:$addr, rc:$data0, Offset:$offset, gds:$gds),
" $addr, $data0$offset$gds"> {
let has_data1 = 0;
@@ -119,7 +136,7 @@ class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32>
let IsAtomicNoRet = 1;
}
-multiclass DS_1A1D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
+multiclass DS_1A1D_NORET_mc<string opName, RegisterOperand rc = AVLdSt_32> {
def "" : DS_1A1D_NORET<opName, rc>;
let has_m0_read = 0 in {
@@ -127,23 +144,23 @@ multiclass DS_1A1D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
}
}
-multiclass DS_1A1D_NORET_t16<string opName, RegisterClass rc = VGPR_32>
+multiclass DS_1A1D_NORET_t16<string opName, RegisterOperand rc = AVLdSt_32>
: DS_1A1D_NORET_mc<opName, rc> {
let has_m0_read = 0 in {
let True16Predicate = UseRealTrue16Insts in {
- def "_t16" : DS_1A1D_NORET<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_D16_HI", NAME>;
+ def "_t16" : DS_1A1D_NORET<opName#"_t16", VGPROp_16>,
+ True16D16Table<NAME#"_D16_HI", NAME#"_gfx9">;
}
}
}
-multiclass DS_1A1D_NORET_mc_gfx9<string opName, RegisterClass rc = VGPR_32> {
+multiclass DS_1A1D_NORET_mc_gfx9<string opName, RegisterOperand rc = AVLdSt_32> {
let has_m0_read = 0 in {
def "" : DS_1A1D_NORET<opName, rc>;
}
}
-class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32,
- RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
+class DS_1A2D_NORET<string opName, RegisterOperand data_op = VGPROp_32>
: DS_Pseudo<opName,
(outs),
(ins VGPR_32:$addr, data_op:$data0, data_op:$data1, Offset:$offset, gds:$gds),
@@ -153,16 +170,24 @@ class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32,
let IsAtomicNoRet = 1;
}
-multiclass DS_1A2D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
+// DS_xx2D cases should only be instantiated with VGPR operand classes.
+multiclass DS_1A2D_NORET_mc<string opName, RegisterOperand rc = VGPROp_32> {
+ assert OperandIsVGPR<rc>.ret,
+ "DS with 2 data operands should be declared with VGPRs";
+
def "" : DS_1A2D_NORET<opName, rc>;
let has_m0_read = 0 in {
def _gfx9 : DS_1A2D_NORET<opName, rc>;
+
+ // All data operands are replaced with AGPRs in this form.
+ let SubtargetPredicate = isGFX90APlus in {
+ def _agpr : DS_1A2D_NORET<opName, getEquivalentAGPROperand<rc>.ret>;
+ }
}
}
-class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32,
- RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
+class DS_1A2D_Off8_NORET <string opName, RegisterOperand data_op = VGPROp_32>
: DS_Pseudo<opName,
(outs),
(ins VGPR_32:$addr, data_op:$data0, data_op:$data1,
@@ -173,17 +198,23 @@ class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32,
let has_offset = 0;
}
-multiclass DS_1A2D_Off8_NORET_mc <string opName, RegisterClass rc = VGPR_32> {
+multiclass DS_1A2D_Off8_NORET_mc <string opName, RegisterOperand rc = VGPROp_32> {
+ assert OperandIsVGPR<rc>.ret,
+ "DS with 2 data operands should be declared with VGPRs";
+
def "" : DS_1A2D_Off8_NORET<opName, rc>;
let has_m0_read = 0 in {
def _gfx9 : DS_1A2D_Off8_NORET<opName, rc>;
+
+ let SubtargetPredicate = isGFX90APlus in {
+ def _agpr : DS_1A2D_Off8_NORET<opName, getEquivalentAGPROperand<rc>.ret>;
+ }
}
}
-class DS_0A1D_RET_GDS<string opName, RegisterClass rc = VGPR_32, RegisterClass src = rc,
- RegisterOperand dst_op = getLdStRegisterOperand<rc>.ret,
- RegisterOperand src_op = getLdStRegisterOperand<src>.ret>
+class DS_0A1D_RET_GDS<string opName, RegisterOperand dst_op = AVLdSt_32,
+ RegisterOperand src_op = dst_op>
: DS_Pseudo<opName,
(outs dst_op:$vdst),
(ins src_op:$data0, Offset:$offset),
@@ -196,8 +227,7 @@ class DS_0A1D_RET_GDS<string opName, RegisterClass rc = VGPR_32, RegisterClass s
let hasSideEffects = 1;
}
-class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32,
- RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
+class DS_1A1D_RET <string opName, RegisterOperand data_op = AVLdSt_32>
: DS_Pseudo<opName,
(outs data_op:$vdst),
(ins VGPR_32:$addr, data_op:$data0, Offset:$offset, gds:$gds),
@@ -207,76 +237,84 @@ class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32,
let IsAtomicRet = 1;
}
-multiclass DS_1A1D_RET_mc <string opName, RegisterClass rc = VGPR_32> {
+multiclass DS_1A1D_RET_mc <string opName, RegisterOperand rc = VGPROp_32> {
+ assert OperandIsVGPR<rc>.ret,
+ "DS with 2 data operands should be declared with VGPRs";
+
def "" : DS_1A1D_RET<opName, rc>;
let has_m0_read = 0 in {
def _gfx9 : DS_1A1D_RET<opName, rc>;
+ def _agpr : DS_1A1D_RET<opName, getEquivalentAGPROperand<rc>.ret>;
}
}
-multiclass DS_1A1D_RET_mc_gfx9 <string opName, RegisterClass rc = VGPR_32> {
+multiclass DS_1A1D_RET_mc_gfx9 <string opName, RegisterOperand rc = VGPROp_32> {
let has_m0_read = 0 in {
def "" : DS_1A1D_RET<opName, rc>;
+ def _agpr : DS_1A1D_RET<opName, getEquivalentAGPROperand<rc>.ret>;
}
}
class DS_1A2D_RET<string opName,
- RegisterClass rc = VGPR_32,
- RegisterClass src = rc,
- RegisterOperand dst_op = getLdStRegisterOperand<rc>.ret,
- RegisterOperand src_op = getLdStRegisterOperand<src>.ret>
-: DS_Pseudo<opName,
- (outs dst_op:$vdst),
- (ins VGPR_32:$addr, src_op:$data0, src_op:$data1, Offset:$offset, gds:$gds),
+ RegisterOperand dst_rc = VGPROp_32,
+ RegisterOperand src_rc = dst_rc>: DS_Pseudo<opName,
+ (outs dst_rc:$vdst),
+ (ins VGPR_32:$addr, src_rc:$data0, src_rc:$data1, Offset:$offset, gds:$gds),
" $vdst, $addr, $data0, $data1$offset$gds"> {
let IsAtomicRet = 1;
}
multiclass DS_1A2D_RET_mc<string opName,
- RegisterClass rc = VGPR_32,
- RegisterClass src = rc> {
- def "" : DS_1A2D_RET<opName, rc, src>;
+ RegisterOperand dst_rc = VGPROp_32,
+ RegisterOperand src_rc = dst_rc> {
+ assert !and(OperandIsVGPR<dst_rc>.ret, OperandIsVGPR<src_rc>.ret),
+ "DS with 2 data operands should be declared with VGPRs";
+
+ def "" : DS_1A2D_RET<opName, dst_rc, src_rc>;
let has_m0_read = 0 in {
- def _gfx9 : DS_1A2D_RET<opName, rc, src>;
+ def _gfx9 : DS_1A2D_RET<opName, dst_rc, src_rc>;
+ def _agpr : DS_1A2D_RET<opName, getEquivalentAGPROperand<dst_rc>.ret,
+ getEquivalentAGPROperand<src_rc>.ret>;
}
}
class DS_1A2D_Off8_RET<string opName,
- RegisterClass rc = VGPR_32,
- RegisterClass src = rc,
- RegisterOperand dst_op = getLdStRegisterOperand<rc>.ret,
- RegisterOperand src_op = getLdStRegisterOperand<src>.ret>
+ RegisterOperand dst_rc = VGPROp_32,
+ RegisterOperand src_rc = dst_rc>
: DS_Pseudo<opName,
- (outs dst_op:$vdst),
- (ins VGPR_32:$addr, src_op:$data0, src_op:$data1, Offset0:$offset0, Offset1:$offset1, gds:$gds),
+ (outs dst_rc:$vdst),
+ (ins VGPR_32:$addr, src_rc:$data0, src_rc:$data1, Offset0:$offset0, Offset1:$offset1, gds:$gds),
" $vdst, $addr, $data0, $data1$offset0$offset1$gds"> {
let has_offset = 0;
}
multiclass DS_1A2D_Off8_RET_mc<string opName,
- RegisterClass rc = VGPR_32,
- RegisterClass src = rc> {
- def "" : DS_1A2D_Off8_RET<opName, rc, src>;
+ RegisterOperand dst_rc = VGPROp_32,
+ RegisterOperand src_rc = dst_rc> {
+ assert !and(OperandIsVGPR<dst_rc>.ret, OperandIsVGPR<src_rc>.ret) ,
+ "DS with 2 data operands should be declared with VGPRs";
+
+ def "" : DS_1A2D_Off8_RET<opName, dst_rc, src_rc>;
let has_m0_read = 0 in {
- def _gfx9 : DS_1A2D_Off8_RET<opName, rc, src>;
+ def _gfx9 : DS_1A2D_Off8_RET<opName, dst_rc, src_rc>;
+ def _agpr : DS_1A2D_Off8_RET<opName, getEquivalentAGPROperand<dst_rc>.ret,
+ getEquivalentAGPROperand<src_rc>.ret>;
}
}
class DS_BVH_STACK<string opName,
- RegisterClass vdst_rc,
- RegisterClass data1_rc>
+ RegisterOperand vdst_rc,
+ RegisterOperand data1_rc>
: DS_Pseudo<opName,
- (outs getLdStRegisterOperand<vdst_rc>.ret:$vdst, VGPR_32:$addr),
- (ins VGPR_32:$addr_in, getLdStRegisterOperand<VGPR_32>.ret:$data0,
- data1_rc:$data1, Offset:$offset),
+ (outs vdst_rc:$vdst, VGPR_32:$addr),
+ (ins VGPR_32:$addr_in, VGPR_32:$data0, data1_rc:$data1, Offset:$offset),
" $vdst, $addr, $data0, $data1$offset"> {
let Constraints = "$addr = $addr_in";
- let DisableEncoding = "$addr_in";
let has_gds = 0;
let gdsValue = 0;
// TODO: Use MMOs in the LDS address space instead of hasSideEffects = 1.
@@ -284,8 +322,8 @@ class DS_BVH_STACK<string opName,
let SchedRW = [WriteLDS, WriteLDS];
}
-class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = Offset,
- RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
+class DS_1A_RET<string opName, RegisterOperand data_op = AVLdSt_32,
+ bit HasTiedOutput = 0, Operand ofs = Offset>
: DS_Pseudo<opName,
(outs data_op:$vdst),
!if(HasTiedOutput,
@@ -293,12 +331,12 @@ class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0
(ins VGPR_32:$addr, ofs:$offset, gds:$gds)),
" $vdst, $addr$offset$gds"> {
let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
- let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
let has_data0 = 0;
let has_data1 = 0;
}
-multiclass DS_1A_RET_mc<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = Offset> {
+multiclass DS_1A_RET_mc<string opName, RegisterOperand rc = AVLdSt_32,
+ bit HasTiedOutput = 0, Operand ofs = Offset> {
def "" : DS_1A_RET<opName, rc, HasTiedOutput, ofs>;
let has_m0_read = 0 in {
@@ -306,27 +344,28 @@ multiclass DS_1A_RET_mc<string opName, RegisterClass rc = VGPR_32, bit HasTiedOu
}
}
-multiclass DS_1A_RET_t16<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = Offset>
+multiclass DS_1A_RET_t16<string opName, RegisterOperand rc = AVLdSt_32,
+ bit HasTiedOutput = 0, Operand ofs = Offset>
: DS_1A_RET_mc<opName, rc, HasTiedOutput, ofs> {
let has_m0_read = 0 in {
let True16Predicate = UseRealTrue16Insts in {
- def "_t16" : DS_1A_RET<opName#"_t16", VGPR_16, HasTiedOutput, ofs>, True16D16Table<NAME#"_D16_HI", NAME#"_D16">;
+ def "_t16" : DS_1A_RET<opName#"_t16", VGPROp_16, HasTiedOutput, ofs>, True16D16Table<NAME#"_D16_HI", NAME#"_D16">;
}
}
}
-multiclass DS_1A_RET_NoM0<string opName, RegisterClass rc = VGPR_32> {
+multiclass DS_1A_RET_NoM0<string opName, RegisterOperand rc = VGPROp_32> {
let has_m0_read = 0 in {
def "" : DS_1A_RET<opName, rc>;
}
}
-class DS_1A_RET_Tied<string opName, RegisterClass rc = VGPR_32> :
+class DS_1A_RET_Tied<string opName, RegisterOperand rc = AVLdSt_32> :
DS_1A_RET<opName, rc, 1>;
-class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32>
+class DS_1A_Off8_RET <string opName, RegisterOperand rc = AVLdSt_32>
: DS_Pseudo<opName,
- (outs getLdStRegisterOperand<rc>.ret:$vdst),
+ (outs rc:$vdst),
(ins VGPR_32:$addr, Offset0:$offset0, Offset1:$offset1, gds:$gds),
" $vdst, $addr$offset0$offset1$gds"> {
@@ -335,7 +374,7 @@ class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32>
let has_data1 = 0;
}
-multiclass DS_1A_Off8_RET_mc <string opName, RegisterClass rc = VGPR_32> {
+multiclass DS_1A_Off8_RET_mc <string opName, RegisterOperand rc = VGPROp_32> {
def "" : DS_1A_Off8_RET<opName, rc>;
let has_m0_read = 0 in {
@@ -344,7 +383,7 @@ multiclass DS_1A_Off8_RET_mc <string opName, RegisterClass rc = VGPR_32> {
}
class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName,
- (outs getLdStRegisterOperand<VGPR_32>.ret:$vdst),
+ (outs AVLdSt_32:$vdst),
(ins VGPR_32:$addr, Offset:$offset),
" $vdst, $addr$offset gds"> {
@@ -369,7 +408,7 @@ class DS_1A_Off16_NORET <string opName>
}
class DS_0A_RET <string opName> : DS_Pseudo<opName,
- (outs getLdStRegisterOperand<VGPR_32>.ret:$vdst),
+ (outs AVLdSt_32:$vdst),
(ins Offset:$offset, gds:$gds),
" $vdst$offset$gds"> {
@@ -424,7 +463,7 @@ class DS_GWS_0D <string opName>
class DS_GWS_1D <string opName>
: DS_GWS<opName,
- (ins getLdStRegisterOperand<VGPR_32>.ret:$data0, Offset:$offset),
+ (ins AVLdSt_32:$data0, Offset:$offset),
" $data0$offset gds"> {
let has_gws_data0 = 1;
@@ -449,7 +488,7 @@ class DS_VOID <string opName> : DS_Pseudo<opName,
}
class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag,
- RegisterOperand data_op = getLdStRegisterOperand<VGPR_32>.ret>
+ RegisterOperand data_op = AVLdSt_32>
: DS_Pseudo<opName,
(outs data_op:$vdst),
(ins VGPR_32:$addr, data_op:$data0, Offset:$offset),
@@ -465,12 +504,75 @@ class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag,
let has_gds = 0;
}
+multiclass DS_1A1D_PERMUTE_mc <string opName, SDPatternOperator node = null_frag,
+ RegisterOperand data_op = VGPROp_32> {
+ assert OperandIsVGPR<data_op>.ret,
+ "DS with 2 data operands should be declared with VGPRs";
+ def "" : DS_1A1D_PERMUTE<opName, node, data_op>;
+
+ let SubtargetPredicate = isGFX90APlus in {
+ def _agpr : DS_1A1D_PERMUTE<opName, null_frag,
+ getEquivalentAGPROperand<data_op>.ret>;
+ }
+}
+
+
class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0,
bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
(inst $ptr, getVregSrcForVT<vt>.ret:$value, Offset:$offset, (i1 gds))> {
let AddedComplexity = complexity;
}
+multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt)>;
+ }
+
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag#"_local_"#vt)>;
+ }
+
+ let OtherPredicates = [HasGDS] in {
+ def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt),
+ /* complexity */ 0, /* gds */ 1>;
+ }
+}
+
+multiclass DSAtomicRetNoRetPat_NoM0_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
+ ValueType vt, string frag> {
+ def : DSAtomicRetPat<inst, vt,
+ !cast<PatFrag>(frag#"_local_"#vt)>;
+ def : DSAtomicRetPat<noRetInst, vt,
+ !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>;
+}
+
+multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
+ ValueType vt, string frag> {
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSAtomicRetPat<inst, vt,
+ !cast<PatFrag>(frag#"_local_m0_"#vt)>;
+ def : DSAtomicRetPat<noRetInst, vt,
+ !cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>;
+ }
+
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ defm : DSAtomicRetNoRetPat_NoM0_mc<
+ !cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"),
+ !cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"),
+ vt, frag>;
+ }
+
+ let OtherPredicates = [HasGDS] in {
+ def : DSAtomicRetPat<inst, vt,
+ !cast<PatFrag>(frag#"_region_m0_"#vt),
+ /* complexity */ 0, /* gds */ 1>;
+ def : DSAtomicRetPat<noRetInst, vt,
+ !cast<PatFrag>(frag#"_region_m0_noret_"#vt),
+ /* complexity */ 1, /* gds */ 1>;
+ }
+}
+
defm DS_ADD_U32 : DS_1A1D_NORET_mc<"ds_add_u32">;
defm DS_SUB_U32 : DS_1A1D_NORET_mc<"ds_sub_u32">;
defm DS_RSUB_U32 : DS_1A1D_NORET_mc<"ds_rsub_u32">;
@@ -516,100 +618,100 @@ def DS_WRITE_ADDTID_B32 : DS_0A1D_NORET<"ds_write_addtid_b32">;
} // End mayLoad = 0
let SubtargetPredicate = HasLdsAtomicAddF64 in {
- defm DS_ADD_F64 : DS_1A1D_NORET_mc_gfx9<"ds_add_f64", VReg_64>;
- defm DS_ADD_RTN_F64 : DS_1A1D_RET_mc_gfx9<"ds_add_rtn_f64", VReg_64>;
+ defm DS_ADD_F64 : DS_1A1D_NORET_mc_gfx9<"ds_add_f64", AVLdSt_64>;
+ defm DS_ADD_RTN_F64 : DS_1A1D_RET_mc_gfx9<"ds_add_rtn_f64", VGPROp_64>;
} // End SubtargetPredicate = HasLdsAtomicAddF64
let SubtargetPredicate = HasAtomicDsPkAdd16Insts in {
- defm DS_PK_ADD_F16 : DS_1A1D_NORET_mc<"ds_pk_add_f16">;
- defm DS_PK_ADD_RTN_F16 : DS_1A1D_RET_mc<"ds_pk_add_rtn_f16", VGPR_32>;
- defm DS_PK_ADD_BF16 : DS_1A1D_NORET_mc<"ds_pk_add_bf16">;
- defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc<"ds_pk_add_rtn_bf16", VGPR_32>;
+ defm DS_PK_ADD_F16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_f16">;
+ defm DS_PK_ADD_RTN_F16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_f16">;
+ defm DS_PK_ADD_BF16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_bf16">;
+ defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_bf16">;
} // End SubtargetPredicate = HasAtomicDsPkAdd16Insts
defm DS_CMPSTORE_B32 : DS_1A2D_NORET_mc<"ds_cmpstore_b32">;
defm DS_CMPSTORE_F32 : DS_1A2D_NORET_mc<"ds_cmpstore_f32">;
-defm DS_CMPSTORE_B64 : DS_1A2D_NORET_mc<"ds_cmpstore_b64", VReg_64>;
-defm DS_CMPSTORE_F64 : DS_1A2D_NORET_mc<"ds_cmpstore_f64", VReg_64>;
-defm DS_CMPSTORE_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_b32", VGPR_32>;
-defm DS_CMPSTORE_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_f32", VGPR_32>;
-defm DS_CMPSTORE_RTN_B64 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_b64", VReg_64>;
-defm DS_CMPSTORE_RTN_F64 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_f64", VReg_64>;
+defm DS_CMPSTORE_B64 : DS_1A2D_NORET_mc<"ds_cmpstore_b64", VGPROp_64>;
+defm DS_CMPSTORE_F64 : DS_1A2D_NORET_mc<"ds_cmpstore_f64", VGPROp_64>;
+defm DS_CMPSTORE_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_b32">;
+defm DS_CMPSTORE_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_f32">;
+defm DS_CMPSTORE_RTN_B64 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_b64", VGPROp_64>;
+defm DS_CMPSTORE_RTN_F64 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_f64", VGPROp_64>;
defm DS_MSKOR_B32 : DS_1A2D_NORET_mc<"ds_mskor_b32">;
defm DS_CMPST_B32 : DS_1A2D_NORET_mc<"ds_cmpst_b32">;
defm DS_CMPST_F32 : DS_1A2D_NORET_mc<"ds_cmpst_f32">;
-defm DS_ADD_U64 : DS_1A1D_NORET_mc<"ds_add_u64", VReg_64>;
-defm DS_SUB_U64 : DS_1A1D_NORET_mc<"ds_sub_u64", VReg_64>;
-defm DS_RSUB_U64 : DS_1A1D_NORET_mc<"ds_rsub_u64", VReg_64>;
-defm DS_INC_U64 : DS_1A1D_NORET_mc<"ds_inc_u64", VReg_64>;
-defm DS_DEC_U64 : DS_1A1D_NORET_mc<"ds_dec_u64", VReg_64>;
-defm DS_MIN_I64 : DS_1A1D_NORET_mc<"ds_min_i64", VReg_64>;
-defm DS_MAX_I64 : DS_1A1D_NORET_mc<"ds_max_i64", VReg_64>;
-defm DS_MIN_U64 : DS_1A1D_NORET_mc<"ds_min_u64", VReg_64>;
-defm DS_MAX_U64 : DS_1A1D_NORET_mc<"ds_max_u64", VReg_64>;
-defm DS_AND_B64 : DS_1A1D_NORET_mc<"ds_and_b64", VReg_64>;
-defm DS_OR_B64 : DS_1A1D_NORET_mc<"ds_or_b64", VReg_64>;
-defm DS_XOR_B64 : DS_1A1D_NORET_mc<"ds_xor_b64", VReg_64>;
-defm DS_MSKOR_B64 : DS_1A2D_NORET_mc<"ds_mskor_b64", VReg_64>;
+defm DS_ADD_U64 : DS_1A1D_NORET_mc<"ds_add_u64", AVLdSt_64>;
+defm DS_SUB_U64 : DS_1A1D_NORET_mc<"ds_sub_u64", AVLdSt_64>;
+defm DS_RSUB_U64 : DS_1A1D_NORET_mc<"ds_rsub_u64", AVLdSt_64>;
+defm DS_INC_U64 : DS_1A1D_NORET_mc<"ds_inc_u64", AVLdSt_64>;
+defm DS_DEC_U64 : DS_1A1D_NORET_mc<"ds_dec_u64", AVLdSt_64>;
+defm DS_MIN_I64 : DS_1A1D_NORET_mc<"ds_min_i64", AVLdSt_64>;
+defm DS_MAX_I64 : DS_1A1D_NORET_mc<"ds_max_i64", AVLdSt_64>;
+defm DS_MIN_U64 : DS_1A1D_NORET_mc<"ds_min_u64", AVLdSt_64>;
+defm DS_MAX_U64 : DS_1A1D_NORET_mc<"ds_max_u64", AVLdSt_64>;
+defm DS_AND_B64 : DS_1A1D_NORET_mc<"ds_and_b64", AVLdSt_64>;
+defm DS_OR_B64 : DS_1A1D_NORET_mc<"ds_or_b64", AVLdSt_64>;
+defm DS_XOR_B64 : DS_1A1D_NORET_mc<"ds_xor_b64", AVLdSt_64>;
+defm DS_MSKOR_B64 : DS_1A2D_NORET_mc<"ds_mskor_b64", VGPROp_64>;
let mayLoad = 0 in {
-defm DS_WRITE_B64 : DS_1A1D_NORET_mc<"ds_write_b64", VReg_64>;
-defm DS_WRITE2_B64 : DS_1A2D_Off8_NORET_mc<"ds_write2_b64", VReg_64>;
-defm DS_WRITE2ST64_B64: DS_1A2D_Off8_NORET_mc<"ds_write2st64_b64", VReg_64>;
+defm DS_WRITE_B64 : DS_1A1D_NORET_mc<"ds_write_b64", AVLdSt_64>;
+defm DS_WRITE2_B64 : DS_1A2D_Off8_NORET_mc<"ds_write2_b64", VGPROp_64>;
+defm DS_WRITE2ST64_B64: DS_1A2D_Off8_NORET_mc<"ds_write2st64_b64", VGPROp_64>;
}
-defm DS_CMPST_B64 : DS_1A2D_NORET_mc<"ds_cmpst_b64", VReg_64>;
-defm DS_CMPST_F64 : DS_1A2D_NORET_mc<"ds_cmpst_f64", VReg_64>;
-defm DS_MIN_F64 : DS_1A1D_NORET_mc<"ds_min_f64", VReg_64>;
-defm DS_MAX_F64 : DS_1A1D_NORET_mc<"ds_max_f64", VReg_64>;
+defm DS_CMPST_B64 : DS_1A2D_NORET_mc<"ds_cmpst_b64", VGPROp_64>;
+defm DS_CMPST_F64 : DS_1A2D_NORET_mc<"ds_cmpst_f64", VGPROp_64>;
+defm DS_MIN_F64 : DS_1A1D_NORET_mc<"ds_min_f64", AVLdSt_64>;
+defm DS_MAX_F64 : DS_1A1D_NORET_mc<"ds_max_f64", AVLdSt_64>;
-defm DS_ADD_RTN_U32 : DS_1A1D_RET_mc<"ds_add_rtn_u32", VGPR_32>;
+defm DS_ADD_RTN_U32 : DS_1A1D_RET_mc<"ds_add_rtn_u32">;
let SubtargetPredicate = HasLDSFPAtomicAddF32 in {
-defm DS_ADD_RTN_F32 : DS_1A1D_RET_mc<"ds_add_rtn_f32", VGPR_32>;
-}
-defm DS_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_rtn_u32", VGPR_32>;
-defm DS_RSUB_RTN_U32 : DS_1A1D_RET_mc<"ds_rsub_rtn_u32", VGPR_32>;
-defm DS_INC_RTN_U32 : DS_1A1D_RET_mc<"ds_inc_rtn_u32", VGPR_32>;
-defm DS_DEC_RTN_U32 : DS_1A1D_RET_mc<"ds_dec_rtn_u32", VGPR_32>;
-defm DS_MIN_RTN_I32 : DS_1A1D_RET_mc<"ds_min_rtn_i32", VGPR_32>;
-defm DS_MAX_RTN_I32 : DS_1A1D_RET_mc<"ds_max_rtn_i32", VGPR_32>;
-defm DS_MIN_RTN_U32 : DS_1A1D_RET_mc<"ds_min_rtn_u32", VGPR_32>;
-defm DS_MAX_RTN_U32 : DS_1A1D_RET_mc<"ds_max_rtn_u32", VGPR_32>;
-defm DS_AND_RTN_B32 : DS_1A1D_RET_mc<"ds_and_rtn_b32", VGPR_32>;
-defm DS_OR_RTN_B32 : DS_1A1D_RET_mc<"ds_or_rtn_b32", VGPR_32>;
-defm DS_XOR_RTN_B32 : DS_1A1D_RET_mc<"ds_xor_rtn_b32", VGPR_32>;
-defm DS_MSKOR_RTN_B32 : DS_1A2D_RET_mc<"ds_mskor_rtn_b32", VGPR_32>;
-defm DS_CMPST_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b32", VGPR_32>;
-defm DS_CMPST_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f32", VGPR_32>;
-defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc<"ds_min_rtn_f32", VGPR_32>;
-defm DS_MAX_RTN_F32 : DS_1A1D_RET_mc<"ds_max_rtn_f32", VGPR_32>;
+defm DS_ADD_RTN_F32 : DS_1A1D_RET_mc<"ds_add_rtn_f32">;
+}
+defm DS_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_rtn_u32">;
+defm DS_RSUB_RTN_U32 : DS_1A1D_RET_mc<"ds_rsub_rtn_u32">;
+defm DS_INC_RTN_U32 : DS_1A1D_RET_mc<"ds_inc_rtn_u32">;
+defm DS_DEC_RTN_U32 : DS_1A1D_RET_mc<"ds_dec_rtn_u32">;
+defm DS_MIN_RTN_I32 : DS_1A1D_RET_mc<"ds_min_rtn_i32">;
+defm DS_MAX_RTN_I32 : DS_1A1D_RET_mc<"ds_max_rtn_i32">;
+defm DS_MIN_RTN_U32 : DS_1A1D_RET_mc<"ds_min_rtn_u32">;
+defm DS_MAX_RTN_U32 : DS_1A1D_RET_mc<"ds_max_rtn_u32">;
+defm DS_AND_RTN_B32 : DS_1A1D_RET_mc<"ds_and_rtn_b32">;
+defm DS_OR_RTN_B32 : DS_1A1D_RET_mc<"ds_or_rtn_b32">;
+defm DS_XOR_RTN_B32 : DS_1A1D_RET_mc<"ds_xor_rtn_b32">;
+defm DS_MSKOR_RTN_B32 : DS_1A2D_RET_mc<"ds_mskor_rtn_b32", VGPROp_32>;
+defm DS_CMPST_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b32", VGPROp_32>;
+defm DS_CMPST_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f32", VGPROp_32>;
+defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc<"ds_min_rtn_f32">;
+defm DS_MAX_RTN_F32 : DS_1A1D_RET_mc<"ds_max_rtn_f32">;
defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b32">;
-defm DS_WRXCHG2_RTN_B32 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b32", VReg_64, VGPR_32>;
-defm DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b32", VReg_64, VGPR_32>;
-
-defm DS_ADD_RTN_U64 : DS_1A1D_RET_mc<"ds_add_rtn_u64", VReg_64>;
-defm DS_SUB_RTN_U64 : DS_1A1D_RET_mc<"ds_sub_rtn_u64", VReg_64>;
-defm DS_RSUB_RTN_U64 : DS_1A1D_RET_mc<"ds_rsub_rtn_u64", VReg_64>;
-defm DS_INC_RTN_U64 : DS_1A1D_RET_mc<"ds_inc_rtn_u64", VReg_64>;
-defm DS_DEC_RTN_U64 : DS_1A1D_RET_mc<"ds_dec_rtn_u64", VReg_64>;
-defm DS_MIN_RTN_I64 : DS_1A1D_RET_mc<"ds_min_rtn_i64", VReg_64>;
-defm DS_MAX_RTN_I64 : DS_1A1D_RET_mc<"ds_max_rtn_i64", VReg_64>;
-defm DS_MIN_RTN_U64 : DS_1A1D_RET_mc<"ds_min_rtn_u64", VReg_64>;
-defm DS_MAX_RTN_U64 : DS_1A1D_RET_mc<"ds_max_rtn_u64", VReg_64>;
-defm DS_AND_RTN_B64 : DS_1A1D_RET_mc<"ds_and_rtn_b64", VReg_64>;
-defm DS_OR_RTN_B64 : DS_1A1D_RET_mc<"ds_or_rtn_b64", VReg_64>;
-defm DS_XOR_RTN_B64 : DS_1A1D_RET_mc<"ds_xor_rtn_b64", VReg_64>;
-defm DS_MSKOR_RTN_B64 : DS_1A2D_RET_mc<"ds_mskor_rtn_b64", VReg_64>;
-defm DS_CMPST_RTN_B64 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b64", VReg_64>;
-defm DS_CMPST_RTN_F64 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f64", VReg_64>;
-defm DS_MIN_RTN_F64 : DS_1A1D_RET_mc<"ds_min_rtn_f64", VReg_64>;
-defm DS_MAX_RTN_F64 : DS_1A1D_RET_mc<"ds_max_rtn_f64", VReg_64>;
-
-defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b64", VReg_64>;
-defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>;
-defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>;
+defm DS_WRXCHG2_RTN_B32 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b32", VGPROp_64, VGPROp_32>;
+defm DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b32", VGPROp_64, VGPROp_32>;
+
+defm DS_ADD_RTN_U64 : DS_1A1D_RET_mc<"ds_add_rtn_u64", VGPROp_64>;
+defm DS_SUB_RTN_U64 : DS_1A1D_RET_mc<"ds_sub_rtn_u64", VGPROp_64>;
+defm DS_RSUB_RTN_U64 : DS_1A1D_RET_mc<"ds_rsub_rtn_u64", VGPROp_64>;
+defm DS_INC_RTN_U64 : DS_1A1D_RET_mc<"ds_inc_rtn_u64", VGPROp_64>;
+defm DS_DEC_RTN_U64 : DS_1A1D_RET_mc<"ds_dec_rtn_u64", VGPROp_64>;
+defm DS_MIN_RTN_I64 : DS_1A1D_RET_mc<"ds_min_rtn_i64", VGPROp_64>;
+defm DS_MAX_RTN_I64 : DS_1A1D_RET_mc<"ds_max_rtn_i64", VGPROp_64>;
+defm DS_MIN_RTN_U64 : DS_1A1D_RET_mc<"ds_min_rtn_u64", VGPROp_64>;
+defm DS_MAX_RTN_U64 : DS_1A1D_RET_mc<"ds_max_rtn_u64", VGPROp_64>;
+defm DS_AND_RTN_B64 : DS_1A1D_RET_mc<"ds_and_rtn_b64", VGPROp_64>;
+defm DS_OR_RTN_B64 : DS_1A1D_RET_mc<"ds_or_rtn_b64", VGPROp_64>;
+defm DS_XOR_RTN_B64 : DS_1A1D_RET_mc<"ds_xor_rtn_b64", VGPROp_64>;
+defm DS_MSKOR_RTN_B64 : DS_1A2D_RET_mc<"ds_mskor_rtn_b64", VGPROp_64>;
+defm DS_CMPST_RTN_B64 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b64", VGPROp_64>;
+defm DS_CMPST_RTN_F64 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f64", VGPROp_64>;
+defm DS_MIN_RTN_F64 : DS_1A1D_RET_mc<"ds_min_rtn_f64", VGPROp_64>;
+defm DS_MAX_RTN_F64 : DS_1A1D_RET_mc<"ds_max_rtn_f64", VGPROp_64>;
+
+defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b64", VGPROp_64>;
+defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VGPROp_128, VGPROp_64>;
+defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VGPROp_128, VGPROp_64>;
let isConvergent = 1, usesCustomInserter = 1 in {
def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init"> {
@@ -657,19 +759,19 @@ def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">;
} // End SubtargetPredicate = HasDsSrc2Insts
let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in {
-def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, Swizzle>;
+def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", AVLdSt_32, 0, Swizzle>;
}
let mayStore = 0 in {
defm DS_READ_I16 : DS_1A_RET_mc<"ds_read_i16">;
defm DS_READ_B32 : DS_1A_RET_mc<"ds_read_b32">;
-defm DS_READ_B64 : DS_1A_RET_mc<"ds_read_b64", VReg_64>;
+defm DS_READ_B64 : DS_1A_RET_mc<"ds_read_b64", AVLdSt_64>;
-defm DS_READ2_B32 : DS_1A_Off8_RET_mc<"ds_read2_b32", VReg_64>;
-defm DS_READ2ST64_B32: DS_1A_Off8_RET_mc<"ds_read2st64_b32", VReg_64>;
+defm DS_READ2_B32 : DS_1A_Off8_RET_mc<"ds_read2_b32", AVLdSt_64>;
+defm DS_READ2ST64_B32: DS_1A_Off8_RET_mc<"ds_read2st64_b32", AVLdSt_64>;
-defm DS_READ2_B64 : DS_1A_Off8_RET_mc<"ds_read2_b64", VReg_128>;
-defm DS_READ2ST64_B64: DS_1A_Off8_RET_mc<"ds_read2st64_b64", VReg_128>;
+defm DS_READ2_B64 : DS_1A_Off8_RET_mc<"ds_read2_b64", AVLdSt_128>;
+defm DS_READ2ST64_B64: DS_1A_Off8_RET_mc<"ds_read2st64_b64", AVLdSt_128>;
let has_m0_read = 0 in {
let SubtargetPredicate = HasD16LoadStore, TiedSourceNotRead = 1 in {
@@ -704,21 +806,21 @@ def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">;
let SubtargetPredicate = isGFX7Plus in {
-defm DS_WRAP_RTN_B32 : DS_1A2D_RET_mc<"ds_wrap_rtn_b32", VGPR_32>;
-defm DS_CONDXCHG32_RTN_B64 : DS_1A1D_RET_mc<"ds_condxchg32_rtn_b64", VReg_64>;
+defm DS_WRAP_RTN_B32 : DS_1A2D_RET_mc<"ds_wrap_rtn_b32", VGPROp_32>;
+defm DS_CONDXCHG32_RTN_B64 : DS_1A1D_RET_mc<"ds_condxchg32_rtn_b64", VGPROp_64>;
let isConvergent = 1, usesCustomInserter = 1 in {
def DS_GWS_SEMA_RELEASE_ALL : DS_GWS_0D<"ds_gws_sema_release_all">;
}
let mayStore = 0 in {
-defm DS_READ_B96 : DS_1A_RET_mc<"ds_read_b96", VReg_96>;
-defm DS_READ_B128: DS_1A_RET_mc<"ds_read_b128", VReg_128>;
+defm DS_READ_B96 : DS_1A_RET_mc<"ds_read_b96", AVLdSt_96>;
+defm DS_READ_B128: DS_1A_RET_mc<"ds_read_b128", AVLdSt_128>;
} // End mayStore = 0
let mayLoad = 0 in {
-defm DS_WRITE_B96 : DS_1A1D_NORET_mc<"ds_write_b96", VReg_96>;
-defm DS_WRITE_B128 : DS_1A1D_NORET_mc<"ds_write_b128", VReg_128>;
+defm DS_WRITE_B96 : DS_1A1D_NORET_mc<"ds_write_b96", AVLdSt_96>;
+defm DS_WRITE_B128 : DS_1A1D_NORET_mc<"ds_write_b128", AVLdSt_128>;
} // End mayLoad = 0
def DS_NOP : DS_VOID<"ds_nop">;
@@ -732,10 +834,10 @@ def DS_NOP : DS_VOID<"ds_nop">;
let SubtargetPredicate = isGFX8Plus in {
let Uses = [EXEC] in {
-def DS_PERMUTE_B32 : DS_1A1D_PERMUTE <"ds_permute_b32",
- int_amdgcn_ds_permute>;
-def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
- int_amdgcn_ds_bpermute>;
+defm DS_PERMUTE_B32 : DS_1A1D_PERMUTE_mc<"ds_permute_b32",
+ int_amdgcn_ds_permute>;
+defm DS_BPERMUTE_B32 : DS_1A1D_PERMUTE_mc<"ds_bpermute_b32",
+ int_amdgcn_ds_bpermute>;
}
} // let SubtargetPredicate = isGFX8Plus
@@ -751,8 +853,8 @@ def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">;
let SubtargetPredicate = isGFX11Only in {
-def DS_ADD_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_add_gs_reg_rtn", VReg_64, VGPR_32>;
-def DS_SUB_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_sub_gs_reg_rtn", VReg_64, VGPR_32>;
+def DS_ADD_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_add_gs_reg_rtn", VGPROp_64, VGPROp_32>;
+def DS_SUB_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_sub_gs_reg_rtn", VGPROp_64, VGPROp_32>;
} // let SubtargetPredicate = isGFX11Only
@@ -760,7 +862,7 @@ let SubtargetPredicate = isGFX11Plus in {
let OtherPredicates = [HasImageInsts] in
def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32",
- VGPR_32, VReg_128> ;
+ VGPROp_32, VGPROp_128> ;
} // let SubtargetPredicate = isGFX11Plus
@@ -772,15 +874,15 @@ let SubtargetPredicate = isGFX12Plus in {
let OtherPredicates = [HasImageInsts] in {
def DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_BVH_STACK<
- "ds_bvh_stack_push8_pop1_rtn_b32", VGPR_32, VReg_256>;
+ "ds_bvh_stack_push8_pop1_rtn_b32", VGPROp_32, VGPROp_256>;
def DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_BVH_STACK<
- "ds_bvh_stack_push8_pop2_rtn_b64", VReg_64, VReg_256>;
+ "ds_bvh_stack_push8_pop2_rtn_b64", VGPROp_64, VGPROp_256>;
} // End OtherPredicates = [HasImageInsts].
-defm DS_COND_SUB_U32 : DS_1A1D_NORET_mc<"ds_cond_sub_u32">;
-defm DS_COND_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_cond_sub_rtn_u32", VGPR_32>;
-defm DS_SUB_CLAMP_U32 : DS_1A1D_NORET_mc<"ds_sub_clamp_u32">;
-defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_clamp_rtn_u32", VGPR_32>;
+defm DS_COND_SUB_U32 : DS_1A1D_NORET_mc_gfx9<"ds_cond_sub_u32">;
+defm DS_COND_SUB_RTN_U32 : DS_1A1D_RET_mc_gfx9<"ds_cond_sub_rtn_u32", VGPROp_32>;
+defm DS_SUB_CLAMP_U32 : DS_1A1D_NORET_mc_gfx9<"ds_sub_clamp_u32">;
+defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc_gfx9<"ds_sub_clamp_rtn_u32", VGPROp_32>;
def DS_BPERMUTE_FI_B32 : DS_1A1D_PERMUTE <"ds_bpermute_fi_b32",
int_amdgcn_ds_bpermute_fi_b32>;
@@ -801,11 +903,11 @@ let SubtargetPredicate = isGFX1250Plus in {
let WaveSizePredicate = isWave32, mayStore = 0 in {
let OtherPredicates = [HasTransposeLoadF4F6Insts] in {
-defm DS_LOAD_TR4_B64 : DS_1A_RET_NoM0<"ds_load_tr4_b64", VReg_64>;
-defm DS_LOAD_TR6_B96 : DS_1A_RET_NoM0<"ds_load_tr6_b96", VReg_96>;
+defm DS_LOAD_TR4_B64 : DS_1A_RET_NoM0<"ds_load_tr4_b64", VGPROp_64>;
+defm DS_LOAD_TR6_B96 : DS_1A_RET_NoM0<"ds_load_tr6_b96", VGPROp_96>;
} // End OtherPredicates = [HasTransposeLoadF4F6Insts]
-defm DS_LOAD_TR8_B64 : DS_1A_RET_NoM0<"ds_load_tr8_b64", VReg_64>;
-defm DS_LOAD_TR16_B128 : DS_1A_RET_NoM0<"ds_load_tr16_b128", VReg_128>;
+defm DS_LOAD_TR8_B64 : DS_1A_RET_NoM0<"ds_load_tr8_b64", VGPROp_64>;
+defm DS_LOAD_TR16_B128 : DS_1A_RET_NoM0<"ds_load_tr16_b128", VGPROp_128>;
} // End WaveSizePredicate = isWave32, mayStore = 0
let OtherPredicates = [HasLdsBarrierArriveAtomic] in {
@@ -818,7 +920,7 @@ def : GCNPat <
(DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 VGPR_32:$ptr, Offset:$offset, (i1 0))
>;
-defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_1A1D_RET_mc_gfx9<"ds_atomic_barrier_arrive_rtn_b64", VReg_64>;
+defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_1A1D_RET_mc_gfx9<"ds_atomic_barrier_arrive_rtn_b64", VGPROp_64>;
def : GCNPat<
(i64 (int_amdgcn_ds_atomic_barrier_arrive_rtn_b64 (DS1Addr1Offset i32:$ptr, i32:$offset), i64:$data)),
@@ -829,10 +931,10 @@ def : GCNPat<
} // End SubtargetPredicate = isGFX1250Plus
let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore = 0 in {
- defm DS_READ_B64_TR_B4 : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", VReg_64>;
- defm DS_READ_B64_TR_B8 : DS_1A_RET_NoM0<"ds_read_b64_tr_b8", VReg_64>;
- defm DS_READ_B64_TR_B16 : DS_1A_RET_NoM0<"ds_read_b64_tr_b16", VReg_64>;
- defm DS_READ_B96_TR_B6 : DS_1A_RET_NoM0<"ds_read_b96_tr_b6", VReg_96>;
+ defm DS_READ_B64_TR_B4 : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", AVLdSt_64>;
+ defm DS_READ_B64_TR_B8 : DS_1A_RET_NoM0<"ds_read_b64_tr_b8", AVLdSt_64>;
+ defm DS_READ_B64_TR_B16 : DS_1A_RET_NoM0<"ds_read_b64_tr_b16", AVLdSt_64>;
+ defm DS_READ_B96_TR_B6 : DS_1A_RET_NoM0<"ds_read_b96_tr_b6", AVLdSt_96>;
}
//===----------------------------------------------------------------------===//
@@ -984,6 +1086,7 @@ class DS64Bit4ByteAlignedReadPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : G
(inst $ptr, $offset0, $offset1, (i1 0))
>;
+// TODO: Should this use AVLdSt_64 for the class?
class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat<
(frag vt:$value, (DS64Bit4ByteAligned i32:$ptr, i32:$offset0, i32:$offset1)),
(inst $ptr, (i32 (EXTRACT_SUBREG VReg_64:$value, sub0)),
@@ -1091,50 +1194,6 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">;
} // End AddedComplexity = 100
-multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
- let OtherPredicates = [LDSRequiresM0Init] in {
- def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt)>;
- }
-
- let OtherPredicates = [NotLDSRequiresM0Init] in {
- def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_"#vt)>;
- }
-
- let OtherPredicates = [HasGDS] in {
- def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt),
- /* complexity */ 0, /* gds */ 1>;
- }
-}
-
-multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
- ValueType vt, string frag> {
- let OtherPredicates = [LDSRequiresM0Init] in {
- def : DSAtomicRetPat<inst, vt,
- !cast<PatFrag>(frag#"_local_m0_"#vt)>;
- def : DSAtomicRetPat<noRetInst, vt,
- !cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>;
- }
-
- let OtherPredicates = [NotLDSRequiresM0Init] in {
- def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_"#vt)>;
- def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>;
- }
-
- let OtherPredicates = [HasGDS] in {
- def : DSAtomicRetPat<inst, vt,
- !cast<PatFrag>(frag#"_region_m0_"#vt),
- /* complexity */ 0, /* gds */ 1>;
- def : DSAtomicRetPat<noRetInst, vt,
- !cast<PatFrag>(frag#"_region_m0_noret_"#vt),
- /* complexity */ 1, /* gds */ 1>;
- }
-}
-
-
-
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
// Caution, the order of src and cmp is the *opposite* of the BUFFER_ATOMIC_CMPSWAP opcode.
class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag,
@@ -1212,8 +1271,8 @@ defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_F32, DS_MAX_F32, f32, "atomic_load_fmax
let SubtargetPredicate = HasAtomicDsPkAdd16Insts in {
-defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">;
-defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">;
+defm : DSAtomicRetNoRetPat_NoM0_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">;
+defm : DSAtomicRetNoRetPat_NoM0_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">;
}
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
@@ -1265,7 +1324,7 @@ class DSAtomicRetPatIntrinsic<DS_Pseudo inst, ValueType vt, PatFrag frag,
} // End SubtargetPredicate = HasLdsAtomicAddF64
let SubtargetPredicate = HasAtomicDsPkAdd16Insts in {
-defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">;
+defm : DSAtomicRetNoRetPat_NoM0_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">;
} // End SubtargetPredicate = HasAtomicDsPkAdd16Insts
let OtherPredicates = [HasGDS] in
@@ -1357,8 +1416,10 @@ class Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps, int ef,
// GFX12.
//===----------------------------------------------------------------------===//
-multiclass DS_Real_gfx12<bits<8> op, string name = !tolower(NAME)> {
- defvar ps = !cast<DS_Pseudo>(NAME);
+multiclass DS_Real_gfx12<bits<8> op,
+ DS_Pseudo ps = !cast<DS_Pseudo>(NAME),
+ string name = !tolower(NAME)> {
+
let AssemblerPredicate = isGFX12Plus in {
let DecoderNamespace = "GFX12" in
def _gfx12 :
@@ -1369,14 +1430,20 @@ multiclass DS_Real_gfx12<bits<8> op, string name = !tolower(NAME)> {
} // End AssemblerPredicate
}
-defm DS_MIN_F32 : DS_Real_gfx12<0x012, "ds_min_num_f32">;
-defm DS_MAX_F32 : DS_Real_gfx12<0x013, "ds_max_num_f32">;
-defm DS_MIN_RTN_F32 : DS_Real_gfx12<0x032, "ds_min_num_rtn_f32">;
-defm DS_MAX_RTN_F32 : DS_Real_gfx12<0x033, "ds_max_num_rtn_f32">;
-defm DS_MIN_F64 : DS_Real_gfx12<0x052, "ds_min_num_f64">;
-defm DS_MAX_F64 : DS_Real_gfx12<0x053, "ds_max_num_f64">;
-defm DS_MIN_RTN_F64 : DS_Real_gfx12<0x072, "ds_min_num_rtn_f64">;
-defm DS_MAX_RTN_F64 : DS_Real_gfx12<0x073, "ds_max_num_rtn_f64">;
+// Helper to avoid repeating the pseudo-name if we only need to set
+// the gfx12 name.
+multiclass DS_Real_gfx12_with_name<bits<8> op, string name> {
+ defm "" : DS_Real_gfx12<op, !cast<DS_Pseudo>(NAME#"_gfx9"), name>;
+}
+
+defm DS_MIN_F32 : DS_Real_gfx12_with_name<0x012, "ds_min_num_f32">;
+defm DS_MAX_F32 : DS_Real_gfx12_with_name<0x013, "ds_max_num_f32">;
+defm DS_MIN_RTN_F32 : DS_Real_gfx12_with_name<0x032, "ds_min_num_rtn_f32">;
+defm DS_MAX_RTN_F32 : DS_Real_gfx12_with_name<0x033, "ds_max_num_rtn_f32">;
+defm DS_MIN_F64 : DS_Real_gfx12_with_name<0x052, "ds_min_num_f64">;
+defm DS_MAX_F64 : DS_Real_gfx12_with_name<0x053, "ds_max_num_f64">;
+defm DS_MIN_RTN_F64 : DS_Real_gfx12_with_name<0x072, "ds_min_num_rtn_f64">;
+defm DS_MAX_RTN_F64 : DS_Real_gfx12_with_name<0x073, "ds_max_num_rtn_f64">;
defm DS_COND_SUB_U32 : DS_Real_gfx12<0x098>;
defm DS_SUB_CLAMP_U32 : DS_Real_gfx12<0x099>;
defm DS_COND_SUB_RTN_U32 : DS_Real_gfx12<0x0a8>;
@@ -1392,8 +1459,8 @@ defm DS_LOAD_TR6_B96 : DS_Real_gfx12<0x0fb>;
defm DS_LOAD_TR16_B128 : DS_Real_gfx12<0x0fc>;
defm DS_LOAD_TR8_B64 : DS_Real_gfx12<0x0fd>;
-defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0,
- "ds_bvh_stack_push4_pop1_rtn_b32">;
+defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0, DS_BVH_STACK_RTN_B32,
+ "ds_bvh_stack_push4_pop1_rtn_b32">;
defm DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_Real_gfx12<0x0e1>;
defm DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_Real_gfx12<0x0e2>;
@@ -1421,8 +1488,8 @@ def : MnemonicAlias<"ds_load_tr_b128", "ds_load_tr16_b128">, Requires<[isGFX1250
// GFX11.
//===----------------------------------------------------------------------===//
-multiclass DS_Real_gfx11<bits<8> op, string name = !tolower(NAME)> {
- defvar ps = !cast<DS_Pseudo>(NAME);
+multiclass DS_Real_gfx11<bits<8> op, DS_Pseudo ps = !cast<DS_Pseudo>(NAME#"_gfx9"),
+ string name = !tolower(NAME)> {
let AssemblerPredicate = isGFX11Only in {
let DecoderNamespace = "GFX11" in
def _gfx11 :
@@ -1433,8 +1500,11 @@ multiclass DS_Real_gfx11<bits<8> op, string name = !tolower(NAME)> {
} // End AssemblerPredicate
}
-multiclass DS_Real_gfx11_gfx12<bits<8> op, string name = !tolower(NAME)>
- : DS_Real_gfx11<op, name>, DS_Real_gfx12<op, name>;
+multiclass DS_Real_gfx11_gfx12<bits<8> op,
+ string name = !tolower(NAME),
+ DS_Pseudo ps = !cast<DS_Pseudo>(NAME#"_gfx9")>
+ : DS_Real_gfx11<op, ps, name>,
+ DS_Real_gfx12<op, ps, name>;
defm DS_WRITE_B32 : DS_Real_gfx11_gfx12<0x00d, "ds_store_b32">;
defm DS_WRITE2_B32 : DS_Real_gfx11_gfx12<0x00e, "ds_store_2addr_b32">;
@@ -1460,16 +1530,16 @@ defm DS_WRXCHG2ST64_RTN_B64 : DS_Real_gfx11_gfx12<0x06f, "ds_storexchg_2addr_str
defm DS_READ_B64 : DS_Real_gfx11_gfx12<0x076, "ds_load_b64">;
defm DS_READ2_B64 : DS_Real_gfx11_gfx12<0x077, "ds_load_2addr_b64">;
defm DS_READ2ST64_B64 : DS_Real_gfx11_gfx12<0x078, "ds_load_2addr_stride64_b64">;
-defm DS_WRITE_B8_D16_HI : DS_Real_gfx11_gfx12<0x0a0, "ds_store_b8_d16_hi">;
-defm DS_WRITE_B16_D16_HI : DS_Real_gfx11_gfx12<0x0a1, "ds_store_b16_d16_hi">;
-defm DS_READ_U8_D16 : DS_Real_gfx11_gfx12<0x0a2, "ds_load_u8_d16">;
-defm DS_READ_U8_D16_HI : DS_Real_gfx11_gfx12<0x0a3, "ds_load_u8_d16_hi">;
-defm DS_READ_I8_D16 : DS_Real_gfx11_gfx12<0x0a4, "ds_load_i8_d16">;
-defm DS_READ_I8_D16_HI : DS_Real_gfx11_gfx12<0x0a5, "ds_load_i8_d16_hi">;
-defm DS_READ_U16_D16 : DS_Real_gfx11_gfx12<0x0a6, "ds_load_u16_d16">;
-defm DS_READ_U16_D16_HI : DS_Real_gfx11_gfx12<0x0a7, "ds_load_u16_d16_hi">;
-defm DS_WRITE_ADDTID_B32 : DS_Real_gfx11_gfx12<0x0b0, "ds_store_addtid_b32">;
-defm DS_READ_ADDTID_B32 : DS_Real_gfx11_gfx12<0x0b1, "ds_load_addtid_b32">;
+defm DS_WRITE_B8_D16_HI : DS_Real_gfx11_gfx12<0x0a0, "ds_store_b8_d16_hi", DS_WRITE_B8_D16_HI>;
+defm DS_WRITE_B16_D16_HI : DS_Real_gfx11_gfx12<0x0a1, "ds_store_b16_d16_hi", DS_WRITE_B16_D16_HI>;
+defm DS_READ_U8_D16 : DS_Real_gfx11_gfx12<0x0a2, "ds_load_u8_d16", DS_READ_U8_D16>;
+defm DS_READ_U8_D16_HI : DS_Real_gfx11_gfx12<0x0a3, "ds_load_u8_d16_hi", DS_READ_U8_D16_HI>;
+defm DS_READ_I8_D16 : DS_Real_gfx11_gfx12<0x0a4, "ds_load_i8_d16", DS_READ_I8_D16>;
+defm DS_READ_I8_D16_HI : DS_Real_gfx11_gfx12<0x0a5, "ds_load_i8_d16_hi", DS_READ_I8_D16_HI>;
+defm DS_READ_U16_D16 : DS_Real_gfx11_gfx12<0x0a6, "ds_load_u16_d16", DS_READ_U16_D16>;
+defm DS_READ_U16_D16_HI : DS_Real_gfx11_gfx12<0x0a7, "ds_load_u16_d16_hi", DS_READ_U16_D16_HI>;
+defm DS_WRITE_ADDTID_B32 : DS_Real_gfx11_gfx12<0x0b0, "ds_store_addtid_b32", DS_WRITE_ADDTID_B32>;
+defm DS_READ_ADDTID_B32 : DS_Real_gfx11_gfx12<0x0b1, "ds_load_addtid_b32", DS_READ_ADDTID_B32>;
defm DS_WRITE_B96 : DS_Real_gfx11_gfx12<0x0de, "ds_store_b96">;
defm DS_WRITE_B128 : DS_Real_gfx11_gfx12<0x0df, "ds_store_b128">;
defm DS_READ_B96 : DS_Real_gfx11_gfx12<0x0fe, "ds_load_b96">;
@@ -1489,22 +1559,22 @@ defm DS_CMPSTORE_RTN_B64 : DS_Real_gfx11_gfx12<0x070>;
defm DS_CMPSTORE_RTN_F64 : DS_Real_gfx11<0x071>;
defm DS_ADD_RTN_F32 : DS_Real_gfx11_gfx12<0x079>;
-defm DS_ADD_GS_REG_RTN : DS_Real_gfx11<0x07a>;
-defm DS_SUB_GS_REG_RTN : DS_Real_gfx11<0x07b>;
-defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx11<0x0ad>;
+defm DS_ADD_GS_REG_RTN : DS_Real_gfx11<0x07a, DS_ADD_GS_REG_RTN>;
+defm DS_SUB_GS_REG_RTN : DS_Real_gfx11<0x07b, DS_SUB_GS_REG_RTN>;
+defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx11<0x0ad, DS_BVH_STACK_RTN_B32>;
//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===//
let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
- multiclass DS_Real_gfx10<bits<8> op> {
+ multiclass DS_Real_gfx10<bits<8> op, DS_Pseudo ps = !cast<DS_Pseudo>(NAME)> {
def _gfx10 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op,
- !cast<DS_Pseudo>(NAME), SIEncodingFamily.GFX10>;
+ ps, SIEncodingFamily.GFX10>;
}
} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
-defm DS_ADD_RTN_F32 : DS_Real_gfx10<0x055>;
+defm DS_ADD_RTN_F32 : DS_Real_gfx10<0x055, DS_ADD_RTN_F32_gfx9>;
defm DS_WRITE_B8_D16_HI : DS_Real_gfx10<0x0a0>;
defm DS_WRITE_B16_D16_HI : DS_Real_gfx10<0x0a1>;
defm DS_READ_U8_D16 : DS_Real_gfx10<0x0a2>;
@@ -1520,39 +1590,48 @@ defm DS_READ_ADDTID_B32 : DS_Real_gfx10<0x0b1>;
// GFX10, GFX11, GFX12.
//===----------------------------------------------------------------------===//
-multiclass DS_Real_gfx10_gfx11_gfx12<bits<8> op> :
- DS_Real_gfx10<op>, DS_Real_gfx11<op>, DS_Real_gfx12<op>;
+multiclass DS_Real_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps = !cast<DS_Pseudo>(NAME#"_gfx9")> :
+ DS_Real_gfx10<op, ps>,
+ DS_Real_gfx11<op, ps>,
+ DS_Real_gfx12<op, ps>;
-multiclass DS_Real_gfx10_gfx11<bits<8> op> :
- DS_Real_gfx10<op>, DS_Real_gfx11<op>;
+multiclass DS_Real_gfx10_gfx11<bits<8> op, DS_Pseudo ps = !cast<DS_Pseudo>(NAME#"_gfx9")> :
+ DS_Real_gfx10<op, ps>, DS_Real_gfx11<op, ps>;
defm DS_ADD_F32 : DS_Real_gfx10_gfx11_gfx12<0x015>;
defm DS_ADD_SRC2_F32 : DS_Real_gfx10<0x095>;
-defm DS_PERMUTE_B32 : DS_Real_gfx10_gfx11_gfx12<0x0b2>;
-defm DS_BPERMUTE_B32 : DS_Real_gfx10_gfx11_gfx12<0x0b3>;
+defm DS_PERMUTE_B32 : DS_Real_gfx10_gfx11_gfx12<0x0b2, DS_PERMUTE_B32>;
+defm DS_BPERMUTE_B32 : DS_Real_gfx10_gfx11_gfx12<0x0b3, DS_BPERMUTE_B32>;
//===----------------------------------------------------------------------===//
// GFX7, GFX10, GFX11, GFX12.
//===----------------------------------------------------------------------===//
let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in {
- multiclass DS_Real_gfx7<bits<8> op> {
+ multiclass DS_Real_gfx7<bits<8> op, DS_Pseudo ps> {
def _gfx7 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op,
- !cast<DS_Pseudo>(NAME), SIEncodingFamily.SI>;
+ ps, SIEncodingFamily.SI>;
}
} // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7"
-multiclass DS_Real_gfx7_gfx10_gfx11_gfx12<bits<8> op> :
- DS_Real_gfx7<op>, DS_Real_gfx10_gfx11_gfx12<op>;
+multiclass DS_Real_gfx7_gfx10_gfx11_gfx12<bits<8> op,
+ DS_Pseudo ps_gfx6 = !cast<DS_Pseudo>(NAME),
+ DS_Pseudo ps_gfx9 = !cast<DS_Pseudo>(NAME#"_gfx9")> :
+ DS_Real_gfx7<op, ps_gfx6>,
+ DS_Real_gfx10_gfx11_gfx12<op, ps_gfx9>;
-multiclass DS_Real_gfx7_gfx10_gfx11<bits<8> op> :
- DS_Real_gfx7<op>, DS_Real_gfx10_gfx11<op>;
+multiclass DS_Real_gfx7_gfx10_gfx11<bits<8> op,
+ DS_Pseudo ps_gfx6 = !cast<DS_Pseudo>(NAME),
+ DS_Pseudo ps_gfx9 = !cast<DS_Pseudo>(NAME#"_gfx9")> :
+ DS_Real_gfx7<op, ps_gfx6>, DS_Real_gfx10_gfx11<op, ps_gfx9>;
-multiclass DS_Real_gfx7_gfx10<bits<8> op> :
- DS_Real_gfx7<op>, DS_Real_gfx10<op>;
+multiclass DS_Real_gfx7_gfx10<bits<8> op,
+ DS_Pseudo ps_gfx6 = !cast<DS_Pseudo>(NAME),
+ DS_Pseudo ps_gfx9 = !cast<DS_Pseudo>(NAME#"_gfx9")> :
+ DS_Real_gfx7<op, ps_gfx6>, DS_Real_gfx10<op, ps_gfx9>;
// FIXME-GFX7: Add tests when upstreaming this part.
-defm DS_GWS_SEMA_RELEASE_ALL : DS_Real_gfx7_gfx10_gfx11<0x018>;
+defm DS_GWS_SEMA_RELEASE_ALL : DS_Real_gfx7_gfx10_gfx11<0x018, DS_GWS_SEMA_RELEASE_ALL, DS_GWS_SEMA_RELEASE_ALL>;
defm DS_WRAP_RTN_B32 : DS_Real_gfx7_gfx10_gfx11<0x034>;
defm DS_CONDXCHG32_RTN_B64 : DS_Real_gfx7_gfx10_gfx11_gfx12<0x07e>;
defm DS_WRITE_B96 : DS_Real_gfx7_gfx10<0x0de>;
@@ -1565,20 +1644,27 @@ defm DS_READ_B128 : DS_Real_gfx7_gfx10<0x0ff>;
//===----------------------------------------------------------------------===//
let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
- multiclass DS_Real_gfx6_gfx7<bits<8> op> {
+ multiclass DS_Real_gfx6_gfx7<bits<8> op, DS_Pseudo ps> {
def _gfx6_gfx7 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op,
- !cast<DS_Pseudo>(NAME), SIEncodingFamily.SI>;
+ ps, SIEncodingFamily.SI>;
}
} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
-multiclass DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op> :
- DS_Real_gfx6_gfx7<op>, DS_Real_gfx10_gfx11_gfx12<op>;
+multiclass DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op,
+ DS_Pseudo ps_gfx6 = !cast<DS_Pseudo>(NAME),
+ DS_Pseudo ps_gfx9 = !cast<DS_Pseudo>(NAME#"_gfx9")> :
+ DS_Real_gfx6_gfx7<op, ps_gfx6>,
+ DS_Real_gfx10_gfx11_gfx12<op, ps_gfx9>;
-multiclass DS_Real_gfx6_gfx7_gfx10_gfx11<bits<8> op> :
- DS_Real_gfx6_gfx7<op>, DS_Real_gfx10_gfx11<op>;
+multiclass DS_Real_gfx6_gfx7_gfx10_gfx11<bits<8> op,
+ DS_Pseudo ps_gfx6 = !cast<DS_Pseudo>(NAME),
+ DS_Pseudo ps_gfx9 = !cast<DS_Pseudo>(NAME#"_gfx9")> :
+ DS_Real_gfx6_gfx7<op, ps_gfx6>, DS_Real_gfx10_gfx11<op, ps_gfx9>;
-multiclass DS_Real_gfx6_gfx7_gfx10<bits<8> op> :
- DS_Real_gfx6_gfx7<op>, DS_Real_gfx10<op>;
+multiclass DS_Real_gfx6_gfx7_gfx10<bits<8> op,
+ DS_Pseudo ps_gfx6 = !cast<DS_Pseudo>(NAME),
+ DS_Pseudo ps_gfx9 = !cast<DS_Pseudo>(NAME#"_gfx9")> :
+ DS_Real_gfx6_gfx7<op, ps_gfx6>, DS_Real_gfx10<op, ps_gfx9>;
defm DS_ADD_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x000>;
defm DS_SUB_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x001>;
@@ -1602,12 +1688,12 @@ defm DS_CMPST_F32 : DS_Real_gfx6_gfx7_gfx10<0x011>;
defm DS_MIN_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x012>;
defm DS_MAX_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x013>;
-defm DS_NOP : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x014>;
-defm DS_GWS_INIT : DS_Real_gfx6_gfx7_gfx10_gfx11<0x019>;
-defm DS_GWS_SEMA_V : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01a>;
-defm DS_GWS_SEMA_BR : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01b>;
-defm DS_GWS_SEMA_P : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01c>;
-defm DS_GWS_BARRIER : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01d>;
+defm DS_NOP : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x014, DS_NOP, DS_NOP>;
+defm DS_GWS_INIT : DS_Real_gfx6_gfx7_gfx10_gfx11<0x019, DS_GWS_INIT, DS_GWS_INIT>;
+defm DS_GWS_SEMA_V : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01a, DS_GWS_SEMA_V, DS_GWS_SEMA_V>;
+defm DS_GWS_SEMA_BR : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01b, DS_GWS_SEMA_BR, DS_GWS_SEMA_BR>;
+defm DS_GWS_SEMA_P : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01c, DS_GWS_SEMA_P, DS_GWS_SEMA_P>;
+defm DS_GWS_BARRIER : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01d, DS_GWS_BARRIER, DS_GWS_BARRIER>;
defm DS_WRITE_B8 : DS_Real_gfx6_gfx7_gfx10<0x01e>;
defm DS_WRITE_B16 : DS_Real_gfx6_gfx7_gfx10<0x01f>;
@@ -1634,7 +1720,7 @@ defm DS_CMPST_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x031>;
defm DS_MIN_RTN_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x032>;
defm DS_MAX_RTN_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x033>;
-defm DS_SWIZZLE_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x035>;
+defm DS_SWIZZLE_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x035, DS_SWIZZLE_B32, DS_SWIZZLE_B32>;
defm DS_READ_B32 : DS_Real_gfx6_gfx7_gfx10<0x036>;
defm DS_READ2_B32 : DS_Real_gfx6_gfx7_gfx10<0x037>;
@@ -1644,9 +1730,9 @@ defm DS_READ_U8 : DS_Real_gfx6_gfx7_gfx10<0x03a>;
defm DS_READ_I16 : DS_Real_gfx6_gfx7_gfx10<0x03b>;
defm DS_READ_U16 : DS_Real_gfx6_gfx7_gfx10<0x03c>;
-defm DS_CONSUME : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x03d>;
-defm DS_APPEND : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x03e>;
-defm DS_ORDERED_COUNT : DS_Real_gfx6_gfx7_gfx10_gfx11<0x03f>;
+defm DS_CONSUME : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x03d, DS_CONSUME, DS_CONSUME>;
+defm DS_APPEND : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x03e, DS_APPEND, DS_APPEND>;
+defm DS_ORDERED_COUNT : DS_Real_gfx6_gfx7_gfx10_gfx11<0x03f, DS_ORDERED_COUNT, DS_ORDERED_COUNT>;
defm DS_ADD_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x040>;
defm DS_SUB_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x041>;
defm DS_RSUB_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x042>;
@@ -1695,42 +1781,42 @@ defm DS_MAX_RTN_F64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x073>;
defm DS_READ_B64 : DS_Real_gfx6_gfx7_gfx10<0x076>;
defm DS_READ2_B64 : DS_Real_gfx6_gfx7_gfx10<0x077>;
defm DS_READ2ST64_B64 : DS_Real_gfx6_gfx7_gfx10<0x078>;
-defm DS_ADD_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x080>;
-defm DS_SUB_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x081>;
-defm DS_RSUB_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x082>;
-defm DS_INC_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x083>;
-defm DS_DEC_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x084>;
-defm DS_MIN_SRC2_I32 : DS_Real_gfx6_gfx7_gfx10<0x085>;
-defm DS_MAX_SRC2_I32 : DS_Real_gfx6_gfx7_gfx10<0x086>;
-defm DS_MIN_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x087>;
-defm DS_MAX_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x088>;
-defm DS_AND_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x089>;
-defm DS_OR_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08a>;
-defm DS_XOR_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08b>;
-defm DS_WRITE_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08d>;
-defm DS_MIN_SRC2_F32 : DS_Real_gfx6_gfx7_gfx10<0x092>;
-defm DS_MAX_SRC2_F32 : DS_Real_gfx6_gfx7_gfx10<0x093>;
-defm DS_ADD_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c0>;
-defm DS_SUB_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c1>;
-defm DS_RSUB_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c2>;
-defm DS_INC_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c3>;
-defm DS_DEC_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c4>;
-defm DS_MIN_SRC2_I64 : DS_Real_gfx6_gfx7_gfx10<0x0c5>;
-defm DS_MAX_SRC2_I64 : DS_Real_gfx6_gfx7_gfx10<0x0c6>;
-defm DS_MIN_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c7>;
-defm DS_MAX_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c8>;
-defm DS_AND_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0c9>;
-defm DS_OR_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0ca>;
-defm DS_XOR_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0cb>;
-defm DS_WRITE_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0cd>;
-defm DS_MIN_SRC2_F64 : DS_Real_gfx6_gfx7_gfx10<0x0d2>;
-defm DS_MAX_SRC2_F64 : DS_Real_gfx6_gfx7_gfx10<0x0d3>;
+defm DS_ADD_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x080, DS_ADD_SRC2_U32, DS_ADD_SRC2_U32>;
+defm DS_SUB_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x081, DS_SUB_SRC2_U32, DS_SUB_SRC2_U32>;
+defm DS_RSUB_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x082, DS_RSUB_SRC2_U32, DS_RSUB_SRC2_U32>;
+defm DS_INC_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x083, DS_INC_SRC2_U32, DS_INC_SRC2_U32>;
+defm DS_DEC_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x084, DS_DEC_SRC2_U32, DS_DEC_SRC2_U32>;
+defm DS_MIN_SRC2_I32 : DS_Real_gfx6_gfx7_gfx10<0x085, DS_MIN_SRC2_I32, DS_MIN_SRC2_I32>;
+defm DS_MAX_SRC2_I32 : DS_Real_gfx6_gfx7_gfx10<0x086, DS_MAX_SRC2_I32, DS_MAX_SRC2_I32>;
+defm DS_MIN_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x087, DS_MIN_SRC2_U32, DS_MIN_SRC2_U32>;
+defm DS_MAX_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x088, DS_MAX_SRC2_U32, DS_MAX_SRC2_U32>;
+defm DS_AND_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x089, DS_AND_SRC2_B32, DS_AND_SRC2_B32>;
+defm DS_OR_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08a, DS_OR_SRC2_B32, DS_OR_SRC2_B32>;
+defm DS_XOR_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08b, DS_XOR_SRC2_B32, DS_XOR_SRC2_B32>;
+defm DS_WRITE_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08d, DS_WRITE_SRC2_B32, DS_WRITE_SRC2_B32>;
+defm DS_MIN_SRC2_F32 : DS_Real_gfx6_gfx7_gfx10<0x092, DS_MIN_SRC2_F32, DS_MIN_SRC2_F32>;
+defm DS_MAX_SRC2_F32 : DS_Real_gfx6_gfx7_gfx10<0x093, DS_MAX_SRC2_F32, DS_MAX_SRC2_F32>;
+defm DS_ADD_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c0, DS_ADD_SRC2_U64, DS_ADD_SRC2_U64>;
+defm DS_SUB_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c1, DS_SUB_SRC2_U64, DS_SUB_SRC2_U64>;
+defm DS_RSUB_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c2, DS_RSUB_SRC2_U64, DS_RSUB_SRC2_U64>;
+defm DS_INC_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c3, DS_INC_SRC2_U64, DS_INC_SRC2_U64>;
+defm DS_DEC_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c4, DS_DEC_SRC2_U64, DS_DEC_SRC2_U64>;
+defm DS_MIN_SRC2_I64 : DS_Real_gfx6_gfx7_gfx10<0x0c5, DS_MIN_SRC2_I64, DS_MIN_SRC2_I64>;
+defm DS_MAX_SRC2_I64 : DS_Real_gfx6_gfx7_gfx10<0x0c6, DS_MAX_SRC2_I64, DS_MAX_SRC2_I64>;
+defm DS_MIN_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c7, DS_MIN_SRC2_U64, DS_MIN_SRC2_U64>;
+defm DS_MAX_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c8, DS_MAX_SRC2_U64, DS_MAX_SRC2_U64>;
+defm DS_AND_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0c9, DS_AND_SRC2_B64, DS_AND_SRC2_B64>;
+defm DS_OR_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0ca, DS_OR_SRC2_B64, DS_OR_SRC2_B64>;
+defm DS_XOR_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0cb, DS_XOR_SRC2_B64, DS_XOR_SRC2_B64>;
+defm DS_WRITE_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0cd, DS_WRITE_SRC2_B64, DS_WRITE_SRC2_B64>;
+defm DS_MIN_SRC2_F64 : DS_Real_gfx6_gfx7_gfx10<0x0d2, DS_MIN_SRC2_F64, DS_MIN_SRC2_F64>;
+defm DS_MAX_SRC2_F64 : DS_Real_gfx6_gfx7_gfx10<0x0d3, DS_MAX_SRC2_F64, DS_MAX_SRC2_F64>;
//===----------------------------------------------------------------------===//
// GFX8, GFX9 (VI).
//===----------------------------------------------------------------------===//
-class DS_Real_vi <bits<8> op, DS_Pseudo ps> :
+class DS_Real_Base_vi <bits<8> op, DS_Pseudo ps> :
DS_Real <ps>,
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
let AssemblerPredicate = isGFX8GFX9;
@@ -1749,181 +1835,210 @@ class DS_Real_vi <bits<8> op, DS_Pseudo ps> :
let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, 0);
}
-def DS_ADD_U32_vi : DS_Real_vi<0x0, DS_ADD_U32>;
-def DS_SUB_U32_vi : DS_Real_vi<0x1, DS_SUB_U32>;
-def DS_RSUB_U32_vi : DS_Real_vi<0x2, DS_RSUB_U32>;
-def DS_INC_U32_vi : DS_Real_vi<0x3, DS_INC_U32>;
-def DS_DEC_U32_vi : DS_Real_vi<0x4, DS_DEC_U32>;
-def DS_MIN_I32_vi : DS_Real_vi<0x5, DS_MIN_I32>;
-def DS_MAX_I32_vi : DS_Real_vi<0x6, DS_MAX_I32>;
-def DS_MIN_U32_vi : DS_Real_vi<0x7, DS_MIN_U32>;
-def DS_MAX_U32_vi : DS_Real_vi<0x8, DS_MAX_U32>;
-def DS_AND_B32_vi : DS_Real_vi<0x9, DS_AND_B32>;
-def DS_OR_B32_vi : DS_Real_vi<0xa, DS_OR_B32>;
-def DS_XOR_B32_vi : DS_Real_vi<0xb, DS_XOR_B32>;
-def DS_MSKOR_B32_vi : DS_Real_vi<0xc, DS_MSKOR_B32>;
-def DS_WRITE_B32_vi : DS_Real_vi<0xd, DS_WRITE_B32>;
-def DS_WRITE2_B32_vi : DS_Real_vi<0xe, DS_WRITE2_B32>;
-def DS_WRITE2ST64_B32_vi : DS_Real_vi<0xf, DS_WRITE2ST64_B32>;
-def DS_CMPST_B32_vi : DS_Real_vi<0x10, DS_CMPST_B32>;
-def DS_CMPST_F32_vi : DS_Real_vi<0x11, DS_CMPST_F32>;
-def DS_MIN_F32_vi : DS_Real_vi<0x12, DS_MIN_F32>;
-def DS_MAX_F32_vi : DS_Real_vi<0x13, DS_MAX_F32>;
-def DS_NOP_vi : DS_Real_vi<0x14, DS_NOP>;
-def DS_ADD_F32_vi : DS_Real_vi<0x15, DS_ADD_F32>;
-def DS_GWS_INIT_vi : DS_Real_vi<0x99, DS_GWS_INIT>;
-def DS_GWS_SEMA_V_vi : DS_Real_vi<0x9a, DS_GWS_SEMA_V>;
-def DS_GWS_SEMA_BR_vi : DS_Real_vi<0x9b, DS_GWS_SEMA_BR>;
-def DS_GWS_SEMA_P_vi : DS_Real_vi<0x9c, DS_GWS_SEMA_P>;
-def DS_GWS_BARRIER_vi : DS_Real_vi<0x9d, DS_GWS_BARRIER>;
-def DS_WRITE_ADDTID_B32_vi : DS_Real_vi<0x1d, DS_WRITE_ADDTID_B32>;
-def DS_WRITE_B8_vi : DS_Real_vi<0x1e, DS_WRITE_B8>;
-def DS_WRITE_B16_vi : DS_Real_vi<0x1f, DS_WRITE_B16>;
-def DS_ADD_RTN_U32_vi : DS_Real_vi<0x20, DS_ADD_RTN_U32>;
-def DS_SUB_RTN_U32_vi : DS_Real_vi<0x21, DS_SUB_RTN_U32>;
-def DS_RSUB_RTN_U32_vi : DS_Real_vi<0x22, DS_RSUB_RTN_U32>;
-def DS_INC_RTN_U32_vi : DS_Real_vi<0x23, DS_INC_RTN_U32>;
-def DS_DEC_RTN_U32_vi : DS_Real_vi<0x24, DS_DEC_RTN_U32>;
-def DS_MIN_RTN_I32_vi : DS_Real_vi<0x25, DS_MIN_RTN_I32>;
-def DS_MAX_RTN_I32_vi : DS_Real_vi<0x26, DS_MAX_RTN_I32>;
-def DS_MIN_RTN_U32_vi : DS_Real_vi<0x27, DS_MIN_RTN_U32>;
-def DS_MAX_RTN_U32_vi : DS_Real_vi<0x28, DS_MAX_RTN_U32>;
-def DS_AND_RTN_B32_vi : DS_Real_vi<0x29, DS_AND_RTN_B32>;
-def DS_OR_RTN_B32_vi : DS_Real_vi<0x2a, DS_OR_RTN_B32>;
-def DS_XOR_RTN_B32_vi : DS_Real_vi<0x2b, DS_XOR_RTN_B32>;
-def DS_MSKOR_RTN_B32_vi : DS_Real_vi<0x2c, DS_MSKOR_RTN_B32>;
-def DS_WRXCHG_RTN_B32_vi : DS_Real_vi<0x2d, DS_WRXCHG_RTN_B32>;
-def DS_WRXCHG2_RTN_B32_vi : DS_Real_vi<0x2e, DS_WRXCHG2_RTN_B32>;
-def DS_WRXCHG2ST64_RTN_B32_vi : DS_Real_vi<0x2f, DS_WRXCHG2ST64_RTN_B32>;
-def DS_CMPST_RTN_B32_vi : DS_Real_vi<0x30, DS_CMPST_RTN_B32>;
-def DS_CMPST_RTN_F32_vi : DS_Real_vi<0x31, DS_CMPST_RTN_F32>;
-def DS_MIN_RTN_F32_vi : DS_Real_vi<0x32, DS_MIN_RTN_F32>;
-def DS_MAX_RTN_F32_vi : DS_Real_vi<0x33, DS_MAX_RTN_F32>;
-def DS_WRAP_RTN_B32_vi : DS_Real_vi<0x34, DS_WRAP_RTN_B32>;
-def DS_ADD_RTN_F32_vi : DS_Real_vi<0x35, DS_ADD_RTN_F32>;
-def DS_READ_B32_vi : DS_Real_vi<0x36, DS_READ_B32>;
-def DS_READ2_B32_vi : DS_Real_vi<0x37, DS_READ2_B32>;
-def DS_READ2ST64_B32_vi : DS_Real_vi<0x38, DS_READ2ST64_B32>;
-def DS_READ_I8_vi : DS_Real_vi<0x39, DS_READ_I8>;
-def DS_READ_U8_vi : DS_Real_vi<0x3a, DS_READ_U8>;
-def DS_READ_I16_vi : DS_Real_vi<0x3b, DS_READ_I16>;
-def DS_READ_U16_vi : DS_Real_vi<0x3c, DS_READ_U16>;
-def DS_READ_ADDTID_B32_vi : DS_Real_vi<0xb6, DS_READ_ADDTID_B32>;
-def DS_CONSUME_vi : DS_Real_vi<0xbd, DS_CONSUME>;
-def DS_APPEND_vi : DS_Real_vi<0xbe, DS_APPEND>;
-def DS_ORDERED_COUNT_vi : DS_Real_vi<0xbf, DS_ORDERED_COUNT>;
-def DS_SWIZZLE_B32_vi : DS_Real_vi<0x3d, DS_SWIZZLE_B32>;
-def DS_PERMUTE_B32_vi : DS_Real_vi<0x3e, DS_PERMUTE_B32>;
-def DS_BPERMUTE_B32_vi : DS_Real_vi<0x3f, DS_BPERMUTE_B32>;
-
-def DS_ADD_U64_vi : DS_Real_vi<0x40, DS_ADD_U64>;
-def DS_SUB_U64_vi : DS_Real_vi<0x41, DS_SUB_U64>;
-def DS_RSUB_U64_vi : DS_Real_vi<0x42, DS_RSUB_U64>;
-def DS_INC_U64_vi : DS_Real_vi<0x43, DS_INC_U64>;
-def DS_DEC_U64_vi : DS_Real_vi<0x44, DS_DEC_U64>;
-def DS_MIN_I64_vi : DS_Real_vi<0x45, DS_MIN_I64>;
-def DS_MAX_I64_vi : DS_Real_vi<0x46, DS_MAX_I64>;
-def DS_MIN_U64_vi : DS_Real_vi<0x47, DS_MIN_U64>;
-def DS_MAX_U64_vi : DS_Real_vi<0x48, DS_MAX_U64>;
-def DS_AND_B64_vi : DS_Real_vi<0x49, DS_AND_B64>;
-def DS_OR_B64_vi : DS_Real_vi<0x4a, DS_OR_B64>;
-def DS_XOR_B64_vi : DS_Real_vi<0x4b, DS_XOR_B64>;
-def DS_MSKOR_B64_vi : DS_Real_vi<0x4c, DS_MSKOR_B64>;
-def DS_WRITE_B64_vi : DS_Real_vi<0x4d, DS_WRITE_B64>;
-def DS_WRITE2_B64_vi : DS_Real_vi<0x4E, DS_WRITE2_B64>;
-def DS_WRITE2ST64_B64_vi : DS_Real_vi<0x4f, DS_WRITE2ST64_B64>;
-def DS_CMPST_B64_vi : DS_Real_vi<0x50, DS_CMPST_B64>;
-def DS_CMPST_F64_vi : DS_Real_vi<0x51, DS_CMPST_F64>;
-def DS_MIN_F64_vi : DS_Real_vi<0x52, DS_MIN_F64>;
-def DS_MAX_F64_vi : DS_Real_vi<0x53, DS_MAX_F64>;
-
-def DS_WRITE_B8_D16_HI_vi : DS_Real_vi<0x54, DS_WRITE_B8_D16_HI>;
-def DS_WRITE_B16_D16_HI_vi : DS_Real_vi<0x55, DS_WRITE_B16_D16_HI>;
-
-def DS_READ_U8_D16_vi : DS_Real_vi<0x56, DS_READ_U8_D16>;
-def DS_READ_U8_D16_HI_vi : DS_Real_vi<0x57, DS_READ_U8_D16_HI>;
-def DS_READ_I8_D16_vi : DS_Real_vi<0x58, DS_READ_I8_D16>;
-def DS_READ_I8_D16_HI_vi : DS_Real_vi<0x59, DS_READ_I8_D16_HI>;
-def DS_READ_U16_D16_vi : DS_Real_vi<0x5a, DS_READ_U16_D16>;
-def DS_READ_U16_D16_HI_vi : DS_Real_vi<0x5b, DS_READ_U16_D16_HI>;
-
-def DS_ADD_RTN_U64_vi : DS_Real_vi<0x60, DS_ADD_RTN_U64>;
-def DS_SUB_RTN_U64_vi : DS_Real_vi<0x61, DS_SUB_RTN_U64>;
-def DS_RSUB_RTN_U64_vi : DS_Real_vi<0x62, DS_RSUB_RTN_U64>;
-def DS_INC_RTN_U64_vi : DS_Real_vi<0x63, DS_INC_RTN_U64>;
-def DS_DEC_RTN_U64_vi : DS_Real_vi<0x64, DS_DEC_RTN_U64>;
-def DS_MIN_RTN_I64_vi : DS_Real_vi<0x65, DS_MIN_RTN_I64>;
-def DS_MAX_RTN_I64_vi : DS_Real_vi<0x66, DS_MAX_RTN_I64>;
-def DS_MIN_RTN_U64_vi : DS_Real_vi<0x67, DS_MIN_RTN_U64>;
-def DS_MAX_RTN_U64_vi : DS_Real_vi<0x68, DS_MAX_RTN_U64>;
-def DS_AND_RTN_B64_vi : DS_Real_vi<0x69, DS_AND_RTN_B64>;
-def DS_OR_RTN_B64_vi : DS_Real_vi<0x6a, DS_OR_RTN_B64>;
-def DS_XOR_RTN_B64_vi : DS_Real_vi<0x6b, DS_XOR_RTN_B64>;
-def DS_MSKOR_RTN_B64_vi : DS_Real_vi<0x6c, DS_MSKOR_RTN_B64>;
-def DS_WRXCHG_RTN_B64_vi : DS_Real_vi<0x6d, DS_WRXCHG_RTN_B64>;
-def DS_WRXCHG2_RTN_B64_vi : DS_Real_vi<0x6e, DS_WRXCHG2_RTN_B64>;
-def DS_WRXCHG2ST64_RTN_B64_vi : DS_Real_vi<0x6f, DS_WRXCHG2ST64_RTN_B64>;
-def DS_CONDXCHG32_RTN_B64_vi : DS_Real_vi<0x7e, DS_CONDXCHG32_RTN_B64>;
-def DS_GWS_SEMA_RELEASE_ALL_vi : DS_Real_vi<0x98, DS_GWS_SEMA_RELEASE_ALL>;
-def DS_CMPST_RTN_B64_vi : DS_Real_vi<0x70, DS_CMPST_RTN_B64>;
-def DS_CMPST_RTN_F64_vi : DS_Real_vi<0x71, DS_CMPST_RTN_F64>;
-def DS_MIN_RTN_F64_vi : DS_Real_vi<0x72, DS_MIN_RTN_F64>;
-def DS_MAX_RTN_F64_vi : DS_Real_vi<0x73, DS_MAX_RTN_F64>;
-
-def DS_READ_B64_vi : DS_Real_vi<0x76, DS_READ_B64>;
-def DS_READ2_B64_vi : DS_Real_vi<0x77, DS_READ2_B64>;
-def DS_READ2ST64_B64_vi : DS_Real_vi<0x78, DS_READ2ST64_B64>;
-
-def DS_ADD_SRC2_U32_vi : DS_Real_vi<0x80, DS_ADD_SRC2_U32>;
-def DS_SUB_SRC2_U32_vi : DS_Real_vi<0x81, DS_SUB_SRC2_U32>;
-def DS_RSUB_SRC2_U32_vi : DS_Real_vi<0x82, DS_RSUB_SRC2_U32>;
-def DS_INC_SRC2_U32_vi : DS_Real_vi<0x83, DS_INC_SRC2_U32>;
-def DS_DEC_SRC2_U32_vi : DS_Real_vi<0x84, DS_DEC_SRC2_U32>;
-def DS_MIN_SRC2_I32_vi : DS_Real_vi<0x85, DS_MIN_SRC2_I32>;
-def DS_MAX_SRC2_I32_vi : DS_Real_vi<0x86, DS_MAX_SRC2_I32>;
-def DS_MIN_SRC2_U32_vi : DS_Real_vi<0x87, DS_MIN_SRC2_U32>;
-def DS_MAX_SRC2_U32_vi : DS_Real_vi<0x88, DS_MAX_SRC2_U32>;
-def DS_AND_SRC2_B32_vi : DS_Real_vi<0x89, DS_AND_SRC2_B32>;
-def DS_OR_SRC2_B32_vi : DS_Real_vi<0x8a, DS_OR_SRC2_B32>;
-def DS_XOR_SRC2_B32_vi : DS_Real_vi<0x8b, DS_XOR_SRC2_B32>;
-def DS_WRITE_SRC2_B32_vi : DS_Real_vi<0x8d, DS_WRITE_SRC2_B32>;
-def DS_MIN_SRC2_F32_vi : DS_Real_vi<0x92, DS_MIN_SRC2_F32>;
-def DS_MAX_SRC2_F32_vi : DS_Real_vi<0x93, DS_MAX_SRC2_F32>;
-def DS_ADD_SRC2_F32_vi : DS_Real_vi<0x95, DS_ADD_SRC2_F32>;
-def DS_ADD_SRC2_U64_vi : DS_Real_vi<0xc0, DS_ADD_SRC2_U64>;
-def DS_SUB_SRC2_U64_vi : DS_Real_vi<0xc1, DS_SUB_SRC2_U64>;
-def DS_RSUB_SRC2_U64_vi : DS_Real_vi<0xc2, DS_RSUB_SRC2_U64>;
-def DS_INC_SRC2_U64_vi : DS_Real_vi<0xc3, DS_INC_SRC2_U64>;
-def DS_DEC_SRC2_U64_vi : DS_Real_vi<0xc4, DS_DEC_SRC2_U64>;
-def DS_MIN_SRC2_I64_vi : DS_Real_vi<0xc5, DS_MIN_SRC2_I64>;
-def DS_MAX_SRC2_I64_vi : DS_Real_vi<0xc6, DS_MAX_SRC2_I64>;
-def DS_MIN_SRC2_U64_vi : DS_Real_vi<0xc7, DS_MIN_SRC2_U64>;
-def DS_MAX_SRC2_U64_vi : DS_Real_vi<0xc8, DS_MAX_SRC2_U64>;
-def DS_AND_SRC2_B64_vi : DS_Real_vi<0xc9, DS_AND_SRC2_B64>;
-def DS_OR_SRC2_B64_vi : DS_Real_vi<0xca, DS_OR_SRC2_B64>;
-def DS_XOR_SRC2_B64_vi : DS_Real_vi<0xcb, DS_XOR_SRC2_B64>;
-def DS_WRITE_SRC2_B64_vi : DS_Real_vi<0xcd, DS_WRITE_SRC2_B64>;
-def DS_MIN_SRC2_F64_vi : DS_Real_vi<0xd2, DS_MIN_SRC2_F64>;
-def DS_MAX_SRC2_F64_vi : DS_Real_vi<0xd3, DS_MAX_SRC2_F64>;
-def DS_WRITE_B96_vi : DS_Real_vi<0xde, DS_WRITE_B96>;
-def DS_WRITE_B128_vi : DS_Real_vi<0xdf, DS_WRITE_B128>;
-def DS_READ_B96_vi : DS_Real_vi<0xfe, DS_READ_B96>;
-def DS_READ_B128_vi : DS_Real_vi<0xff, DS_READ_B128>;
+
+multiclass DS_Real_vi <bits<8> op, DS_Pseudo base_pseudo, bit need_gfx9_suffix = true> {
+ def "" : DS_Real_Base_vi<op, base_pseudo>;
+
+ if need_gfx9_suffix then {
+ def _gfx9 : DS_Real_Base_vi<op, !cast<DS_Pseudo>(!cast<string>(base_pseudo)#"_gfx9")> {
+ let DecoderNamespace = "GFX9";
+ }
+ }
+
+ // Handle cases that are available in all-AGPR or all-VGPR data
+ // operand forms. This should be used for all DS instructions with 2
+ // data operands.
+ defvar agpr_suffixed_name = !cast<string>(base_pseudo)#"_agpr";
+
+ if !exists<DS_Pseudo>(agpr_suffixed_name) then {
+ def _agpr : DS_Real_Base_vi<op, !cast<DS_Pseudo>(agpr_suffixed_name)> {
+ let DecoderNamespace = "GFX9";
+ let AssemblerPredicate = isGFX90APlus;
+ }
+ }
+}
+
+// Instructions which use m0 or not for both gfx8 and gfx9 (or did not
+// exist on gfx8)
+multiclass DS_Real_m0_vi<bits<8> op, DS_Pseudo ps> : DS_Real_vi<op, ps, false>;
+
+defm DS_ADD_U32_vi : DS_Real_vi<0x0, DS_ADD_U32>;
+defm DS_SUB_U32_vi : DS_Real_vi<0x1, DS_SUB_U32>;
+defm DS_RSUB_U32_vi : DS_Real_vi<0x2, DS_RSUB_U32>;
+defm DS_INC_U32_vi : DS_Real_vi<0x3, DS_INC_U32>;
+defm DS_DEC_U32_vi : DS_Real_vi<0x4, DS_DEC_U32>;
+defm DS_MIN_I32_vi : DS_Real_vi<0x5, DS_MIN_I32>;
+defm DS_MAX_I32_vi : DS_Real_vi<0x6, DS_MAX_I32>;
+defm DS_MIN_U32_vi : DS_Real_vi<0x7, DS_MIN_U32>;
+defm DS_MAX_U32_vi : DS_Real_vi<0x8, DS_MAX_U32>;
+defm DS_AND_B32_vi : DS_Real_vi<0x9, DS_AND_B32>;
+defm DS_OR_B32_vi : DS_Real_vi<0xa, DS_OR_B32>;
+defm DS_XOR_B32_vi : DS_Real_vi<0xb, DS_XOR_B32>;
+defm DS_MSKOR_B32_vi : DS_Real_vi<0xc, DS_MSKOR_B32>;
+defm DS_WRITE_B32_vi : DS_Real_vi<0xd, DS_WRITE_B32>;
+defm DS_WRITE2_B32_vi : DS_Real_vi<0xe, DS_WRITE2_B32>;
+defm DS_WRITE2ST64_B32_vi : DS_Real_vi<0xf, DS_WRITE2ST64_B32>;
+
+defm DS_CMPST_B32_vi : DS_Real_vi<0x10, DS_CMPST_B32>;
+defm DS_CMPST_F32_vi : DS_Real_vi<0x11, DS_CMPST_F32>;
+defm DS_MIN_F32_vi : DS_Real_vi<0x12, DS_MIN_F32>;
+defm DS_MAX_F32_vi : DS_Real_vi<0x13, DS_MAX_F32>;
+defm DS_NOP_vi : DS_Real_m0_vi<0x14, DS_NOP>;
+defm DS_ADD_F32_vi : DS_Real_vi<0x15, DS_ADD_F32>;
+defm DS_GWS_INIT_vi : DS_Real_m0_vi<0x99, DS_GWS_INIT>;
+defm DS_GWS_SEMA_V_vi : DS_Real_m0_vi<0x9a, DS_GWS_SEMA_V>;
+defm DS_GWS_SEMA_BR_vi : DS_Real_m0_vi<0x9b, DS_GWS_SEMA_BR>;
+defm DS_GWS_SEMA_P_vi : DS_Real_m0_vi<0x9c, DS_GWS_SEMA_P>;
+defm DS_GWS_BARRIER_vi : DS_Real_m0_vi<0x9d, DS_GWS_BARRIER>;
+defm DS_WRITE_ADDTID_B32_vi: DS_Real_m0_vi<0x1d, DS_WRITE_ADDTID_B32>;
+defm DS_WRITE_B8_vi : DS_Real_vi<0x1e, DS_WRITE_B8>;
+defm DS_WRITE_B16_vi : DS_Real_vi<0x1f, DS_WRITE_B16>;
+defm DS_ADD_RTN_U32_vi : DS_Real_vi<0x20, DS_ADD_RTN_U32>;
+defm DS_SUB_RTN_U32_vi : DS_Real_vi<0x21, DS_SUB_RTN_U32>;
+defm DS_RSUB_RTN_U32_vi : DS_Real_vi<0x22, DS_RSUB_RTN_U32>;
+defm DS_INC_RTN_U32_vi : DS_Real_vi<0x23, DS_INC_RTN_U32>;
+defm DS_DEC_RTN_U32_vi : DS_Real_vi<0x24, DS_DEC_RTN_U32>;
+defm DS_MIN_RTN_I32_vi : DS_Real_vi<0x25, DS_MIN_RTN_I32>;
+defm DS_MAX_RTN_I32_vi : DS_Real_vi<0x26, DS_MAX_RTN_I32>;
+defm DS_MIN_RTN_U32_vi : DS_Real_vi<0x27, DS_MIN_RTN_U32>;
+defm DS_MAX_RTN_U32_vi : DS_Real_vi<0x28, DS_MAX_RTN_U32>;
+defm DS_AND_RTN_B32_vi : DS_Real_vi<0x29, DS_AND_RTN_B32>;
+defm DS_OR_RTN_B32_vi : DS_Real_vi<0x2a, DS_OR_RTN_B32>;
+defm DS_XOR_RTN_B32_vi : DS_Real_vi<0x2b, DS_XOR_RTN_B32>;
+defm DS_MSKOR_RTN_B32_vi : DS_Real_vi<0x2c, DS_MSKOR_RTN_B32>;
+defm DS_WRXCHG_RTN_B32_vi : DS_Real_vi<0x2d, DS_WRXCHG_RTN_B32>;
+defm DS_WRXCHG2_RTN_B32_vi : DS_Real_vi<0x2e, DS_WRXCHG2_RTN_B32>;
+defm DS_WRXCHG2ST64_RTN_B32_vi : DS_Real_vi<0x2f, DS_WRXCHG2ST64_RTN_B32>;
+defm DS_CMPST_RTN_B32_vi : DS_Real_vi<0x30, DS_CMPST_RTN_B32>;
+defm DS_CMPST_RTN_F32_vi : DS_Real_vi<0x31, DS_CMPST_RTN_F32>;
+defm DS_MIN_RTN_F32_vi : DS_Real_vi<0x32, DS_MIN_RTN_F32>;
+defm DS_MAX_RTN_F32_vi : DS_Real_vi<0x33, DS_MAX_RTN_F32>;
+defm DS_WRAP_RTN_B32_vi : DS_Real_vi<0x34, DS_WRAP_RTN_B32>;
+defm DS_ADD_RTN_F32_vi : DS_Real_vi<0x35, DS_ADD_RTN_F32>;
+defm DS_READ_B32_vi : DS_Real_vi<0x36, DS_READ_B32>;
+defm DS_READ2_B32_vi : DS_Real_vi<0x37, DS_READ2_B32>;
+defm DS_READ2ST64_B32_vi : DS_Real_vi<0x38, DS_READ2ST64_B32>;
+defm DS_READ_I8_vi : DS_Real_vi<0x39, DS_READ_I8>;
+defm DS_READ_U8_vi : DS_Real_vi<0x3a, DS_READ_U8>;
+defm DS_READ_I16_vi : DS_Real_vi<0x3b, DS_READ_I16>;
+defm DS_READ_U16_vi : DS_Real_vi<0x3c, DS_READ_U16>;
+defm DS_READ_ADDTID_B32_vi : DS_Real_m0_vi<0xb6, DS_READ_ADDTID_B32>;
+defm DS_CONSUME_vi : DS_Real_m0_vi<0xbd, DS_CONSUME>;
+defm DS_APPEND_vi : DS_Real_m0_vi<0xbe, DS_APPEND>;
+defm DS_ORDERED_COUNT_vi : DS_Real_m0_vi<0xbf, DS_ORDERED_COUNT>;
+defm DS_SWIZZLE_B32_vi : DS_Real_m0_vi<0x3d, DS_SWIZZLE_B32>;
+defm DS_PERMUTE_B32_vi : DS_Real_m0_vi<0x3e, DS_PERMUTE_B32>;
+defm DS_BPERMUTE_B32_vi : DS_Real_m0_vi<0x3f, DS_BPERMUTE_B32>;
+
+defm DS_ADD_U64_vi : DS_Real_vi<0x40, DS_ADD_U64>;
+defm DS_SUB_U64_vi : DS_Real_vi<0x41, DS_SUB_U64>;
+defm DS_RSUB_U64_vi : DS_Real_vi<0x42, DS_RSUB_U64>;
+defm DS_INC_U64_vi : DS_Real_vi<0x43, DS_INC_U64>;
+defm DS_DEC_U64_vi : DS_Real_vi<0x44, DS_DEC_U64>;
+defm DS_MIN_I64_vi : DS_Real_vi<0x45, DS_MIN_I64>;
+defm DS_MAX_I64_vi : DS_Real_vi<0x46, DS_MAX_I64>;
+defm DS_MIN_U64_vi : DS_Real_vi<0x47, DS_MIN_U64>;
+defm DS_MAX_U64_vi : DS_Real_vi<0x48, DS_MAX_U64>;
+defm DS_AND_B64_vi : DS_Real_vi<0x49, DS_AND_B64>;
+defm DS_OR_B64_vi : DS_Real_vi<0x4a, DS_OR_B64>;
+defm DS_XOR_B64_vi : DS_Real_vi<0x4b, DS_XOR_B64>;
+defm DS_MSKOR_B64_vi : DS_Real_vi<0x4c, DS_MSKOR_B64>;
+defm DS_WRITE_B64_vi : DS_Real_vi<0x4d, DS_WRITE_B64>;
+defm DS_WRITE2_B64_vi : DS_Real_vi<0x4E, DS_WRITE2_B64>;
+defm DS_WRITE2ST64_B64_vi : DS_Real_vi<0x4f, DS_WRITE2ST64_B64>;
+
+defm DS_CMPST_B64_vi : DS_Real_vi<0x50, DS_CMPST_B64>;
+defm DS_CMPST_F64_vi : DS_Real_vi<0x51, DS_CMPST_F64>;
+defm DS_MIN_F64_vi : DS_Real_vi<0x52, DS_MIN_F64>;
+defm DS_MAX_F64_vi : DS_Real_vi<0x53, DS_MAX_F64>;
+
+defm DS_WRITE_B8_D16_HI_vi : DS_Real_m0_vi<0x54, DS_WRITE_B8_D16_HI>;
+defm DS_WRITE_B16_D16_HI_vi: DS_Real_m0_vi<0x55, DS_WRITE_B16_D16_HI>;
+
+defm DS_READ_U8_D16_vi : DS_Real_m0_vi<0x56, DS_READ_U8_D16>;
+defm DS_READ_U8_D16_HI_vi : DS_Real_m0_vi<0x57, DS_READ_U8_D16_HI>;
+defm DS_READ_I8_D16_vi : DS_Real_m0_vi<0x58, DS_READ_I8_D16>;
+defm DS_READ_I8_D16_HI_vi : DS_Real_m0_vi<0x59, DS_READ_I8_D16_HI>;
+defm DS_READ_U16_D16_vi : DS_Real_m0_vi<0x5a, DS_READ_U16_D16>;
+defm DS_READ_U16_D16_HI_vi: DS_Real_m0_vi<0x5b, DS_READ_U16_D16_HI>;
+
+defm DS_ADD_RTN_U64_vi : DS_Real_vi<0x60, DS_ADD_RTN_U64>;
+defm DS_SUB_RTN_U64_vi : DS_Real_vi<0x61, DS_SUB_RTN_U64>;
+defm DS_RSUB_RTN_U64_vi : DS_Real_vi<0x62, DS_RSUB_RTN_U64>;
+defm DS_INC_RTN_U64_vi : DS_Real_vi<0x63, DS_INC_RTN_U64>;
+defm DS_DEC_RTN_U64_vi : DS_Real_vi<0x64, DS_DEC_RTN_U64>;
+defm DS_MIN_RTN_I64_vi : DS_Real_vi<0x65, DS_MIN_RTN_I64>;
+defm DS_MAX_RTN_I64_vi : DS_Real_vi<0x66, DS_MAX_RTN_I64>;
+defm DS_MIN_RTN_U64_vi : DS_Real_vi<0x67, DS_MIN_RTN_U64>;
+defm DS_MAX_RTN_U64_vi : DS_Real_vi<0x68, DS_MAX_RTN_U64>;
+defm DS_AND_RTN_B64_vi : DS_Real_vi<0x69, DS_AND_RTN_B64>;
+defm DS_OR_RTN_B64_vi : DS_Real_vi<0x6a, DS_OR_RTN_B64>;
+defm DS_XOR_RTN_B64_vi : DS_Real_vi<0x6b, DS_XOR_RTN_B64>;
+defm DS_MSKOR_RTN_B64_vi : DS_Real_vi<0x6c, DS_MSKOR_RTN_B64>;
+defm DS_WRXCHG_RTN_B64_vi : DS_Real_vi<0x6d, DS_WRXCHG_RTN_B64>;
+defm DS_WRXCHG2_RTN_B64_vi : DS_Real_vi<0x6e, DS_WRXCHG2_RTN_B64>;
+defm DS_WRXCHG2ST64_RTN_B64_vi : DS_Real_vi<0x6f, DS_WRXCHG2ST64_RTN_B64>;
+defm DS_CONDXCHG32_RTN_B64_vi : DS_Real_vi<0x7e, DS_CONDXCHG32_RTN_B64>;
+defm DS_GWS_SEMA_RELEASE_ALL_vi: DS_Real_m0_vi<0x98, DS_GWS_SEMA_RELEASE_ALL>;
+defm DS_CMPST_RTN_B64_vi : DS_Real_vi<0x70, DS_CMPST_RTN_B64>;
+defm DS_CMPST_RTN_F64_vi : DS_Real_vi<0x71, DS_CMPST_RTN_F64>;
+defm DS_MIN_RTN_F64_vi : DS_Real_vi<0x72, DS_MIN_RTN_F64>;
+defm DS_MAX_RTN_F64_vi : DS_Real_vi<0x73, DS_MAX_RTN_F64>;
+
+defm DS_READ_B64_vi : DS_Real_vi<0x76, DS_READ_B64>;
+defm DS_READ2_B64_vi : DS_Real_vi<0x77, DS_READ2_B64>;
+defm DS_READ2ST64_B64_vi : DS_Real_vi<0x78, DS_READ2ST64_B64>;
+
+defm DS_ADD_SRC2_U32_vi : DS_Real_m0_vi<0x80, DS_ADD_SRC2_U32>;
+defm DS_SUB_SRC2_U32_vi : DS_Real_m0_vi<0x81, DS_SUB_SRC2_U32>;
+defm DS_RSUB_SRC2_U32_vi : DS_Real_m0_vi<0x82, DS_RSUB_SRC2_U32>;
+defm DS_INC_SRC2_U32_vi : DS_Real_m0_vi<0x83, DS_INC_SRC2_U32>;
+defm DS_DEC_SRC2_U32_vi : DS_Real_m0_vi<0x84, DS_DEC_SRC2_U32>;
+defm DS_MIN_SRC2_I32_vi : DS_Real_m0_vi<0x85, DS_MIN_SRC2_I32>;
+defm DS_MAX_SRC2_I32_vi : DS_Real_m0_vi<0x86, DS_MAX_SRC2_I32>;
+defm DS_MIN_SRC2_U32_vi : DS_Real_m0_vi<0x87, DS_MIN_SRC2_U32>;
+defm DS_MAX_SRC2_U32_vi : DS_Real_m0_vi<0x88, DS_MAX_SRC2_U32>;
+defm DS_AND_SRC2_B32_vi : DS_Real_m0_vi<0x89, DS_AND_SRC2_B32>;
+defm DS_OR_SRC2_B32_vi : DS_Real_m0_vi<0x8a, DS_OR_SRC2_B32>;
+defm DS_XOR_SRC2_B32_vi : DS_Real_m0_vi<0x8b, DS_XOR_SRC2_B32>;
+defm DS_WRITE_SRC2_B32_vi : DS_Real_m0_vi<0x8d, DS_WRITE_SRC2_B32>;
+defm DS_MIN_SRC2_F32_vi : DS_Real_m0_vi<0x92, DS_MIN_SRC2_F32>;
+defm DS_MAX_SRC2_F32_vi : DS_Real_m0_vi<0x93, DS_MAX_SRC2_F32>;
+defm DS_ADD_SRC2_F32_vi : DS_Real_m0_vi<0x95, DS_ADD_SRC2_F32>;
+defm DS_ADD_SRC2_U64_vi : DS_Real_m0_vi<0xc0, DS_ADD_SRC2_U64>;
+defm DS_SUB_SRC2_U64_vi : DS_Real_m0_vi<0xc1, DS_SUB_SRC2_U64>;
+defm DS_RSUB_SRC2_U64_vi : DS_Real_m0_vi<0xc2, DS_RSUB_SRC2_U64>;
+defm DS_INC_SRC2_U64_vi : DS_Real_m0_vi<0xc3, DS_INC_SRC2_U64>;
+defm DS_DEC_SRC2_U64_vi : DS_Real_m0_vi<0xc4, DS_DEC_SRC2_U64>;
+defm DS_MIN_SRC2_I64_vi : DS_Real_m0_vi<0xc5, DS_MIN_SRC2_I64>;
+defm DS_MAX_SRC2_I64_vi : DS_Real_m0_vi<0xc6, DS_MAX_SRC2_I64>;
+defm DS_MIN_SRC2_U64_vi : DS_Real_m0_vi<0xc7, DS_MIN_SRC2_U64>;
+defm DS_MAX_SRC2_U64_vi : DS_Real_m0_vi<0xc8, DS_MAX_SRC2_U64>;
+defm DS_AND_SRC2_B64_vi : DS_Real_m0_vi<0xc9, DS_AND_SRC2_B64>;
+defm DS_OR_SRC2_B64_vi : DS_Real_m0_vi<0xca, DS_OR_SRC2_B64>;
+defm DS_XOR_SRC2_B64_vi : DS_Real_m0_vi<0xcb, DS_XOR_SRC2_B64>;
+defm DS_WRITE_SRC2_B64_vi : DS_Real_m0_vi<0xcd, DS_WRITE_SRC2_B64>;
+defm DS_MIN_SRC2_F64_vi : DS_Real_m0_vi<0xd2, DS_MIN_SRC2_F64>;
+defm DS_MAX_SRC2_F64_vi : DS_Real_m0_vi<0xd3, DS_MAX_SRC2_F64>;
+defm DS_WRITE_B96_vi : DS_Real_vi<0xde, DS_WRITE_B96>;
+defm DS_WRITE_B128_vi : DS_Real_vi<0xdf, DS_WRITE_B128>;
+defm DS_READ_B96_vi : DS_Real_vi<0xfe, DS_READ_B96>;
+defm DS_READ_B128_vi : DS_Real_vi<0xff, DS_READ_B128>;
// GFX90A+.
-def DS_ADD_F64_vi : DS_Real_vi<0x5c, DS_ADD_F64>;
-def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>;
+defm DS_ADD_F64_vi : DS_Real_m0_vi<0x5c, DS_ADD_F64>;
+defm DS_ADD_RTN_F64_vi: DS_Real_m0_vi<0x7c, DS_ADD_RTN_F64>;
// GFX942+.
-def DS_PK_ADD_F16_vi : DS_Real_vi<0x17, DS_PK_ADD_F16>;
-def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>;
-def DS_PK_ADD_BF16_vi : DS_Real_vi<0x18, DS_PK_ADD_BF16>;
-def DS_PK_ADD_RTN_BF16_vi : DS_Real_vi<0xb8, DS_PK_ADD_RTN_BF16>;
+defm DS_PK_ADD_F16_vi : DS_Real_m0_vi<0x17, DS_PK_ADD_F16>;
+defm DS_PK_ADD_RTN_F16_vi : DS_Real_m0_vi<0xb7, DS_PK_ADD_RTN_F16>;
+defm DS_PK_ADD_BF16_vi : DS_Real_m0_vi<0x18, DS_PK_ADD_BF16>;
+defm DS_PK_ADD_RTN_BF16_vi: DS_Real_m0_vi<0xb8, DS_PK_ADD_RTN_BF16>;
//===----------------------------------------------------------------------===//
// GFX950.
//===----------------------------------------------------------------------===//
-def DS_READ_B64_TR_B4_vi : DS_Real_vi<0x0e0, DS_READ_B64_TR_B4>;
-def DS_READ_B96_TR_B6_vi : DS_Real_vi<0x0e1, DS_READ_B96_TR_B6>;
-def DS_READ_B64_TR_B8_vi : DS_Real_vi<0x0e2, DS_READ_B64_TR_B8>;
-def DS_READ_B64_TR_B16_vi : DS_Real_vi<0x0e3, DS_READ_B64_TR_B16>;
+defm DS_READ_B64_TR_B4_vi : DS_Real_m0_vi<0x0e0, DS_READ_B64_TR_B4>;
+defm DS_READ_B96_TR_B6_vi : DS_Real_m0_vi<0x0e1, DS_READ_B96_TR_B6>;
+defm DS_READ_B64_TR_B8_vi : DS_Real_m0_vi<0x0e2, DS_READ_B64_TR_B8>;
+defm DS_READ_B64_TR_B16_vi: DS_Real_m0_vi<0x0e3, DS_READ_B64_TR_B16>;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 6a2beeed41df..6f6039bf4ec2 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -38,6 +38,7 @@
#include "llvm/Support/Compiler.h"
using namespace llvm;
+using namespace llvm::MCD;
#define DEBUG_TYPE "amdgpu-disassembler"
@@ -446,6 +447,14 @@ static DecodeStatus decodeVersionImm(MCInst &Inst, unsigned Imm,
#include "AMDGPUGenDisassemblerTables.inc"
+namespace {
+// Define bitwidths for various types used to instantiate the decoder.
+// Each specialization below pairs an instruction-word container type with
+// the number of encoding bits it carries.
+template <> constexpr uint32_t InsnBitWidth<uint32_t> = 32;
+template <> constexpr uint32_t InsnBitWidth<uint64_t> = 64;
+template <> constexpr uint32_t InsnBitWidth<std::bitset<96>> = 96; // type returned by eat12Bytes
+template <> constexpr uint32_t InsnBitWidth<std::bitset<128>> = 128; // type returned by eat16Bytes
+} // namespace
+
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
@@ -498,26 +507,24 @@ template <typename T> static inline T eatBytes(ArrayRef<uint8_t>& Bytes) {
return Res;
}
-static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) {
+static inline std::bitset<96> eat12Bytes(ArrayRef<uint8_t> &Bytes) { // reads 12 bytes from the front of Bytes, advances Bytes past them, returns them as one 96-bit value
+ using namespace llvm::support::endian;
assert(Bytes.size() >= 12);
- uint64_t Lo =
- support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
+ std::bitset<96> Lo(read<uint64_t, endianness::little>(Bytes.data())); // first 8 bytes, little-endian, become bits 0..63
Bytes = Bytes.slice(8);
- uint64_t Hi =
- support::endian::read<uint32_t, llvm::endianness::little>(Bytes.data());
+ std::bitset<96> Hi(read<uint32_t, endianness::little>(Bytes.data())); // following 4 bytes, little-endian, still in the low bits here
Bytes = Bytes.slice(4);
- return DecoderUInt128(Lo, Hi);
+ return (Hi << 64) | Lo; // move the 32-bit word to bits 64..95 and merge with the low 64 bits
}
-static inline DecoderUInt128 eat16Bytes(ArrayRef<uint8_t> &Bytes) {
+static inline std::bitset<128> eat16Bytes(ArrayRef<uint8_t> &Bytes) { // reads 16 bytes from the front of Bytes, advances Bytes past them, returns them as one 128-bit value
+ using namespace llvm::support::endian;
assert(Bytes.size() >= 16);
- uint64_t Lo =
- support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
+ std::bitset<128> Lo(read<uint64_t, endianness::little>(Bytes.data())); // first 8 bytes, little-endian, become bits 0..63
Bytes = Bytes.slice(8);
- uint64_t Hi =
- support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
+ std::bitset<128> Hi(read<uint64_t, endianness::little>(Bytes.data())); // next 8 bytes, little-endian, still in the low bits here
Bytes = Bytes.slice(8);
- return DecoderUInt128(Lo, Hi);
+ return (Hi << 64) | Lo; // move the second quadword to bits 64..127 and merge with the low 64 bits
}
void AMDGPUDisassembler::decodeImmOperands(MCInst &MI,
@@ -600,14 +607,14 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2
// encodings
if (isGFX1250() && Bytes.size() >= 16) {
- DecoderUInt128 DecW = eat16Bytes(Bytes);
+ std::bitset<128> DecW = eat16Bytes(Bytes);
if (tryDecodeInst(DecoderTableGFX1250128, MI, DecW, Address, CS))
break;
Bytes = Bytes_.slice(0, MaxInstBytesNum);
}
if (isGFX11Plus() && Bytes.size() >= 12) {
- DecoderUInt128 DecW = eat12Bytes(Bytes);
+ std::bitset<96> DecW = eat12Bytes(Bytes);
if (isGFX11() &&
tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI,
@@ -642,7 +649,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
} else if (Bytes.size() >= 16 &&
STI.hasFeature(AMDGPU::FeatureGFX950Insts)) {
- DecoderUInt128 DecW = eat16Bytes(Bytes);
+ std::bitset<128> DecW = eat16Bytes(Bytes);
if (tryDecodeInst(DecoderTableGFX940128, MI, DecW, Address, CS))
break;
@@ -836,6 +843,18 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
}
+ // Validate buffer instruction offsets for GFX12+ - must not be negative.
+ if (isGFX12Plus() && isBufferInstruction(MI)) {
+ int OffsetIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::offset);
+ if (OffsetIdx != -1) {
+ uint32_t Imm = MI.getOperand(OffsetIdx).getImm();
+ int64_t SignedOffset = SignExtend64<24>(Imm);
+ if (SignedOffset < 0)
+ return MCDisassembler::Fail;
+ }
+ }
+
if (MCII->get(MI.getOpcode()).TSFlags &
(SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) {
int SWZOpIdx =
@@ -1216,6 +1235,26 @@ void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
}
}
+// Given a wide tuple \p Reg check if it will overflow 256 registers.
+// \p RC is the register class whose size determines the tuple width and
+// \p MRI is used to look up sub-registers and the 32-bit vector classes.
+// \returns \p Reg on success or NoRegister otherwise.
+static unsigned CheckVGPROverflow(unsigned Reg, const MCRegisterClass &RC,
+ const MCRegisterInfo &MRI) {
+ unsigned NumRegs = RC.getSizeInBits() / 32; // tuple width counted in 32-bit registers
+ MCRegister Sub0 = MRI.getSubReg(Reg, AMDGPU::sub0);
+ if (!Sub0) // no sub0 sub-register was found: return Reg unchanged
+ return Reg;
+
+ MCRegister BaseReg; // first register of whichever 32-bit vector class holds Sub0
+ if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(Sub0))
+ BaseReg = AMDGPU::VGPR0;
+ else if (MRI.getRegClass(AMDGPU::AGPR_32RegClassID).contains(Sub0))
+ BaseReg = AMDGPU::AGPR0;
+
+ assert(BaseReg && "Only vector registers expected");
+
+ return (Sub0 - BaseReg + NumRegs <= 256) ? Reg : AMDGPU::NoRegister; // NOTE(review): presumably relies on VGPRn/AGPRn enum values being consecutive so the difference is sub0's index - confirm
+}
+
// Note that before gfx10, the MIMG encoding provided no information about
// VADDR size. Consequently, decoded instructions always show address as if it
// has 1 dword, which could be not really so.
@@ -1320,8 +1359,9 @@ void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
MCRegister VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0);
Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0;
- NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0,
- &MRI.getRegClass(DataRCID));
+ const MCRegisterClass &NewRC = MRI.getRegClass(DataRCID);
+ NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0, &NewRC);
+ NewVdata = CheckVGPROverflow(NewVdata, NewRC, MRI);
if (!NewVdata) {
// It's possible to encode this such that the low register + enabled
// components exceeds the register count.
@@ -1340,8 +1380,9 @@ void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
VAddrSA = VAddrSubSA ? VAddrSubSA : VAddrSA;
auto AddrRCID = MCII->get(NewOpcode).operands()[VAddrSAIdx].RegClass;
- NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0,
- &MRI.getRegClass(AddrRCID));
+ const MCRegisterClass &NewRC = MRI.getRegClass(AddrRCID);
+ NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0, &NewRC);
+ NewVAddrSA = CheckVGPROverflow(NewVAddrSA, NewRC, MRI);
if (!NewVAddrSA)
return;
}
@@ -2598,9 +2639,6 @@ Expected<bool> AMDGPUDisassembler::decodeKernelDescriptorDirective(
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
- if (isGFX1250())
- PRINT_DIRECTIVE(".amdhsa_uses_cu_stores",
- KERNEL_CODE_PROPERTY_USES_CU_STORES);
if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0)
return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED0,
@@ -2743,6 +2781,20 @@ const MCExpr *AMDGPUDisassembler::createConstantSymbolExpr(StringRef Id,
return MCSymbolRefExpr::create(Sym, Ctx);
}
+bool AMDGPUDisassembler::isBufferInstruction(const MCInst &MI) const { // classifies MI by its opcode only; operands are not inspected
+ const uint64_t TSFlags = MCII->get(MI.getOpcode()).TSFlags; // TSFlags field of the opcode's instruction description
+
+ // Check for MUBUF and MTBUF instructions: either flag being set is enough.
+ if (TSFlags & (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF))
+ return true;
+
+ // Check for SMEM buffer instructions (S_BUFFER_* instructions): the SMRD
+ // flag must be set and getSMEMIsBuffer must also accept the opcode.
+ if ((TSFlags & SIInstrFlags::SMRD) && AMDGPU::getSMEMIsBuffer(MI.getOpcode()))
+ return true;
+
+ return false; // neither MUBUF/MTBUF nor a buffer SMEM opcode
+}
+
//===----------------------------------------------------------------------===//
// AMDGPUSymbolizer
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index f4d164bf10c3..c1131c2936fc 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -32,44 +32,6 @@ class MCOperand;
class MCSubtargetInfo;
class Twine;
-// Exposes an interface expected by autogenerated code in
-// FixedLenDecoderEmitter
-class DecoderUInt128 {
-private:
- uint64_t Lo = 0;
- uint64_t Hi = 0;
-
-public:
- DecoderUInt128() = default;
- DecoderUInt128(uint64_t Lo, uint64_t Hi = 0) : Lo(Lo), Hi(Hi) {}
- operator bool() const { return Lo || Hi; }
- uint64_t extractBitsAsZExtValue(unsigned NumBits,
- unsigned BitPosition) const {
- assert(NumBits && NumBits <= 64);
- assert(BitPosition < 128);
- uint64_t Val;
- if (BitPosition < 64)
- Val = Lo >> BitPosition | Hi << 1 << (63 - BitPosition);
- else
- Val = Hi >> (BitPosition - 64);
- return Val & ((uint64_t(2) << (NumBits - 1)) - 1);
- }
- DecoderUInt128 operator&(const DecoderUInt128 &RHS) const {
- return DecoderUInt128(Lo & RHS.Lo, Hi & RHS.Hi);
- }
- DecoderUInt128 operator&(const uint64_t &RHS) const {
- return *this & DecoderUInt128(RHS);
- }
- DecoderUInt128 operator~() const { return DecoderUInt128(~Lo, ~Hi); }
- bool operator==(const DecoderUInt128 &RHS) {
- return Lo == RHS.Lo && Hi == RHS.Hi;
- }
- bool operator!=(const DecoderUInt128 &RHS) {
- return Lo != RHS.Lo || Hi != RHS.Hi;
- }
- bool operator!=(const int &RHS) { return *this != DecoderUInt128(RHS); }
-};
-
//===----------------------------------------------------------------------===//
// AMDGPUDisassembler
//===----------------------------------------------------------------------===//
@@ -223,6 +185,9 @@ public:
bool hasKernargPreload() const;
bool isMacDPP(MCInst &MI) const;
+
+ /// Check if the instruction is a buffer operation (MUBUF, MTBUF, or S_BUFFER)
+ bool isBufferInstruction(const MCInst &MI) const;
};
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index 280def5440c8..dadc7dcd7054 100644
--- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -626,7 +626,6 @@ class R600_LDS_1A <bits<6> lds_op, string name, list<dag> pattern> : R600_LDS <
let usesCustomInserter = 1;
let LDS_1A = 1;
- let DisableEncoding = "$dst";
}
class R600_LDS_1A1D <bits<6> lds_op, dag outs, string name, list<dag> pattern,
@@ -658,7 +657,6 @@ class R600_LDS_1A1D_RET <bits<6> lds_op, string name, list<dag> pattern> :
let BaseOp = name;
let usesCustomInserter = 1;
- let DisableEncoding = "$dst";
}
class R600_LDS_1A2D <bits<6> lds_op, dag outs, string name, list<dag> pattern,
@@ -688,7 +686,6 @@ class R600_LDS_1A2D_RET <bits<6> lds_op, string name, list<dag> pattern> :
let BaseOp = name;
let usesCustomInserter = 1;
- let DisableEncoding = "$dst";
}
def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index f5d438436b29..a1306565bbe2 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -12,9 +12,11 @@ let WantsRoot = true in {
def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [], -10>;
def GlobalSAddrNoIOffset : ComplexPattern<iPTR, 3, "SelectGlobalSAddrNoIOffset", [], [], -3>;
+ def GlobalSAddrNoIOffsetM0 : ComplexPattern<iPTR, 3, "SelectGlobalSAddrNoIOffsetM0", [], [], -3>;
def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>;
def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>;
def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>;
+ def GlobalSAddrCPolM0 : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPolM0", [], [], -10>;
def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>;
def ScratchSVAddr : ComplexPattern<iPTR, 4, "SelectScratchSVAddr", [], [], -10>;
}
@@ -135,7 +137,18 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
// unsigned for flat accesses.
bits<13> offset;
// GFX90A+ only: instruction uses AccVGPR for data
- bits<1> acc = !if(ps.has_vdst, vdst{9}, !if(ps.has_data, vdata{9}, 0));
+ defvar DstOpIsAV = !if(ps.has_vdst,
+ VDstOperandIsAV<ps.OutOperandList>.ret, 0);
+ defvar DstOpIsAGPR = !if(ps.has_vdst,
+ VDstOperandIsAGPR<ps.OutOperandList>.ret, 0);
+ defvar DataOpIsAV = !if(ps.has_data,
+ VDataOperandIsAV<ps.InOperandList>.ret, 0);
+ defvar DataOpIsAGPR = !if(ps.has_data,
+ VDataOperandIsAGPR<ps.InOperandList>.ret, 0);
+
+ bits<1> acc = !if(ps.has_vdst,
+ !if(DstOpIsAV, vdst{9}, DstOpIsAGPR),
+ !if(DataOpIsAV, vdata{9}, DataOpIsAGPR));
// We don't use tfe right now, and it was removed in gfx9.
bits<1> tfe = 0;
@@ -214,11 +227,10 @@ class GlobalSaddrTable <bit is_saddr, string Name = ""> {
// same encoding value as exec_hi, so it isn't possible to use that if
// saddr is 32-bit (which isn't handled here yet).
class FLAT_Load_Pseudo<
- string opName, RegisterClass regClass, bit HasTiedOutput = 0,
+ string opName, RegisterOperand vdata_op, bit HasTiedOutput = 0,
bit HasSaddr = 0, bit EnableSaddr = 0>
: FLAT_Pseudo<opName, (outs), (ins), ""> {
- defvar vdata_op = getLdStRegisterOperand<regClass>.ret;
let OutOperandList = (outs vdata_op:$vdst);
let InOperandList = !con(
!if(EnableSaddr,
@@ -239,10 +251,9 @@ class FLAT_Load_Pseudo<
let enabled_saddr = EnableSaddr;
let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
- let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
}
-multiclass FLAT_Flat_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
+multiclass FLAT_Flat_Load_Pseudo<string opName, RegisterOperand regClass = AVLdSt_32, bit HasTiedInput = 0> {
def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput>,
GlobalSaddrTable<0, opName>;
let OtherPredicates = [HasFlatGVSMode] in
@@ -251,19 +262,19 @@ multiclass FLAT_Flat_Load_Pseudo<string opName, RegisterClass regClass, bit HasT
}
multiclass FLAT_Flat_Load_Pseudo_t16<string opName> {
- defm "" : FLAT_Flat_Load_Pseudo<opName, VGPR_32, 1>;
+ defm "" : FLAT_Flat_Load_Pseudo<opName, AVLdSt_32, 1>;
let True16Predicate = UseRealTrue16Insts in
- defm _t16 : FLAT_Flat_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>;
+ defm _t16 : FLAT_Flat_Load_Pseudo<opName#"_t16", VGPROp_16>, True16D16Table<NAME#"_HI", NAME>;
}
-class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
+class FLAT_Store_Pseudo <string opName, RegisterOperand vdataClass,
bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
opName,
(outs),
!con(
!if(EnableSaddr,
- (ins VGPR_32:$vaddr, getLdStRegisterOperand<vdataClass>.ret:$vdata, SReg_64_XEXEC_XNULL:$saddr),
- (ins VReg_64:$vaddr, getLdStRegisterOperand<vdataClass>.ret:$vdata)),
+ (ins VGPR_32:$vaddr, vdataClass:$vdata, SReg_64_XEXEC_XNULL:$saddr),
+ (ins VReg_64:$vaddr, vdataClass:$vdata)),
(ins flat_offset:$offset, CPol_0:$cpol)),
" $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$cpol"> {
let mayLoad = 0;
@@ -273,7 +284,7 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
let enabled_saddr = EnableSaddr;
}
-multiclass FLAT_Flat_Store_Pseudo<string opName, RegisterClass regClass> {
+multiclass FLAT_Flat_Store_Pseudo<string opName, RegisterOperand regClass = AVLdSt_32> {
def "" : FLAT_Store_Pseudo<opName, regClass>,
GlobalSaddrTable<0, opName>;
let OtherPredicates = [HasFlatGVSMode] in
@@ -282,21 +293,22 @@ multiclass FLAT_Flat_Store_Pseudo<string opName, RegisterClass regClass> {
}
multiclass FLAT_Flat_Store_Pseudo_t16<string opName> {
- defm "" : FLAT_Flat_Store_Pseudo<opName, VGPR_32>;
+ defm "" : FLAT_Flat_Store_Pseudo<opName, AVLdSt_32>;
defvar Name16 = opName#"_t16";
let OtherPredicates = [HasFlatGVSMode, HasTrue16BitInsts] in {
- def _t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1>,
+ def _t16 : FLAT_Store_Pseudo<Name16, VGPROp_16, 1>,
GlobalSaddrTable<0, Name16>,
True16D16Table<NAME#"_D16_HI", NAME>;
- def _SADDR_t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1, 1>,
+ def _SADDR_t16 : FLAT_Store_Pseudo<Name16, VGPROp_16, 1, 1>,
GlobalSaddrTable<1, Name16>,
True16D16Table<NAME#"_D16_HI_SADDR", NAME#"_SADDR">;
}
}
-multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
- let is_flat_global = 1 in {
+multiclass FLAT_Global_Load_Pseudo<string opName, RegisterOperand regClass = AVLdSt_32,
+ bit HasTiedInput = 0> {
+ let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1>,
GlobalSaddrTable<0, opName>;
def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>,
@@ -305,21 +317,21 @@ multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit Ha
}
multiclass FLAT_Global_Load_Pseudo_t16<string opName> {
- defm "" : FLAT_Global_Load_Pseudo<opName, VGPR_32, 1>;
+ defm "" : FLAT_Global_Load_Pseudo<opName, AVLdSt_32, 1>;
defvar Name16 = opName#"_t16";
let OtherPredicates = [HasTrue16BitInsts],
SubtargetPredicate = HasFlatGlobalInsts, is_flat_global = 1 in {
- def _t16 : FLAT_Load_Pseudo<Name16, VGPR_16, 0, 1>,
+ def _t16 : FLAT_Load_Pseudo<Name16, VGPROp_16, 0, 1>,
GlobalSaddrTable<0, Name16>,
True16D16Table<NAME#"_HI", NAME>;
- def _SADDR_t16 : FLAT_Load_Pseudo<Name16, VGPR_16, 0, 1, 1>,
+ def _SADDR_t16 : FLAT_Load_Pseudo<Name16, VGPROp_16, 0, 1, 1>,
GlobalSaddrTable<1, Name16>,
True16D16Table<NAME#"_HI_SADDR", NAME#"_SADDR">;
}
}
-class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterClass regClass,
+class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterOperand regClass,
bit HasTiedOutput = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
opName,
(outs regClass:$vdst),
@@ -335,10 +347,9 @@ class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterClass regClass,
let enabled_saddr = EnableSaddr;
let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
- let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
}
-multiclass FLAT_Global_Load_AddTid_Pseudo<string opName, RegisterClass regClass,
+multiclass FLAT_Global_Load_AddTid_Pseudo<string opName, RegisterOperand regClass,
bit HasTiedOutput = 0> {
def "" : FLAT_Global_Load_AddTid_Pseudo<opName, regClass, HasTiedOutput>,
GlobalSaddrTable<0, opName>;
@@ -346,8 +357,8 @@ multiclass FLAT_Global_Load_AddTid_Pseudo<string opName, RegisterClass regClass,
GlobalSaddrTable<1, opName>;
}
-multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
- let is_flat_global = 1 in {
+multiclass FLAT_Global_Store_Pseudo<string opName, RegisterOperand regClass = AVLdSt_32> {
+ let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
def "" : FLAT_Store_Pseudo<opName, regClass, 1>,
GlobalSaddrTable<0, opName>;
def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>,
@@ -356,15 +367,15 @@ multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
}
multiclass FLAT_Global_Store_Pseudo_t16<string opName> {
- defm "" : FLAT_Global_Store_Pseudo<opName, VGPR_32>;
+ defm "" : FLAT_Global_Store_Pseudo<opName, AVLdSt_32>;
defvar Name16 = opName#"_t16";
let OtherPredicates = [HasTrue16BitInsts],
SubtargetPredicate = HasFlatGlobalInsts, is_flat_global = 1 in {
- def _t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1>,
+ def _t16 : FLAT_Store_Pseudo<Name16, VGPROp_16, 1>,
GlobalSaddrTable<0, Name16>,
True16D16Table<NAME#"_D16_HI", NAME>;
- def _SADDR_t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1, 1>,
+ def _SADDR_t16 : FLAT_Store_Pseudo<Name16, VGPROp_16, 1, 1>,
GlobalSaddrTable<1, Name16>,
True16D16Table<NAME#"_D16_HI_SADDR", NAME#"_SADDR">;
}
@@ -435,7 +446,7 @@ multiclass FLAT_Global_STORE_LDS_Pseudo<string opName> {
GlobalSaddrTable<1, opName>;
}
-class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterClass vdataClass,
+class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterOperand vdataClass,
bit EnableSaddr = 0> : FLAT_Pseudo<
opName,
(outs),
@@ -451,7 +462,7 @@ class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterClass vdataClass,
let enabled_saddr = EnableSaddr;
}
-multiclass FLAT_Global_Store_AddTid_Pseudo<string opName, RegisterClass regClass> {
+multiclass FLAT_Global_Store_AddTid_Pseudo<string opName, RegisterOperand regClass> {
def "" : FLAT_Global_Store_AddTid_Pseudo<opName, regClass>,
GlobalSaddrTable<0, opName>;
def _SADDR : FLAT_Global_Store_AddTid_Pseudo<opName, regClass, 1>,
@@ -539,14 +550,14 @@ class FlatScratchInst <string sv_op, string mode> {
string Mode = mode;
}
-class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
+class FLAT_Scratch_Load_Pseudo <string opName, RegisterOperand regClass = AVLdSt_32,
bit HasTiedOutput = 0,
bit EnableSaddr = 0,
bit EnableSVE = 0,
bit EnableVaddr = !or(EnableSVE, !not(EnableSaddr))>
: FLAT_Pseudo<
opName,
- (outs getLdStRegisterOperand<regClass>.ret:$vdst),
+ (outs regClass:$vdst),
!con(
!if(EnableSVE,
(ins VGPR_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset),
@@ -555,7 +566,7 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
!if(EnableVaddr,
(ins VGPR_32:$vaddr, flat_offset:$offset),
(ins flat_offset:$offset)))),
- !if(HasTiedOutput, (ins CPol:$cpol, getLdStRegisterOperand<regClass>.ret:$vdst_in),
+ !if(HasTiedOutput, (ins CPol:$cpol, regClass:$vdst_in),
(ins CPol_0:$cpol))),
" $vdst, "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
let is_flat_scratch = 1;
@@ -568,13 +579,11 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
let sve = EnableVaddr;
let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
- let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
}
-class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit EnableSaddr = 0,
+class FLAT_Scratch_Store_Pseudo <string opName, RegisterOperand vdata_op, bit EnableSaddr = 0,
bit EnableSVE = 0,
- bit EnableVaddr = !or(EnableSVE, !not(EnableSaddr)),
- RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret> : FLAT_Pseudo<
+ bit EnableVaddr = !or(EnableSVE, !not(EnableSaddr))> : FLAT_Pseudo<
opName,
(outs),
!if(EnableSVE,
@@ -596,7 +605,8 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit En
let sve = EnableVaddr;
}
-multiclass FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedOutput = 0> {
+multiclass FLAT_Scratch_Load_Pseudo<string opName, RegisterOperand regClass = AVLdSt_32,
+ bit HasTiedOutput = 0> {
def "" : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput>,
FlatScratchInst<opName, "SV">;
def _SADDR : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 1>,
@@ -612,29 +622,29 @@ multiclass FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass, bit H
}
multiclass FLAT_Scratch_Load_Pseudo_t16<string opName> {
- defm "" : FLAT_Scratch_Load_Pseudo<opName, VGPR_32, 1>;
+ defm "" : FLAT_Scratch_Load_Pseudo<opName, AVLdSt_32, 1>;
defvar Name16 = opName#"_t16";
let OtherPredicates = [HasTrue16BitInsts], is_flat_scratch = 1 in {
- def _t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPR_16, 0>,
+ def _t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPROp_16, 0>,
FlatScratchInst<Name16, "SV">,
True16D16Table<NAME#"_HI", NAME>;
- def _SADDR_t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPR_16, 0, 1>,
+ def _SADDR_t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPROp_16, 0, 1>,
FlatScratchInst<Name16, "SS">,
True16D16Table<NAME#"_HI_SADDR", NAME#"_SADDR">;
let SubtargetPredicate = HasFlatScratchSVSMode in
- def _SVS_t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPR_16, 0, 1, 1>,
+ def _SVS_t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPROp_16, 0, 1, 1>,
FlatScratchInst<Name16, "SVS">,
True16D16Table<NAME#"_HI_SVS", NAME#"_SVS">;
let SubtargetPredicate = HasFlatScratchSTMode in
- def _ST_t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPR_16, 0, 0, 0, 0>,
+ def _ST_t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPROp_16, 0, 0, 0, 0>,
FlatScratchInst<Name16, "ST">,
True16D16Table<NAME#"_HI_ST", NAME#"_ST">;
}
}
-multiclass FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> {
+multiclass FLAT_Scratch_Store_Pseudo<string opName, RegisterOperand regClass = AVLdSt_32> {
def "" : FLAT_Scratch_Store_Pseudo<opName, regClass>,
FlatScratchInst<opName, "SV">;
def _SADDR : FLAT_Scratch_Store_Pseudo<opName, regClass, 1>,
@@ -650,24 +660,24 @@ multiclass FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> {
}
multiclass FLAT_Scratch_Store_Pseudo_t16<string opName> {
- defm "" : FLAT_Scratch_Store_Pseudo<opName, VGPR_32>;
+ defm "" : FLAT_Scratch_Store_Pseudo<opName, AVLdSt_32>;
defvar Name16 = opName#"_t16";
let OtherPredicates = [HasTrue16BitInsts], is_flat_scratch = 1 in {
- def _t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPR_16>,
+ def _t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPROp_16>,
FlatScratchInst<Name16, "SV">,
True16D16Table<NAME#"_D16_HI", NAME>;
- def _SADDR_t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPR_16, 1>,
+ def _SADDR_t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPROp_16, 1>,
FlatScratchInst<Name16, "SS">,
True16D16Table<NAME#"_D16_HI_SADDR", NAME#"_SADDR">;
let SubtargetPredicate = HasFlatScratchSVSMode in
- def _SVS_t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPR_16, 1, 1>,
+ def _SVS_t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPROp_16, 1, 1>,
FlatScratchInst<Name16, "SVS">,
True16D16Table<NAME#"_D16_HI_SVS", NAME#"_SVS">;
let SubtargetPredicate = HasFlatScratchSTMode in
- def _ST_t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPR_16, 0, 0, 0>,
+ def _ST_t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPROp_16, 0, 0, 0>,
FlatScratchInst<Name16, "ST">,
True16D16Table<NAME#"_D16_HI_ST", NAME#"_ST">;
}
@@ -741,11 +751,10 @@ class FLAT_AtomicRet_Pseudo<string opName, dag outs, dag ins,
multiclass FLAT_Atomic_Pseudo_NO_RTN<
string opName,
- RegisterClass vdst_rc,
+ RegisterOperand vdst_op,
ValueType vt,
ValueType data_vt = vt,
- RegisterClass data_rc = vdst_rc,
- RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
+ RegisterOperand data_op = vdst_op> {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
(ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol),
@@ -770,15 +779,17 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN<
multiclass FLAT_Atomic_Pseudo_RTN<
string opName,
- RegisterClass vdst_rc,
+ RegisterOperand vdst_op,
ValueType vt,
ValueType data_vt = vt,
- RegisterClass data_rc = vdst_rc,
- RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret,
- RegisterOperand vdst_op = getLdStRegisterOperand<vdst_rc>.ret> {
+ RegisterOperand data_op = vdst_op> {
+
+ defvar vdst_op_vgpr = getEquivalentVGPROperand<vdst_op>.ret;
+ defvar data_op_vgpr = getEquivalentVGPROperand<data_op>.ret;
+
def _RTN : FLAT_AtomicRet_Pseudo <opName,
- (outs vdst_op:$vdst),
- (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
+ (outs vdst_op_vgpr:$vdst),
+ (ins VReg_64:$vaddr, data_op_vgpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
" $vdst, $vaddr, $vdata$offset$cpol">,
GlobalSaddrTable<0, opName#"_rtn"> {
let FPAtomic = data_vt.isFP;
@@ -786,8 +797,8 @@ multiclass FLAT_Atomic_Pseudo_RTN<
}
def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
- (outs vdst_op:$vdst),
- (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol),
+ (outs vdst_op_vgpr:$vdst),
+ (ins VGPR_32:$vaddr, data_op_vgpr:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol),
" $vdst, $vaddr, $vdata, $saddr$offset$cpol">,
GlobalSaddrTable<1, opName#"_rtn"> {
let OtherPredicates = [HasFlatGVSMode];
@@ -797,26 +808,37 @@ multiclass FLAT_Atomic_Pseudo_RTN<
let FPAtomic = data_vt.isFP;
let AddedComplexity = -1; // Prefer global atomics if available
}
+
+ defvar vdst_op_agpr = getEquivalentAGPROperand<vdst_op>.ret;
+ defvar data_op_agpr = getEquivalentAGPROperand<data_op>.ret;
+
+ def _RTN_agpr : FLAT_AtomicRet_Pseudo <opName,
+ (outs vdst_op_agpr:$vdst),
+ (ins VReg_64:$vaddr, data_op_agpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
+ " $vdst, $vaddr, $vdata$offset$cpol">,
+ GlobalSaddrTable<0, opName#"_rtn_agpr"> {
+ let FPAtomic = data_vt.isFP;
+ let AddedComplexity = -1; // Prefer global atomics if available
+ }
+ // No saddr agpr form. HasFlatGVSMode targets do not have AGPRs.
}
multiclass FLAT_Atomic_Pseudo<
string opName,
- RegisterClass vdst_rc,
+ RegisterOperand vdst_op,
ValueType vt,
ValueType data_vt = vt,
- RegisterClass data_rc = vdst_rc,
- RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
- defm "" : FLAT_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, data_vt, data_rc, data_op>;
- defm "" : FLAT_Atomic_Pseudo_RTN<opName, vdst_rc, vt, data_vt, data_rc, data_op>;
+ RegisterOperand data_op = vdst_op> {
+ defm "" : FLAT_Atomic_Pseudo_NO_RTN<opName, vdst_op, vt, data_vt, data_op>;
+ defm "" : FLAT_Atomic_Pseudo_RTN<opName, vdst_op, vt, data_vt, data_op>;
}
multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
string opName,
- RegisterClass vdst_rc,
+ RegisterOperand vdst_op,
ValueType vt,
ValueType data_vt = vt,
- RegisterClass data_rc = vdst_rc,
- RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
+ RegisterOperand data_op = vdst_op> {
let is_flat_global = 1 in {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
@@ -842,17 +864,18 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
multiclass FLAT_Global_Atomic_Pseudo_RTN<
string opName,
- RegisterClass vdst_rc,
+ RegisterOperand vdst_op,
ValueType vt,
ValueType data_vt = vt,
- RegisterClass data_rc = vdst_rc,
- RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret,
- RegisterOperand vdst_op = getLdStRegisterOperand<vdst_rc>.ret> {
+ RegisterOperand data_op = vdst_op> {
+
+ defvar vdst_op_vgpr = getEquivalentVGPROperand<vdst_op>.ret;
+ defvar data_op_vgpr = getEquivalentVGPROperand<data_op>.ret;
let is_flat_global = 1 in {
def _RTN : FLAT_AtomicRet_Pseudo <opName,
- (outs vdst_op:$vdst),
- (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
+ (outs vdst_op_vgpr:$vdst),
+ (ins VReg_64:$vaddr, data_op_vgpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
" $vdst, $vaddr, $vdata, off$offset$cpol">,
GlobalSaddrTable<0, opName#"_rtn"> {
let has_saddr = 1;
@@ -860,23 +883,47 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
}
def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
- (outs vdst_op:$vdst),
- (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64_XEXEC_XNULL:$saddr, flat_offset:$offset, CPol_GLC1:$cpol),
+ (outs vdst_op_vgpr:$vdst),
+ (ins VGPR_32:$vaddr, data_op_vgpr:$vdata, SReg_64_XEXEC_XNULL:$saddr, flat_offset:$offset, CPol_GLC1:$cpol),
" $vdst, $vaddr, $vdata, $saddr$offset$cpol">,
GlobalSaddrTable<1, opName#"_rtn"> {
let has_saddr = 1;
let enabled_saddr = 1;
let FPAtomic = data_vt.isFP;
}
+
+ defvar vdst_op_agpr = getEquivalentAGPROperand<vdst_op>.ret;
+ defvar data_op_agpr = getEquivalentAGPROperand<data_op>.ret;
+
+ let SubtargetPredicate = isGFX90APlus in {
+ def _RTN_agpr : FLAT_AtomicRet_Pseudo <opName,
+ (outs vdst_op_agpr:$vdst),
+ (ins VReg_64:$vaddr, data_op_agpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
+ " $vdst, $vaddr, $vdata, off$offset$cpol">,
+ GlobalSaddrTable<0, opName#"_rtn_agpr"> {
+ let has_saddr = 1;
+ let FPAtomic = data_vt.isFP;
+ }
+
+ def _SADDR_RTN_agpr : FLAT_AtomicRet_Pseudo <opName,
+ (outs vdst_op_agpr:$vdst),
+ (ins VGPR_32:$vaddr, data_op_agpr:$vdata, SReg_64_XEXEC_XNULL:$saddr, flat_offset:$offset, CPol_GLC1:$cpol),
+ " $vdst, $vaddr, $vdata, $saddr$offset$cpol">,
+ GlobalSaddrTable<1, opName#"_rtn_agpr"> {
+ let has_saddr = 1;
+ let enabled_saddr = 1;
+ let FPAtomic = data_vt.isFP;
+ }
+ }
}
}
multiclass FLAT_Global_Atomic_Pseudo<
string opName,
- RegisterClass vdst_rc,
+ RegisterOperand vdst_rc,
ValueType vt,
ValueType data_vt = vt,
- RegisterClass data_rc = vdst_rc> {
+ RegisterOperand data_rc = vdst_rc> {
defm "" : FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, data_vt, data_rc>;
defm "" : FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, data_vt, data_rc>;
}
@@ -885,119 +932,119 @@ multiclass FLAT_Global_Atomic_Pseudo<
// Flat Instructions
//===----------------------------------------------------------------------===//
-defm FLAT_LOAD_UBYTE : FLAT_Flat_Load_Pseudo <"flat_load_ubyte", VGPR_32>;
-defm FLAT_LOAD_SBYTE : FLAT_Flat_Load_Pseudo <"flat_load_sbyte", VGPR_32>;
-defm FLAT_LOAD_USHORT : FLAT_Flat_Load_Pseudo <"flat_load_ushort", VGPR_32>;
-defm FLAT_LOAD_SSHORT : FLAT_Flat_Load_Pseudo <"flat_load_sshort", VGPR_32>;
-defm FLAT_LOAD_DWORD : FLAT_Flat_Load_Pseudo <"flat_load_dword", VGPR_32>;
-defm FLAT_LOAD_DWORDX2 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx2", VReg_64>;
-defm FLAT_LOAD_DWORDX4 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx4", VReg_128>;
-defm FLAT_LOAD_DWORDX3 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx3", VReg_96>;
+defm FLAT_LOAD_UBYTE : FLAT_Flat_Load_Pseudo <"flat_load_ubyte">;
+defm FLAT_LOAD_SBYTE : FLAT_Flat_Load_Pseudo <"flat_load_sbyte">;
+defm FLAT_LOAD_USHORT : FLAT_Flat_Load_Pseudo <"flat_load_ushort">;
+defm FLAT_LOAD_SSHORT : FLAT_Flat_Load_Pseudo <"flat_load_sshort">;
+defm FLAT_LOAD_DWORD : FLAT_Flat_Load_Pseudo <"flat_load_dword">;
+defm FLAT_LOAD_DWORDX2 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx2", AVLdSt_64>;
+defm FLAT_LOAD_DWORDX4 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx4", AVLdSt_128>;
+defm FLAT_LOAD_DWORDX3 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx3", AVLdSt_96>;
-defm FLAT_STORE_DWORD : FLAT_Flat_Store_Pseudo <"flat_store_dword", VGPR_32>;
-defm FLAT_STORE_DWORDX2 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx2", VReg_64>;
-defm FLAT_STORE_DWORDX4 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx4", VReg_128>;
-defm FLAT_STORE_DWORDX3 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx3", VReg_96>;
+defm FLAT_STORE_DWORD : FLAT_Flat_Store_Pseudo <"flat_store_dword">;
+defm FLAT_STORE_DWORDX2 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx2", AVLdSt_64>;
+defm FLAT_STORE_DWORDX4 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx4", AVLdSt_128>;
+defm FLAT_STORE_DWORDX3 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx3", AVLdSt_96>;
let SubtargetPredicate = HasD16LoadStore in {
let TiedSourceNotRead = 1 in {
-defm FLAT_LOAD_UBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>;
+defm FLAT_LOAD_UBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_ubyte_d16_hi", AVLdSt_32, 1>;
defm FLAT_LOAD_UBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_ubyte_d16">;
-defm FLAT_LOAD_SBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>;
+defm FLAT_LOAD_SBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_sbyte_d16_hi", AVLdSt_32, 1>;
defm FLAT_LOAD_SBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_sbyte_d16">;
-defm FLAT_LOAD_SHORT_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>;
+defm FLAT_LOAD_SHORT_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_short_d16_hi", AVLdSt_32, 1>;
defm FLAT_LOAD_SHORT_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_short_d16">;
}
-defm FLAT_STORE_BYTE_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>;
-defm FLAT_STORE_SHORT_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>;
+defm FLAT_STORE_BYTE_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_byte_d16_hi">;
+defm FLAT_STORE_SHORT_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_short_d16_hi">;
}
defm FLAT_STORE_BYTE : FLAT_Flat_Store_Pseudo_t16 <"flat_store_byte">;
defm FLAT_STORE_SHORT : FLAT_Flat_Store_Pseudo_t16 <"flat_store_short">;
defm FLAT_ATOMIC_CMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap",
- VGPR_32, i32, v2i32, VReg_64>;
+ AVLdSt_32, i32, v2i32, AVLdSt_64>;
defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap_x2",
- VReg_64, i64, v2i64, VReg_128>;
+ AVLdSt_64, i64, v2i64, AVLdSt_128>;
defm FLAT_ATOMIC_SWAP : FLAT_Atomic_Pseudo <"flat_atomic_swap",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm FLAT_ATOMIC_SWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_swap_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm FLAT_ATOMIC_ADD : FLAT_Atomic_Pseudo <"flat_atomic_add",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm FLAT_ATOMIC_SUB : FLAT_Atomic_Pseudo <"flat_atomic_sub",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm FLAT_ATOMIC_SMIN : FLAT_Atomic_Pseudo <"flat_atomic_smin",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm FLAT_ATOMIC_UMIN : FLAT_Atomic_Pseudo <"flat_atomic_umin",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm FLAT_ATOMIC_SMAX : FLAT_Atomic_Pseudo <"flat_atomic_smax",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm FLAT_ATOMIC_UMAX : FLAT_Atomic_Pseudo <"flat_atomic_umax",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm FLAT_ATOMIC_AND : FLAT_Atomic_Pseudo <"flat_atomic_and",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm FLAT_ATOMIC_OR : FLAT_Atomic_Pseudo <"flat_atomic_or",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm FLAT_ATOMIC_XOR : FLAT_Atomic_Pseudo <"flat_atomic_xor",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm FLAT_ATOMIC_INC : FLAT_Atomic_Pseudo <"flat_atomic_inc",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm FLAT_ATOMIC_DEC : FLAT_Atomic_Pseudo <"flat_atomic_dec",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm FLAT_ATOMIC_ADD_X2 : FLAT_Atomic_Pseudo <"flat_atomic_add_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm FLAT_ATOMIC_SUB_X2 : FLAT_Atomic_Pseudo <"flat_atomic_sub_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm FLAT_ATOMIC_SMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_smin_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm FLAT_ATOMIC_UMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_umin_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm FLAT_ATOMIC_SMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_smax_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm FLAT_ATOMIC_UMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_umax_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm FLAT_ATOMIC_AND_X2 : FLAT_Atomic_Pseudo <"flat_atomic_and_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm FLAT_ATOMIC_OR_X2 : FLAT_Atomic_Pseudo <"flat_atomic_or_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm FLAT_ATOMIC_XOR_X2 : FLAT_Atomic_Pseudo <"flat_atomic_xor_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm FLAT_ATOMIC_INC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_inc_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm FLAT_ATOMIC_DEC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_dec_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
// GFX7-, GFX10-only flat instructions.
let SubtargetPredicate = isGFX7GFX10 in {
defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap_x2",
- VReg_64, f64, v2f64, VReg_128>;
+ AVLdSt_64, f64, v2f64, AVLdSt_128>;
} // End SubtargetPredicate = isGFX7GFX10
@@ -1005,169 +1052,173 @@ defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap_x2",
// choose this as the canonical name.
let SubtargetPredicate = HasAtomicFMinFMaxF64FlatInsts in {
defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo <"flat_atomic_min_f64",
- VReg_64, f64>;
+ AVLdSt_64, f64>;
defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo <"flat_atomic_max_f64",
- VReg_64, f64>;
+ AVLdSt_64, f64>;
}
let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
-defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64>;
-defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>;
+defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", AVLdSt_64, f64>;
+defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", AVLdSt_64, f64>;
}
let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in {
- defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64>;
- defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64>;
+ defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", AVLdSt_64, f64>;
+ defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", AVLdSt_64, f64>;
} // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
let SubtargetPredicate = HasAtomicFlatPkAdd16Insts in {
- defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", VGPR_32, v2f16>;
+ defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", AVLdSt_32, v2f16>;
let FPAtomic = 1 in
- defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", VGPR_32, v2i16>;
+ defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", AVLdSt_32, v2i16>;
} // End SubtargetPredicate = HasAtomicFlatPkAdd16Insts
let SubtargetPredicate = HasAtomicGlobalPkAddBF16Inst, FPAtomic = 1 in
- defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", VGPR_32, v2i16>;
+ defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", AVLdSt_32, v2i16>;
// GFX7-, GFX10-, GFX11-only flat instructions.
let SubtargetPredicate = isGFX7GFX10GFX11 in {
defm FLAT_ATOMIC_FCMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap",
- VGPR_32, f32, v2f32, VReg_64>;
+ AVLdSt_32, f32, v2f32, AVLdSt_64>;
defm FLAT_ATOMIC_FMIN : FLAT_Atomic_Pseudo <"flat_atomic_fmin",
- VGPR_32, f32>;
+ AVLdSt_32, f32>;
defm FLAT_ATOMIC_FMAX : FLAT_Atomic_Pseudo <"flat_atomic_fmax",
- VGPR_32, f32>;
+ AVLdSt_32, f32>;
} // End SubtargetPredicate = isGFX7GFX10GFX11
// GFX942-, GFX11-only flat instructions.
let SubtargetPredicate = HasFlatAtomicFaddF32Inst in {
- defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32>;
+ defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", AVLdSt_32, f32>;
} // End SubtargetPredicate = HasFlatAtomicFaddF32Inst
let SubtargetPredicate = isGFX12Plus in {
- defm FLAT_ATOMIC_CSUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_csub_u32", VGPR_32, i32>;
- defm FLAT_ATOMIC_COND_SUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_cond_sub_u32", VGPR_32, i32>;
-} // End SubtargetPredicate = isGFX12Plus
+ defm FLAT_ATOMIC_CSUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_csub_u32", VGPROp_32, i32>;
+ defm FLAT_ATOMIC_COND_SUB_U32 : FLAT_Atomic_Pseudo_RTN<"flat_atomic_cond_sub_u32", VGPROp_32, i32>;
+}
+
+let SubtargetPredicate = HasAtomicCSubNoRtnInsts in {
+ defm FLAT_ATOMIC_COND_SUB_U32 : FLAT_Atomic_Pseudo_NO_RTN<"flat_atomic_cond_sub_u32", VGPROp_32, i32>;
+}
-defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>;
-defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>;
-defm GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>;
-defm GLOBAL_LOAD_SSHORT : FLAT_Global_Load_Pseudo <"global_load_sshort", VGPR_32>;
-defm GLOBAL_LOAD_DWORD : FLAT_Global_Load_Pseudo <"global_load_dword", VGPR_32>;
-defm GLOBAL_LOAD_DWORDX2 : FLAT_Global_Load_Pseudo <"global_load_dwordx2", VReg_64>;
-defm GLOBAL_LOAD_DWORDX3 : FLAT_Global_Load_Pseudo <"global_load_dwordx3", VReg_96>;
-defm GLOBAL_LOAD_DWORDX4 : FLAT_Global_Load_Pseudo <"global_load_dwordx4", VReg_128>;
+defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte">;
+defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte">;
+defm GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort">;
+defm GLOBAL_LOAD_SSHORT : FLAT_Global_Load_Pseudo <"global_load_sshort">;
+defm GLOBAL_LOAD_DWORD : FLAT_Global_Load_Pseudo <"global_load_dword">;
+defm GLOBAL_LOAD_DWORDX2 : FLAT_Global_Load_Pseudo <"global_load_dwordx2", AVLdSt_64>;
+defm GLOBAL_LOAD_DWORDX3 : FLAT_Global_Load_Pseudo <"global_load_dwordx3", AVLdSt_96>;
+defm GLOBAL_LOAD_DWORDX4 : FLAT_Global_Load_Pseudo <"global_load_dwordx4", AVLdSt_128>;
let TiedSourceNotRead = 1 in {
-defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16_hi", VGPR_32, 1>;
-defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo <"global_load_short_d16_hi", VGPR_32, 1>;
-defm GLOBAL_LOAD_UBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16_hi", VGPR_32, 1>;
+defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16_hi", AVLdSt_32, 1>;
+defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo <"global_load_short_d16_hi", AVLdSt_32, 1>;
+defm GLOBAL_LOAD_UBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16_hi", AVLdSt_32, 1>;
defm GLOBAL_LOAD_SBYTE_D16 : FLAT_Global_Load_Pseudo_t16 <"global_load_sbyte_d16">;
defm GLOBAL_LOAD_SHORT_D16 : FLAT_Global_Load_Pseudo_t16 <"global_load_short_d16">;
defm GLOBAL_LOAD_UBYTE_D16 : FLAT_Global_Load_Pseudo_t16 <"global_load_ubyte_d16">;
}
-defm GLOBAL_STORE_BYTE_D16_HI : FLAT_Global_Store_Pseudo <"global_store_byte_d16_hi", VGPR_32>;
-defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d16_hi", VGPR_32>;
+defm GLOBAL_STORE_BYTE_D16_HI : FLAT_Global_Store_Pseudo <"global_store_byte_d16_hi">;
+defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d16_hi">;
let OtherPredicates = [HasGFX10_BEncoding] in
-defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Global_Load_AddTid_Pseudo <"global_load_dword_addtid", VGPR_32>;
+defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Global_Load_AddTid_Pseudo <"global_load_dword_addtid", VGPROp_32>;
defm GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo_t16 <"global_store_byte">;
defm GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo_t16 <"global_store_short">;
-defm GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword", VGPR_32>;
-defm GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>;
-defm GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>;
-defm GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>;
+defm GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword">;
+defm GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", AVLdSt_64>;
+defm GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", AVLdSt_96>;
+defm GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", AVLdSt_128>;
let OtherPredicates = [HasGFX10_BEncoding] in
-defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Global_Store_AddTid_Pseudo <"global_store_dword_addtid", VGPR_32>;
+defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Global_Store_AddTid_Pseudo <"global_store_dword_addtid", VGPROp_32>;
defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap",
- VGPR_32, i32, v2i32, VReg_64>;
+ AVLdSt_32, i32, v2i32, AVLdSt_64>;
defm GLOBAL_ATOMIC_CMPSWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap_x2",
- VReg_64, i64, v2i64, VReg_128>;
+ AVLdSt_64, i64, v2i64, AVLdSt_128>;
defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_swap",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm GLOBAL_ATOMIC_SWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_swap_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm GLOBAL_ATOMIC_ADD : FLAT_Global_Atomic_Pseudo <"global_atomic_add",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm GLOBAL_ATOMIC_SUB : FLAT_Global_Atomic_Pseudo <"global_atomic_sub",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm GLOBAL_ATOMIC_SMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_smin",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm GLOBAL_ATOMIC_UMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_umin",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm GLOBAL_ATOMIC_SMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_smax",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm GLOBAL_ATOMIC_UMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_umax",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm GLOBAL_ATOMIC_AND : FLAT_Global_Atomic_Pseudo <"global_atomic_and",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm GLOBAL_ATOMIC_OR : FLAT_Global_Atomic_Pseudo <"global_atomic_or",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm GLOBAL_ATOMIC_XOR : FLAT_Global_Atomic_Pseudo <"global_atomic_xor",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm GLOBAL_ATOMIC_INC : FLAT_Global_Atomic_Pseudo <"global_atomic_inc",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm GLOBAL_ATOMIC_DEC : FLAT_Global_Atomic_Pseudo <"global_atomic_dec",
- VGPR_32, i32>;
+ AVLdSt_32, i32>;
defm GLOBAL_ATOMIC_ADD_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_add_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm GLOBAL_ATOMIC_SUB_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_sub_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm GLOBAL_ATOMIC_SMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smin_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm GLOBAL_ATOMIC_UMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umin_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm GLOBAL_ATOMIC_SMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smax_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm GLOBAL_ATOMIC_UMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umax_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm GLOBAL_ATOMIC_AND_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_and_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm GLOBAL_ATOMIC_OR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_or_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_xor_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_inc_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2",
- VReg_64, i64>;
+ AVLdSt_64, i64>;
let SubtargetPredicate = HasGFX10_BEncoding in {
defm GLOBAL_ATOMIC_CSUB : FLAT_Global_Atomic_Pseudo <"global_atomic_csub",
- VGPR_32, i32>;
+ VGPROp_32, i32>;
}
defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ubyte">;
@@ -1182,10 +1233,10 @@ defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwo
}
let SubtargetPredicate = isGFX12PlusNot12_50 in
- defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>;
+ defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VGPROp_64, i64>;
let SubtargetPredicate = isGFX12Plus in {
- defm GLOBAL_ATOMIC_COND_SUB_U32 : FLAT_Global_Atomic_Pseudo <"global_atomic_cond_sub_u32", VGPR_32, i32>;
+ defm GLOBAL_ATOMIC_COND_SUB_U32 : FLAT_Global_Atomic_Pseudo <"global_atomic_cond_sub_u32", VGPROp_32, i32>;
def GLOBAL_INV : FLAT_Global_Invalidate_Writeback<"global_inv">;
def GLOBAL_WB : FLAT_Global_Invalidate_Writeback<"global_wb">;
@@ -1194,6 +1245,12 @@ let SubtargetPredicate = isGFX12Plus in {
let SubtargetPredicate = isGFX1250Plus in {
+let Uses = [M0, EXEC, ASYNCcnt], WaveSizePredicate = isWave32 in {
+defm CLUSTER_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"cluster_load_async_to_lds_b8", 1>;
+defm CLUSTER_LOAD_ASYNC_TO_LDS_B32 : FLAT_Global_Load_LDS_Pseudo<"cluster_load_async_to_lds_b32", 1>;
+defm CLUSTER_LOAD_ASYNC_TO_LDS_B64 : FLAT_Global_Load_LDS_Pseudo<"cluster_load_async_to_lds_b64", 1>;
+defm CLUSTER_LOAD_ASYNC_TO_LDS_B128 : FLAT_Global_Load_LDS_Pseudo<"cluster_load_async_to_lds_b128", 1>;
+} // End Uses = [M0, EXEC, ASYNCcnt], WaveSizePredicate = isWave32
defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b8", 1>;
defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b32", 1>;
defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b64", 1>;
@@ -1207,33 +1264,33 @@ def TENSOR_SAVE : FLAT_Global_Tensor_Pseudo<"tensor_save", 1>;
def TENSOR_STOP : FLAT_Global_Tensor_Pseudo<"tensor_stop">;
} // End SubtargetPredicate = isGFX1250Plus
-defm SCRATCH_LOAD_UBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte", VGPR_32>;
-defm SCRATCH_LOAD_SBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte", VGPR_32>;
-defm SCRATCH_LOAD_USHORT : FLAT_Scratch_Load_Pseudo <"scratch_load_ushort", VGPR_32>;
-defm SCRATCH_LOAD_SSHORT : FLAT_Scratch_Load_Pseudo <"scratch_load_sshort", VGPR_32>;
-defm SCRATCH_LOAD_DWORD : FLAT_Scratch_Load_Pseudo <"scratch_load_dword", VGPR_32>;
-defm SCRATCH_LOAD_DWORDX2 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx2", VReg_64>;
-defm SCRATCH_LOAD_DWORDX3 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx3", VReg_96>;
-defm SCRATCH_LOAD_DWORDX4 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx4", VReg_128>;
+defm SCRATCH_LOAD_UBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte">;
+defm SCRATCH_LOAD_SBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte">;
+defm SCRATCH_LOAD_USHORT : FLAT_Scratch_Load_Pseudo <"scratch_load_ushort">;
+defm SCRATCH_LOAD_SSHORT : FLAT_Scratch_Load_Pseudo <"scratch_load_sshort">;
+defm SCRATCH_LOAD_DWORD : FLAT_Scratch_Load_Pseudo <"scratch_load_dword">;
+defm SCRATCH_LOAD_DWORDX2 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx2", AVLdSt_64>;
+defm SCRATCH_LOAD_DWORDX3 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx3", AVLdSt_96>;
+defm SCRATCH_LOAD_DWORDX4 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx4", AVLdSt_128>;
let TiedSourceNotRead = 1 in {
-defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte_d16_hi", VGPR_32, 1>;
-defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte_d16_hi", VGPR_32, 1>;
-defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_short_d16_hi", VGPR_32, 1>;
+defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte_d16_hi", AVLdSt_32, 1>;
+defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte_d16_hi", AVLdSt_32, 1>;
+defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_short_d16_hi", AVLdSt_32, 1>;
defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Scratch_Load_Pseudo_t16 <"scratch_load_ubyte_d16">;
defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Scratch_Load_Pseudo_t16 <"scratch_load_sbyte_d16">;
defm SCRATCH_LOAD_SHORT_D16 : FLAT_Scratch_Load_Pseudo_t16 <"scratch_load_short_d16">;
}
-defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_byte_d16_hi", VGPR_32>;
-defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_short_d16_hi", VGPR_32>;
+defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_byte_d16_hi">;
+defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_short_d16_hi">;
defm SCRATCH_STORE_BYTE : FLAT_Scratch_Store_Pseudo_t16 <"scratch_store_byte">;
defm SCRATCH_STORE_SHORT : FLAT_Scratch_Store_Pseudo_t16 <"scratch_store_short">;
-defm SCRATCH_STORE_DWORD : FLAT_Scratch_Store_Pseudo <"scratch_store_dword", VGPR_32>;
-defm SCRATCH_STORE_DWORDX2 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx2", VReg_64>;
-defm SCRATCH_STORE_DWORDX3 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx3", VReg_96>;
-defm SCRATCH_STORE_DWORDX4 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx4", VReg_128>;
+defm SCRATCH_STORE_DWORD : FLAT_Scratch_Store_Pseudo <"scratch_store_dword">;
+defm SCRATCH_STORE_DWORDX2 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx2", AVLdSt_64>;
+defm SCRATCH_STORE_DWORDX3 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx3", AVLdSt_96>;
+defm SCRATCH_STORE_DWORDX4 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx4", AVLdSt_128>;
defm SCRATCH_LOAD_LDS_UBYTE : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_ubyte">;
defm SCRATCH_LOAD_LDS_SBYTE : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sbyte">;
@@ -1242,69 +1299,77 @@ defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_s
defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">;
let SubtargetPredicate = isGFX125xOnly in {
-defm FLAT_LOAD_MONITOR_B32 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b32", VGPR_32>;
-defm FLAT_LOAD_MONITOR_B64 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b64", VReg_64>;
-defm FLAT_LOAD_MONITOR_B128 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b128", VReg_128>;
+defm FLAT_LOAD_MONITOR_B32 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b32", VGPROp_32>;
+defm FLAT_LOAD_MONITOR_B64 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b64", VGPROp_64>;
+defm FLAT_LOAD_MONITOR_B128 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b128", VGPROp_128>;
-defm GLOBAL_LOAD_MONITOR_B32 : FLAT_Global_Load_Pseudo <"global_load_monitor_b32", VGPR_32>;
-defm GLOBAL_LOAD_MONITOR_B64 : FLAT_Global_Load_Pseudo <"global_load_monitor_b64", VReg_64>;
-defm GLOBAL_LOAD_MONITOR_B128 : FLAT_Global_Load_Pseudo <"global_load_monitor_b128", VReg_128>;
+defm GLOBAL_LOAD_MONITOR_B32 : FLAT_Global_Load_Pseudo <"global_load_monitor_b32", VGPROp_32>;
+defm GLOBAL_LOAD_MONITOR_B64 : FLAT_Global_Load_Pseudo <"global_load_monitor_b64", VGPROp_64>;
+defm GLOBAL_LOAD_MONITOR_B128 : FLAT_Global_Load_Pseudo <"global_load_monitor_b128", VGPROp_128>;
} // End SubtargetPredicate = isGFX125xOnly
+let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 in {
+let Uses = [M0, EXEC] in { // Use M0 for broadcast workgroup mask.
+defm CLUSTER_LOAD_B32 : FLAT_Global_Load_Pseudo <"cluster_load_b32", VGPROp_32>;
+defm CLUSTER_LOAD_B64 : FLAT_Global_Load_Pseudo <"cluster_load_b64", VGPROp_64>;
+defm CLUSTER_LOAD_B128 : FLAT_Global_Load_Pseudo <"cluster_load_b128", VGPROp_128>;
+} // End Uses = [M0, EXEC]
+} // End SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32
+
let SubtargetPredicate = isGFX12Plus in {
let Uses = [EXEC, M0] in {
- defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VReg_1024>;
- defm GLOBAL_STORE_BLOCK : FLAT_Global_Store_Pseudo <"global_store_block", VReg_1024>;
+ defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VGPROp_1024>;
+ defm GLOBAL_STORE_BLOCK : FLAT_Global_Store_Pseudo <"global_store_block", VGPROp_1024>;
}
let Uses = [EXEC, FLAT_SCR, M0] in {
- defm SCRATCH_LOAD_BLOCK : FLAT_Scratch_Load_Pseudo <"scratch_load_block", VReg_1024>;
- defm SCRATCH_STORE_BLOCK : FLAT_Scratch_Store_Pseudo <"scratch_store_block", VReg_1024>;
+ defm SCRATCH_LOAD_BLOCK : FLAT_Scratch_Load_Pseudo <"scratch_load_block", VGPROp_1024>;
+ defm SCRATCH_STORE_BLOCK : FLAT_Scratch_Store_Pseudo <"scratch_store_block", VGPROp_1024>;
}
let WaveSizePredicate = isWave32 in {
- defm GLOBAL_LOAD_TR_B128_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b128", VReg_128>;
- defm GLOBAL_LOAD_TR_B64_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b64", VReg_64>;
+ defm GLOBAL_LOAD_TR_B128_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b128", VGPROp_128>;
+ defm GLOBAL_LOAD_TR_B64_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b64", VGPROp_64>;
}
} // End SubtargetPredicate = isGFX12Plus
let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in {
let Mnemonic = "global_load_tr_b128" in
- defm GLOBAL_LOAD_TR_B128_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w64", VReg_64>;
+ defm GLOBAL_LOAD_TR_B128_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w64", VGPROp_64>;
let Mnemonic = "global_load_tr_b64" in
- defm GLOBAL_LOAD_TR_B64_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w64", VGPR_32>;
+ defm GLOBAL_LOAD_TR_B64_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w64", VGPROp_32>;
}
let WaveSizePredicate = isWave32, SubtargetPredicate = HasTransposeLoadF4F6Insts in {
- defm GLOBAL_LOAD_TR6_B96 : FLAT_Global_Load_Pseudo <"global_load_tr6_b96", VReg_96>;
- defm GLOBAL_LOAD_TR4_B64 : FLAT_Global_Load_Pseudo <"global_load_tr4_b64", VReg_64>;
+ defm GLOBAL_LOAD_TR6_B96 : FLAT_Global_Load_Pseudo <"global_load_tr6_b96", VGPROp_96>;
+ defm GLOBAL_LOAD_TR4_B64 : FLAT_Global_Load_Pseudo <"global_load_tr4_b64", VGPROp_64>;
}
let SubtargetPredicate = isGFX10Plus in {
defm GLOBAL_ATOMIC_FCMPSWAP :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, v2f32, VReg_64>;
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", AVLdSt_32, f32, v2f32, AVLdSt_64>;
defm GLOBAL_ATOMIC_FMIN :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32>;
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", AVLdSt_32, f32>;
defm GLOBAL_ATOMIC_FMAX :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32>;
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", AVLdSt_32, f32>;
defm GLOBAL_ATOMIC_FCMPSWAP_X2 :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64, v2f64, VReg_128>;
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", AVLdSt_64, f64, v2f64, AVLdSt_128>;
} // End SubtargetPredicate = isGFX10Plus
-let OtherPredicates = [HasAtomicFaddNoRtnInsts] in
+let SubtargetPredicate = HasAtomicFaddNoRtnInsts in
defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN <
- "global_atomic_add_f32", VGPR_32, f32
+ "global_atomic_add_f32", AVLdSt_32, f32
>;
-let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in
+let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN <
- "global_atomic_pk_add_f16", VGPR_32, v2f16
+ "global_atomic_pk_add_f16", AVLdSt_32, v2f16
>;
-let OtherPredicates = [HasAtomicFaddRtnInsts] in
+let SubtargetPredicate = HasAtomicFaddRtnInsts in
defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN <
- "global_atomic_add_f32", VGPR_32, f32
+ "global_atomic_add_f32", AVLdSt_32, f32
>;
-let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in
+let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN <
- "global_atomic_pk_add_f16", VGPR_32, v2f16
+ "global_atomic_pk_add_f16", AVLdSt_32, v2f16
>;
let SubtargetPredicate = HasVmemPrefInsts in {
@@ -1362,6 +1427,16 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT
(inst $saddr, $voffset, $offset, $cpol)
>;
+class FlatLoadLDSSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol), M0),
+ (inst $dsaddr, $vaddr, $offset, $cpol)
+>;
+
+class GlobalLoadLDSSaddrPat_M0 <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node (GlobalSAddrNoIOffsetM0 (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm), M0),
+ (inst $dsaddr, $saddr, $voffset, $offset, $cpol)
+>;
+
class FlatLoadLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
(node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)),
(inst $dsaddr, $vaddr, $offset, $cpol)
@@ -1397,6 +1472,16 @@ class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
(inst $saddr, $voffset, $offset, $cpol)
>;
+class FlatLoadSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol), M0)),
+ (inst $vaddr, $offset, $cpol)
+>;
+
+class GlobalLoadSaddrPat_M0 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalSAddrCPolM0 (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm), M0)),
+ (inst $saddr, $voffset, $offset, $cpol)
+>;
+
class FlatLoadSignedPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol))),
(inst $vaddr, $offset, $cpol)
@@ -1416,8 +1501,10 @@ class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
class FlatAtomicSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ComplexPattern pat,
ValueType vt, ValueType data_vt = vt> : GCNPat <
(vt (node (pat (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), data_vt:$data)),
- (inst $voffset, getVregSrcForVT<data_vt>.ret:$data, $saddr, $offset, $cpol)
->;
+ (inst $voffset, getVregSrcForVT<data_vt>.ret:$data, $saddr, $offset, $cpol)> {
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
+}
class GlobalAtomicNoRtnSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
@@ -1443,19 +1530,24 @@ class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node,
(inst $vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
>;
-multiclass FlatAtomicNoRtnPatBase <string inst, string node, ValueType vt,
+multiclass FlatAtomicNoRtnPatBase <string base_inst_name, string node, ValueType vt,
ValueType data_vt = vt> {
-
+ defvar inst = !cast<FLAT_Pseudo>(base_inst_name);
+ defvar inst_saddr = !cast<FLAT_Pseudo>(inst#"_SADDR");
defvar noRtnNode = !cast<PatFrags>(node);
let AddedComplexity = 1 in
def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
- (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+ (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> {
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
+ }
- def : FlatAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<SDPatternOperator>(node),
+ def : FlatAtomicSaddrPat<inst_saddr, !cast<SDPatternOperator>(node),
GlobalSAddr, vt, data_vt> {
let AddedComplexity = 9;
- let SubtargetPredicate = HasFlatGVSMode;
+ let SubtargetPredicate = inst_saddr.SubtargetPredicate;
+ let OtherPredicates = inst_saddr.OtherPredicates;
}
}
@@ -1468,17 +1560,22 @@ multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt,
FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt), vt, data_vt>;
-multiclass FlatAtomicRtnPatBase <string inst, string node, ValueType vt,
+multiclass FlatAtomicRtnPatBase <string inst_name, string node, ValueType vt,
ValueType data_vt = vt> {
-
+ defvar inst = !cast<FLAT_Pseudo>(inst_name#"_RTN");
+ defvar inst_saddr = !cast<FLAT_Pseudo>(inst_name#"_SADDR_RTN");
defvar rtnNode = !cast<SDPatternOperator>(node);
def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
- (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+ (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> {
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
+ }
- def : FlatAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, GlobalSAddrGLC, vt, data_vt> {
+ def : FlatAtomicSaddrPat<inst_saddr, rtnNode, GlobalSAddrGLC, vt, data_vt> {
let AddedComplexity = 8;
- let SubtargetPredicate = HasFlatGVSMode;
+ let SubtargetPredicate = inst_saddr.SubtargetPredicate;
+ let OtherPredicates = inst_saddr.OtherPredicates;
}
}
@@ -1514,8 +1611,10 @@ multiclass FlatAtomicIntrPat <string inst, string node, ValueType vt,
class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt, ValueType data_vt = vt> : GCNPat <
(vt (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data)),
- (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
->;
+ (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> {
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
+}
multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt,
ValueType data_vt = vt, int complexity = 0,
@@ -1592,6 +1691,16 @@ class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va
(inst $vaddr, $saddr, $offset, $cpol)
>;
+multiclass GlobalLoadLDSPats_M0<FLAT_Pseudo inst, SDPatternOperator node> {
+ def : FlatLoadLDSSignedPat_M0 <inst, node> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalLoadLDSSaddrPat_M0<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> {
+ let AddedComplexity = 11;
+ }
+}
+
multiclass GlobalLoadLDSPats<FLAT_Pseudo inst, SDPatternOperator node> {
def : FlatLoadLDSSignedPat <inst, node> {
let AddedComplexity = 10;
@@ -1615,20 +1724,42 @@ multiclass GlobalStoreLDSPats<FLAT_Pseudo inst, SDPatternOperator node> {
multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatLoadSignedPat <inst, node, vt> {
let AddedComplexity = 10;
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
}
def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 11;
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
+ }
+}
+
+multiclass GlobalFLATLoadPats_M0<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadSignedPat_M0 <inst, node, vt> {
+ let AddedComplexity = 10;
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
+ }
+
+ def : GlobalLoadSaddrPat_M0<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 11;
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
}
}
multiclass GlobalFLATLoadPats_CPOL<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatLoadSignedPat_CPOL<inst, node, vt> {
let AddedComplexity = 10;
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
}
def : GlobalLoadSaddrPat_CPOL<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 11;
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
}
}
@@ -1655,10 +1786,14 @@ multiclass GlobalFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Value
multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> {
def : FlatStoreSignedPat <inst, node, vt> {
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
let AddedComplexity = 10;
}
def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
let AddedComplexity = 11;
}
}
@@ -1803,7 +1938,9 @@ multiclass ScratchFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Valu
}
multiclass FlatLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
- def : FlatLoadPat <inst, node, vt>;
+ def : FlatLoadPat <inst, node, vt> {
+ let OtherPredicates = [HasFlatAddressSpace];
+ }
def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 9;
@@ -1830,7 +1967,9 @@ multiclass FlatLoadPats_D16_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueT
}
multiclass FlatStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
- def : FlatStorePat <inst, node, vt>;
+ def : FlatStorePat <inst, node, vt> {
+ let OtherPredicates = [HasFlatAddressSpace];
+ }
def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 9;
@@ -1847,8 +1986,6 @@ multiclass FlatStorePats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType
}
}
-let OtherPredicates = [HasFlatAddressSpace] in {
-
defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>;
defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>;
defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>;
@@ -1898,6 +2035,7 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi
defm : FlatLoadPats <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>;
defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>;
defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, v2i32>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX4, atomic_load_nonext_128_flat, v4i32>;
defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
defm : FlatStorePats <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
@@ -1922,6 +2060,7 @@ defm : FlatStorePats <FLAT_STORE_DWORDX4, store_flat, vt>;
defm : FlatStorePats <FLAT_STORE_DWORD, atomic_store_32_flat, i32>;
defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>;
defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, v2i32>;
+defm : FlatStorePats <FLAT_STORE_DWORDX4, atomic_store_128_flat, v4i32>;
defm : FlatStorePats <FLAT_STORE_BYTE, atomic_store_8_flat, i32>;
defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i32>;
@@ -1970,12 +2109,7 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>;
-} // End OtherPredicates = [HasFlatAddressSpace]
-
-let OtherPredicates = [isGFX12Plus] in
defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
-
-let OtherPredicates = [isGFX12Plus, HasAtomicCSubNoRtnInsts] in
defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
let OtherPredicates = [HasD16LoadStore] in {
@@ -2000,8 +2134,6 @@ defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>;
defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
}
-let OtherPredicates = [HasFlatGlobalInsts] in {
-
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_aext_8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_zext_8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_aext_16_global, i32>;
@@ -2015,7 +2147,7 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, zextloadi16_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SSHORT, sextloadi16_global, i32>;
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
-let OtherPredicates = [HasFlatGlobalInsts], True16Predicate = p in {
+let True16Predicate = p in {
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, extloadi8_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, zextloadi8_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, sextloadi8_global, i16>;
@@ -2029,7 +2161,7 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_nonext_16_global, i16
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>;
}
-let OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
+let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", extloadi8_global, i16>;
defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", zextloadi8_global, i16>;
defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", sextloadi8_global, i16>;
@@ -2068,6 +2200,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX4, store_global, vt>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORD, atomic_load_nonext_32_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, i64>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, v2i32>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX4, atomic_load_nonext_128_global, v4i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>;
@@ -2108,6 +2241,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, v2i32>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX4, atomic_store_128_global, v4i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>;
@@ -2124,7 +2258,7 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_glo
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>;
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
-let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+let SubtargetPredicate = HasAtomicCSubNoRtnInsts in
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>;
@@ -2144,7 +2278,7 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i
let SubtargetPredicate = isGFX12Plus in {
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
- let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ let SubtargetPredicate = HasAtomicCSubNoRtnInsts in
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
}
@@ -2179,6 +2313,15 @@ let OtherPredicates = [isGFX125xOnly] in {
} // End OtherPredicates = [isGFX125xOnly]
let OtherPredicates = [isGFX1250Plus] in {
+ defm : GlobalFLATLoadPats_M0 <CLUSTER_LOAD_B32, int_amdgcn_cluster_load_b32, i32>;
+ defm : GlobalFLATLoadPats_M0 <CLUSTER_LOAD_B64, int_amdgcn_cluster_load_b64, v2i32>;
+ defm : GlobalFLATLoadPats_M0 <CLUSTER_LOAD_B128, int_amdgcn_cluster_load_b128, v4i32>;
+
+ defm : GlobalLoadLDSPats_M0 <CLUSTER_LOAD_ASYNC_TO_LDS_B8, int_amdgcn_cluster_load_async_to_lds_b8>;
+ defm : GlobalLoadLDSPats_M0 <CLUSTER_LOAD_ASYNC_TO_LDS_B32, int_amdgcn_cluster_load_async_to_lds_b32>;
+ defm : GlobalLoadLDSPats_M0 <CLUSTER_LOAD_ASYNC_TO_LDS_B64, int_amdgcn_cluster_load_async_to_lds_b64>;
+ defm : GlobalLoadLDSPats_M0 <CLUSTER_LOAD_ASYNC_TO_LDS_B128, int_amdgcn_cluster_load_async_to_lds_b128>;
+
defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B8, int_amdgcn_global_load_async_to_lds_b8>;
defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B32, int_amdgcn_global_load_async_to_lds_b32>;
defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B64, int_amdgcn_global_load_async_to_lds_b64>;
@@ -2190,62 +2333,38 @@ let OtherPredicates = [isGFX1250Plus] in {
defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B128, int_amdgcn_global_store_async_from_lds_b128>;
}
-let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
-}
-
-let SubtargetPredicate = HasAtomicFMinFMaxF32FlatInsts in {
defm : FlatAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>;
defm : FlatAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>;
-}
-let OtherPredicates = [isGFX12Only] in {
- // FIXME: Remove these intrinsics
- defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>;
- defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>;
- defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>;
- defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>;
+// FIXME: Remove these intrinsics
+let SubtargetPredicate = isGFX12Only in {
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>;
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>;
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>;
}
-let OtherPredicates = [HasAtomicFaddNoRtnInsts] in {
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>;
-}
-let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in {
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
-}
-let OtherPredicates = [HasAtomicFaddRtnInsts] in {
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>;
-}
-let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
-}
-let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>;
-}
-let OtherPredicates = [HasFlatBufferGlobalAtomicFaddF64Inst] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>;
-}
-let OtherPredicates = [HasFlatAtomicFaddF32Inst] in {
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>;
-}
-
-let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in {
defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>;
defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>;
-}
-let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>;
-} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {
@@ -2566,6 +2685,7 @@ multiclass FLAT_Real_Atomics_vi <bits<7> op,
defvar ps = !cast<FLAT_Pseudo>(NAME);
def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
+ def _RTN_agpr_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>;
}
multiclass FLAT_Global_Real_Atomics_vi<bits<7> op,
@@ -2573,8 +2693,10 @@ multiclass FLAT_Global_Real_Atomics_vi<bits<7> op,
FLAT_Real_AllAddr_vi<op, has_sccb> {
def _RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
def _SADDR_RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
-}
+ def _RTN_agpr_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>;
+ def _SADDR_RTN_agpr_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>;
+}
defm FLAT_ATOMIC_SWAP : FLAT_Real_Atomics_vi <0x40>;
defm FLAT_ATOMIC_CMPSWAP : FLAT_Real_Atomics_vi <0x41>;
@@ -3473,6 +3595,14 @@ defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>;
defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>;
defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>;
+defm CLUSTER_LOAD_B32 : VFLAT_Real_AllAddr_gfx1250<0x067>;
+defm CLUSTER_LOAD_B64 : VFLAT_Real_AllAddr_gfx1250<0x068>;
+defm CLUSTER_LOAD_B128 : VFLAT_Real_AllAddr_gfx1250<0x069>;
+
+defm CLUSTER_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x6a>;
+defm CLUSTER_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x6b>;
+defm CLUSTER_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x6c>;
+defm CLUSTER_LOAD_ASYNC_TO_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x6d>;
defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x5f>;
defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x60>;
defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x61>;
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 184929a5a50f..8821558bb023 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -193,16 +193,6 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
return &OldOpnd;
}
-[[maybe_unused]] static unsigned getOperandSize(MachineInstr &MI, unsigned Idx,
- MachineRegisterInfo &MRI) {
- int16_t RegClass = MI.getDesc().operands()[Idx].RegClass;
- if (RegClass == -1)
- return 0;
-
- const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
- return TRI->getRegSizeInBits(*TRI->getRegClass(RegClass));
-}
-
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
MachineInstr &MovMI,
RegSubRegPair CombOldVGPR,
@@ -250,7 +240,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
++NumOperands;
}
if (auto *SDst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) {
- if (TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) {
+ if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::sdst)) {
DPPInst.add(*SDst);
++NumOperands;
}
@@ -295,12 +285,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
}
auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
assert(Src0);
- int Src0Idx = NumOperands;
- if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
- LLVM_DEBUG(dbgs() << " failed: src0 is illegal\n");
- Fail = true;
- break;
- }
+ [[maybe_unused]] int Src0Idx = NumOperands;
+
DPPInst.add(*Src0);
DPPInst->getOperand(NumOperands).setIsKill(false);
++NumOperands;
@@ -319,21 +305,17 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
}
auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
if (Src1) {
- int OpNum = NumOperands;
+ assert(AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src1) &&
+ "dpp version of instruction missing src1");
// If subtarget does not support SGPRs for src1 operand then the
// requirements are the same as for src0. We check src0 instead because
// pseudos are shared between subtargets and allow SGPR for src1 on all.
if (!ST->hasDPPSrc1SGPR()) {
- assert(getOperandSize(*DPPInst, Src0Idx, *MRI) ==
- getOperandSize(*DPPInst, NumOperands, *MRI) &&
+ assert(TII->getOpSize(*DPPInst, Src0Idx) ==
+ TII->getOpSize(*DPPInst, NumOperands) &&
"Src0 and Src1 operands should have the same size");
- OpNum = Src0Idx;
- }
- if (!TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) {
- LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n");
- Fail = true;
- break;
}
+
DPPInst.add(*Src1);
++NumOperands;
}
@@ -349,9 +331,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
}
auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
if (Src2) {
- if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) ||
- !TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
- LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n");
+ if (!AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src2)) {
+ LLVM_DEBUG(dbgs() << " failed: dpp does not have src2\n");
Fail = true;
break;
}
@@ -431,6 +412,24 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
DPPInst.addImm(CombBCZ ? 1 : 0);
+
+ constexpr AMDGPU::OpName Srcs[] = {
+ AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
+
+ // FIXME: isOperandLegal expects to operate on a completely built
+ // instruction. We should have better legality APIs to check if the
+ // candidate operands will be legal without building the instruction first.
+ for (auto [I, OpName] : enumerate(Srcs)) {
+ int OpIdx = AMDGPU::getNamedOperandIdx(DPPOp, OpName);
+ if (OpIdx == -1)
+ break;
+
+ if (!TII->isOperandLegal(*DPPInst, OpIdx)) {
+ LLVM_DEBUG(dbgs() << " failed: src" << I << " operand is illegal\n");
+ Fail = true;
+ break;
+ }
+ }
} while (false);
if (Fail) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 931966b6df1d..7b94ea3ffbf1 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -577,6 +577,7 @@ GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
unsigned MaxNumVGPRs = MaxVectorRegs;
unsigned MaxNumAGPRs = 0;
+ unsigned NumArchVGPRs = has1024AddressableVGPRs() ? 1024 : 256;
// On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
// a wave may have up to 512 total vector registers combining together both
@@ -589,7 +590,6 @@ GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
if (hasGFX90AInsts()) {
unsigned MinNumAGPRs = 0;
const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
- const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};
@@ -614,11 +614,11 @@ GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);
- MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs);
+ MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, NumArchVGPRs);
MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);
assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
- MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs &&
+ MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= NumArchVGPRs &&
"invalid register counts");
} else if (hasMAIInsts()) {
// On gfx908 the number of AGPRs always equals the number of VGPRs.
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 2a8385df3f93..cbd6f64976d2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -198,6 +198,7 @@ protected:
bool DynamicVGPR = false;
bool DynamicVGPRBlockSize32 = false;
bool HasVMemToLDSLoad = false;
+ bool RequiresAlignVGPR = false;
// This should not be used directly. 'TargetID' tracks the dynamic settings
// for SRAMECC.
@@ -235,6 +236,7 @@ protected:
bool HasPseudoScalarTrans = false;
bool HasRestrictedSOffset = false;
bool Has64BitLiterals = false;
+ bool Has1024AddressableVGPRs = false;
bool HasBitOp3Insts = false;
bool HasTanhInsts = false;
bool HasTensorCvtLutInsts = false;
@@ -250,7 +252,6 @@ protected:
bool HasVmemPrefInsts = false;
bool HasSafeSmemPrefetch = false;
bool HasSafeCUPrefetch = false;
- bool HasCUStores = false;
bool HasVcmpxExecWARHazard = false;
bool HasLdsBranchVmemWARHazard = false;
bool HasNSAtoVMEMBug = false;
@@ -1015,8 +1016,6 @@ public:
bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; }
- bool hasCUStores() const { return HasCUStores; }
-
// Has s_cmpk_* instructions.
bool hasSCmpK() const { return getGeneration() < GFX12; }
@@ -1350,7 +1349,7 @@ public:
}
/// Return if operations acting on VGPR tuples require even alignment.
- bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; }
+ bool needsAlignedVGPRs() const { return RequiresAlignVGPR; }
/// Return true if the target has the S_PACK_HL_B32_B16 instruction.
bool hasSPackHL() const { return GFX11Insts; }
@@ -1436,6 +1435,8 @@ public:
bool hasAddPC64Inst() const { return GFX1250Insts; }
+ bool has1024AddressableVGPRs() const { return Has1024AddressableVGPRs; }
+
bool hasMinimum3Maximum3PKF16() const {
return HasMinimum3Maximum3PKF16;
}
@@ -1831,6 +1832,13 @@ public:
bool hasScratchBaseForwardingHazard() const {
return GFX1250Insts && getGeneration() == GFX12;
}
+
+ /// \returns true if the subtarget supports clusters of workgroups.
+ bool hasClusters() const { return GFX1250Insts; }
+
+ /// \returns true if the subtarget requires a wait for xcnt before atomic
+ /// flat/global stores & rmw.
+ bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
};
class GCNUserSGPRUsageInfo {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index aafbdc2e86a9..f098e7a3c6c6 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -80,12 +80,9 @@ void AMDGPUInstPrinter::printFP64ImmOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
// KIMM64
- // This part needs to align with AMDGPUInstPrinter::printImmediate64.
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
uint64_t Imm = MI->getOperand(OpNo).getImm();
- if (STI.hasFeature(AMDGPU::Feature64BitLiterals) && Lo_32(Imm))
- O << "lit64(" << formatHex(static_cast<uint64_t>(Imm)) << ')';
- else
- O << formatHex(static_cast<uint64_t>(Hi_32(Imm)));
+ printLiteral64(Desc, Imm, STI, O, /*IsFP=*/true);
}
void AMDGPUInstPrinter::printNamedBit(const MCInst *MI, unsigned OpNo,
@@ -327,6 +324,54 @@ void AMDGPUInstPrinter::printSymbolicFormat(const MCInst *MI,
}
}
+// \returns a low 256 vgpr representing a high vgpr \p Reg [v256..v1023] or
+// \p Reg itself otherwise.
+static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) {
+ unsigned Enc = MRI.getEncodingValue(Reg);
+ unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
+ if (Idx < 0x100)
+ return Reg;
+
+ const MCRegisterClass *RC = getVGPRPhysRegClass(Reg, MRI);
+ return RC->getRegister(Idx % 0x100);
+}
+
+// Restore MSBs of a VGPR above 255 from the MCInstrAnalysis.
+static MCPhysReg getRegFromMIA(MCPhysReg Reg, unsigned OpNo,
+ const MCInstrDesc &Desc,
+ const MCRegisterInfo &MRI,
+ const AMDGPUMCInstrAnalysis &MIA) {
+ unsigned VgprMSBs = MIA.getVgprMSBs();
+ if (!VgprMSBs)
+ return Reg;
+
+ unsigned Enc = MRI.getEncodingValue(Reg);
+ if (!(Enc & AMDGPU::HWEncoding::IS_VGPR))
+ return Reg;
+
+ auto Ops = AMDGPU::getVGPRLoweringOperandTables(Desc);
+ if (!Ops.first)
+ return Reg;
+ unsigned Opc = Desc.getOpcode();
+ unsigned I;
+ for (I = 0; I < 4; ++I) {
+ if (Ops.first[I] != AMDGPU::OpName::NUM_OPERAND_NAMES &&
+ (unsigned)AMDGPU::getNamedOperandIdx(Opc, Ops.first[I]) == OpNo)
+ break;
+ if (Ops.second && Ops.second[I] != AMDGPU::OpName::NUM_OPERAND_NAMES &&
+ (unsigned)AMDGPU::getNamedOperandIdx(Opc, Ops.second[I]) == OpNo)
+ break;
+ }
+ if (I == 4)
+ return Reg;
+ unsigned OpMSBs = (VgprMSBs >> (I * 2)) & 3;
+ if (!OpMSBs)
+ return Reg;
+ if (MCRegister NewReg = AMDGPU::getVGPRWithMSBs(Reg, OpMSBs, MRI))
+ return NewReg;
+ return Reg;
+}
+
void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, raw_ostream &O,
const MCRegisterInfo &MRI) {
#if !defined(NDEBUG)
@@ -340,7 +385,20 @@ void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, raw_ostream &O,
}
#endif
- O << getRegisterName(Reg);
+ unsigned PrintReg = getRegForPrinting(Reg, MRI);
+ O << getRegisterName(PrintReg);
+
+ if (PrintReg != Reg.id())
+ O << " /*" << getRegisterName(Reg) << "*/";
+}
+
+void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, unsigned Opc,
+ unsigned OpNo, raw_ostream &O,
+ const MCRegisterInfo &MRI) {
+ if (MIA)
+ Reg = getRegFromMIA(Reg, OpNo, MII.get(Opc), MRI,
+ *static_cast<const AMDGPUMCInstrAnalysis *>(MIA));
+ printRegOperand(Reg, O, MRI);
}
void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
@@ -594,7 +652,7 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
O << formatHex(static_cast<uint64_t>(Imm));
}
-void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
+void AMDGPUInstPrinter::printImmediate64(const MCInstrDesc &Desc, uint64_t Imm,
const MCSubtargetInfo &STI,
raw_ostream &O, bool IsFP) {
int64_t SImm = static_cast<int64_t>(Imm);
@@ -624,18 +682,24 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
else if (Imm == 0x3fc45f306dc9c882 &&
STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
O << "0.15915494309189532";
- else {
- // This part needs to align with AMDGPUOperand::addLiteralImmOperand.
- if (IsFP) {
- if (STI.hasFeature(AMDGPU::Feature64BitLiterals) && Lo_32(Imm))
- O << "lit64(" << formatHex(static_cast<uint64_t>(Imm)) << ')';
- else
- O << formatHex(static_cast<uint64_t>(Hi_32(Imm)));
- return;
- }
+ else
+ printLiteral64(Desc, Imm, STI, O, IsFP);
+}
- if (STI.hasFeature(AMDGPU::Feature64BitLiterals) &&
- (!isInt<32>(Imm) || !isUInt<32>(Imm)))
+void AMDGPUInstPrinter::printLiteral64(const MCInstrDesc &Desc, uint64_t Imm,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O, bool IsFP) {
+ // This part needs to align with AMDGPUOperand::addLiteralImmOperand.
+ bool CanUse64BitLiterals =
+ STI.hasFeature(AMDGPU::Feature64BitLiterals) &&
+ !(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P));
+ if (IsFP) {
+ if (CanUse64BitLiterals && Lo_32(Imm))
+ O << "lit64(" << formatHex(static_cast<uint64_t>(Imm)) << ')';
+ else
+ O << formatHex(static_cast<uint64_t>(Hi_32(Imm)));
+ } else {
+ if (CanUse64BitLiterals && (!isInt<32>(Imm) || !isUInt<32>(Imm)))
O << "lit64(" << formatHex(static_cast<uint64_t>(Imm)) << ')';
else
O << formatHex(static_cast<uint64_t>(Imm));
@@ -719,7 +783,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isReg()) {
- printRegOperand(Op.getReg(), O, MRI);
+ printRegOperand(Op.getReg(), MI->getOpcode(), OpNo, O, MRI);
// Check if operand register class contains register used.
// Intention: print disassembler message when invalid code is decoded,
@@ -750,12 +814,12 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
break;
case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
- printImmediate64(Op.getImm(), STI, O, false);
+ printImmediate64(Desc, Op.getImm(), STI, O, false);
break;
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
- printImmediate64(Op.getImm(), STI, O, true);
+ printImmediate64(Desc, Op.getImm(), STI, O, true);
break;
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
case AMDGPU::OPERAND_REG_IMM_INT16:
@@ -793,22 +857,6 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
// custom printer.
llvm_unreachable("unexpected immediate operand type");
}
- } else if (Op.isDFPImm()) {
- double Value = bit_cast<double>(Op.getDFPImm());
- // We special case 0.0 because otherwise it will be printed as an integer.
- if (Value == 0.0)
- O << "0.0";
- else {
- const MCInstrDesc &Desc = MII.get(MI->getOpcode());
- int RCID = Desc.operands()[OpNo].RegClass;
- unsigned RCBits = AMDGPU::getRegBitWidth(MRI.getRegClass(RCID));
- if (RCBits == 32)
- printImmediate32(llvm::bit_cast<uint32_t>((float)Value), STI, O);
- else if (RCBits == 64)
- printImmediate64(llvm::bit_cast<uint64_t>(Value), STI, O, true);
- else
- llvm_unreachable("Invalid register class size");
- }
} else if (Op.isExpr()) {
const MCExpr *Exp = Op.getExpr();
MAI.printExpr(O, *Exp);
@@ -891,7 +939,7 @@ void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI,
if (OpNo + 1 < MI->getNumOperands() &&
(InputModifiers & SISrcMods::ABS) == 0) {
const MCOperand &Op = MI->getOperand(OpNo + 1);
- NegMnemo = Op.isImm() || Op.isDFPImm();
+ NegMnemo = Op.isImm();
}
if (NegMnemo) {
O << "neg(";
@@ -1146,7 +1194,7 @@ void AMDGPUInstPrinter::printExpSrcN(const MCInst *MI, unsigned OpNo,
OpNo = OpNo - N + N / 2;
if (En & (1 << N))
- printRegOperand(MI->getOperand(OpNo).getReg(), O, MRI);
+ printRegOperand(MI->getOperand(OpNo).getReg(), Opc, OpNo, O, MRI);
else
O << "off";
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index be32061c6453..21cc2f229de9 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -35,6 +35,8 @@ public:
const MCSubtargetInfo &STI, raw_ostream &O) override;
static void printRegOperand(MCRegister Reg, raw_ostream &O,
const MCRegisterInfo &MRI);
+ void printRegOperand(MCRegister Reg, unsigned Opc, unsigned OpNo,
+ raw_ostream &O, const MCRegisterInfo &MRI);
private:
void printU16ImmOperand(const MCInst *MI, unsigned OpNo,
@@ -70,7 +72,7 @@ private:
void printSymbolicFormat(const MCInst *MI,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printRegOperand(unsigned RegNo, raw_ostream &O);
+ void printRegOperand(MCRegister Reg, raw_ostream &O);
void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printVINTRPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -87,8 +89,10 @@ private:
raw_ostream &O);
void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI,
- raw_ostream &O, bool IsFP);
+ void printImmediate64(const MCInstrDesc &Desc, uint64_t Imm,
+ const MCSubtargetInfo &STI, raw_ostream &O, bool IsFP);
+ void printLiteral64(const MCInstrDesc &Desc, uint64_t Imm,
+ const MCSubtargetInfo &STI, raw_ostream &O, bool IsFP);
void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printRegularOperand(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index 61f673221739..fd65f95334f7 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -88,7 +88,7 @@ private:
/// Encode an fp or int literal.
std::optional<uint64_t>
- getLitEncoding(const MCOperand &MO, const MCOperandInfo &OpInfo,
+ getLitEncoding(const MCInstrDesc &Desc, const MCOperand &MO, unsigned OpNo,
const MCSubtargetInfo &STI,
bool HasMandatoryLiteral = false) const;
@@ -219,8 +219,8 @@ static uint32_t getLit16IntEncoding(uint32_t Val, const MCSubtargetInfo &STI) {
return getLit32Encoding(Val, STI);
}
-static uint32_t getLit64Encoding(uint64_t Val, const MCSubtargetInfo &STI,
- bool IsFP) {
+static uint32_t getLit64Encoding(const MCInstrDesc &Desc, uint64_t Val,
+ const MCSubtargetInfo &STI, bool IsFP) {
uint32_t IntImm = getIntInlineImmEncoding(static_cast<int64_t>(Val));
if (IntImm != 0)
return IntImm;
@@ -253,29 +253,27 @@ static uint32_t getLit64Encoding(uint64_t Val, const MCSubtargetInfo &STI,
STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
return 248;
- // The rest part needs to align with AMDGPUInstPrinter::printImmediate64.
+ // The rest part needs to align with AMDGPUInstPrinter::printLiteral64.
+ bool CanUse64BitLiterals =
+ STI.hasFeature(AMDGPU::Feature64BitLiterals) &&
+ !(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P));
if (IsFP) {
- return STI.hasFeature(AMDGPU::Feature64BitLiterals) && Lo_32(Val) ? 254
- : 255;
+ return CanUse64BitLiterals && Lo_32(Val) ? 254 : 255;
}
- return STI.hasFeature(AMDGPU::Feature64BitLiterals) &&
- (!isInt<32>(Val) || !isUInt<32>(Val))
- ? 254
- : 255;
+ return CanUse64BitLiterals && (!isInt<32>(Val) || !isUInt<32>(Val)) ? 254
+ : 255;
}
std::optional<uint64_t> AMDGPUMCCodeEmitter::getLitEncoding(
- const MCOperand &MO, const MCOperandInfo &OpInfo,
+ const MCInstrDesc &Desc, const MCOperand &MO, unsigned OpNo,
const MCSubtargetInfo &STI, bool HasMandatoryLiteral) const {
+ const MCOperandInfo &OpInfo = Desc.operands()[OpNo];
int64_t Imm;
if (MO.isExpr()) {
if (!MO.getExpr()->evaluateAsAbsolute(Imm))
- return (STI.hasFeature(AMDGPU::Feature64BitLiterals) &&
- OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64)
- ? 254
- : 255;
+ return AMDGPU::getOperandSize(OpInfo) == 8 ? 254 : 255;
} else {
assert(!MO.isDFPImm());
@@ -299,14 +297,14 @@ std::optional<uint64_t> AMDGPUMCCodeEmitter::getLitEncoding(
case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
- return getLit64Encoding(static_cast<uint64_t>(Imm), STI, false);
+ return getLit64Encoding(Desc, static_cast<uint64_t>(Imm), STI, false);
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
- return getLit64Encoding(static_cast<uint64_t>(Imm), STI, true);
+ return getLit64Encoding(Desc, static_cast<uint64_t>(Imm), STI, true);
case AMDGPU::OPERAND_REG_IMM_FP64: {
- auto Enc = getLit64Encoding(static_cast<uint64_t>(Imm), STI, true);
+ auto Enc = getLit64Encoding(Desc, static_cast<uint64_t>(Imm), STI, true);
return (HasMandatoryLiteral && Enc == 255) ? 254 : Enc;
}
@@ -405,7 +403,7 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
if (AMDGPU::isGFX10Plus(STI) && isVCMPX64(Desc)) {
assert((Encoding & 0xFF) == 0);
Encoding |= MRI.getEncodingValue(AMDGPU::EXEC_LO) &
- AMDGPU::HWEncoding::REG_IDX_MASK;
+ AMDGPU::HWEncoding::LO256_REG_IDX_MASK;
}
for (unsigned i = 0; i < bytes; i++) {
@@ -447,7 +445,7 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
// Is this operand a literal immediate?
const MCOperand &Op = MI.getOperand(i);
- auto Enc = getLitEncoding(Op, Desc.operands()[i], STI);
+ auto Enc = getLitEncoding(Desc, Op, i, STI);
if (!Enc || (*Enc != 255 && *Enc != 254))
continue;
@@ -521,7 +519,7 @@ void AMDGPUMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
return;
} else {
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
- auto Enc = getLitEncoding(MO, Desc.operands()[OpNo], STI);
+ auto Enc = getLitEncoding(Desc, MO, OpNo, STI);
if (Enc && *Enc != 255) {
Op = *Enc | SDWA9EncValues::SRC_SGPR_MASK;
return;
@@ -554,7 +552,7 @@ void AMDGPUMCCodeEmitter::getAVOperandEncoding(
SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const {
MCRegister Reg = MI.getOperand(OpNo).getReg();
unsigned Enc = MRI.getEncodingValue(Reg);
- unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
+ unsigned Idx = Enc & AMDGPU::HWEncoding::LO256_REG_IDX_MASK;
bool IsVGPROrAGPR =
Enc & (AMDGPU::HWEncoding::IS_VGPR | AMDGPU::HWEncoding::IS_AGPR);
@@ -596,7 +594,7 @@ void AMDGPUMCCodeEmitter::getMachineOpValue(const MCInst &MI,
const MCSubtargetInfo &STI) const {
if (MO.isReg()){
unsigned Enc = MRI.getEncodingValue(MO.getReg());
- unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
+ unsigned Idx = Enc & AMDGPU::HWEncoding::LO256_REG_IDX_MASK;
bool IsVGPROrAGPR =
Enc & (AMDGPU::HWEncoding::IS_VGPR | AMDGPU::HWEncoding::IS_AGPR);
Op = Idx | (IsVGPROrAGPR << 8);
@@ -659,7 +657,7 @@ void AMDGPUMCCodeEmitter::getMachineOpValueT16Lo128(
const MCOperand &MO = MI.getOperand(OpNo);
if (MO.isReg()) {
uint16_t Encoding = MRI.getEncodingValue(MO.getReg());
- unsigned RegIdx = Encoding & AMDGPU::HWEncoding::REG_IDX_MASK;
+ unsigned RegIdx = Encoding & AMDGPU::HWEncoding::LO256_REG_IDX_MASK;
bool IsHi = Encoding & AMDGPU::HWEncoding::IS_HI16;
bool IsVGPR = Encoding & AMDGPU::HWEncoding::IS_VGPR;
assert((!IsVGPR || isUInt<7>(RegIdx)) && "VGPR0-VGPR127 expected!");
@@ -695,11 +693,8 @@ void AMDGPUMCCodeEmitter::getMachineOpValueCommon(
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
uint32_t Offset = Desc.getSize();
assert(Offset == 4 || Offset == 8);
- auto OpType = Desc.operands()[OpNo].OperandType;
- MCFixupKind Kind = (STI.hasFeature(AMDGPU::Feature64BitLiterals) &&
- OpType == AMDGPU::OPERAND_REG_IMM_INT64)
- ? FK_Data_8
- : FK_Data_4;
+ unsigned Size = AMDGPU::getOperandSize(Desc, OpNo);
+ MCFixupKind Kind = MCFixup::getDataKindForSize(Size);
addFixup(Fixups, Offset, MO.getExpr(), Kind, PCRel);
}
@@ -707,8 +702,7 @@ void AMDGPUMCCodeEmitter::getMachineOpValueCommon(
if (AMDGPU::isSISrcOperand(Desc, OpNo)) {
bool HasMandatoryLiteral =
AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::imm);
- if (auto Enc = getLitEncoding(MO, Desc.operands()[OpNo], STI,
- HasMandatoryLiteral)) {
+ if (auto Enc = getLitEncoding(Desc, MO, OpNo, STI, HasMandatoryLiteral)) {
Op = *Enc;
return;
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index d66725d3a6c4..90c56f690146 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -21,9 +21,9 @@
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCInstPrinter.h"
-#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCObjectWriter.h"
@@ -130,31 +130,35 @@ static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
std::move(Emitter));
}
-namespace {
-
-class AMDGPUMCInstrAnalysis : public MCInstrAnalysis {
-public:
- explicit AMDGPUMCInstrAnalysis(const MCInstrInfo *Info)
- : MCInstrAnalysis(Info) {}
-
- bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
- uint64_t &Target) const override {
- if (Inst.getNumOperands() == 0 || !Inst.getOperand(0).isImm() ||
- Info->get(Inst.getOpcode()).operands()[0].OperandType !=
- MCOI::OPERAND_PCREL)
- return false;
+namespace llvm {
+namespace AMDGPU {
+
+bool AMDGPUMCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
+ uint64_t Size,
+ uint64_t &Target) const {
+ if (Inst.getNumOperands() == 0 || !Inst.getOperand(0).isImm() ||
+ Info->get(Inst.getOpcode()).operands()[0].OperandType !=
+ MCOI::OPERAND_PCREL)
+ return false;
+
+ int64_t Imm = Inst.getOperand(0).getImm();
+ // Our branches take a simm16.
+ Target = SignExtend64<16>(Imm) * 4 + Addr + Size;
+ return true;
+}
- int64_t Imm = Inst.getOperand(0).getImm();
- // Our branches take a simm16.
- Target = SignExtend64<16>(Imm) * 4 + Addr + Size;
- return true;
- }
-};
+void AMDGPUMCInstrAnalysis::updateState(const MCInst &Inst, uint64_t Addr) {
+ if (Inst.getOpcode() == AMDGPU::S_SET_VGPR_MSB_gfx12)
+ VgprMSBs = Inst.getOperand(0).getImm();
+ else if (isTerminator(Inst))
+ VgprMSBs = 0;
+}
-} // end anonymous namespace
+} // end namespace AMDGPU
+} // end namespace llvm
static MCInstrAnalysis *createAMDGPUMCInstrAnalysis(const MCInstrInfo *Info) {
- return new AMDGPUMCInstrAnalysis(Info);
+ return new AMDGPU::AMDGPUMCInstrAnalysis(Info);
}
extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index 9c0b2da0fcb0..986388414096 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -15,6 +15,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCTARGETDESC_H
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCTARGETDESC_H
+#include "llvm/MC/MCInstrAnalysis.h"
#include <cstdint>
#include <memory>
@@ -44,6 +45,28 @@ MCAsmBackend *createAMDGPUAsmBackend(const Target &T,
std::unique_ptr<MCObjectTargetWriter>
createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
bool HasRelocationAddend);
+
+namespace AMDGPU {
+class AMDGPUMCInstrAnalysis : public MCInstrAnalysis {
+private:
+ unsigned VgprMSBs = 0;
+
+public:
+ explicit AMDGPUMCInstrAnalysis(const MCInstrInfo *Info)
+ : MCInstrAnalysis(Info) {}
+
+ bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
+ uint64_t &Target) const override;
+
+ void resetState() override { VgprMSBs = 0; }
+
+ void updateState(const MCInst &Inst, uint64_t Addr) override;
+
+ unsigned getVgprMSBs() const { return VgprMSBs; }
+};
+
+} // namespace AMDGPU
+
} // namespace llvm
#define GET_REGINFO_ENUM
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 0bbab29dbda1..ff6a21239345 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -448,11 +448,6 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
".amdhsa_user_sgpr_private_segment_size");
- if (isGFX1250(STI))
- PrintField(KD.kernel_code_properties,
- amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES_SHIFT,
- amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES,
- ".amdhsa_uses_cu_stores");
if (IVersion.Major >= 10)
PrintField(KD.kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT,
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index ff5321df6452..bf787b230067 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -420,7 +420,7 @@ class VSAMPLE_gfx12<int op, dag outs, int num_addrs, string dns="",
}
class MIMG_NoSampler_Helper <mimgopc op, string asm,
- RegisterClass dst_rc,
+ RegisterOperand dst_rc,
RegisterClass addr_rc,
string dns="">
: MIMG_gfx6789 <op.GFX10M, (outs dst_rc:$vdata), dns> {
@@ -433,10 +433,10 @@ class MIMG_NoSampler_Helper <mimgopc op, string asm,
}
class MIMG_NoSampler_Helper_gfx90a <mimgopc op, string asm,
- RegisterClass dst_rc,
+ RegisterOperand dst_rc,
RegisterClass addr_rc,
string dns="">
- : MIMG_gfx90a <op.GFX10M, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> {
+ : MIMG_gfx90a <op.GFX10M, (outs getAlign2RegOp<dst_rc>.ret:$vdata), dns> {
let InOperandList = !con((ins addr_rc:$vaddr, SReg_256_XNULL:$srsrc,
DMask:$dmask, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, LWE:$lwe, DA:$da),
@@ -446,7 +446,7 @@ class MIMG_NoSampler_Helper_gfx90a <mimgopc op, string asm,
}
class MIMG_NoSampler_gfx10<mimgopc op, string opcode,
- RegisterClass DataRC, RegisterClass AddrRC,
+ RegisterOperand DataRC, RegisterClass AddrRC,
string dns="">
: MIMG_gfx10<op.GFX10M, (outs DataRC:$vdata), dns> {
let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask,
@@ -458,7 +458,7 @@ class MIMG_NoSampler_gfx10<mimgopc op, string opcode,
}
class MIMG_NoSampler_nsa_gfx10<mimgopc op, string opcode,
- RegisterClass DataRC, int num_addrs,
+ RegisterOperand DataRC, int num_addrs,
string dns="">
: MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdata), num_addrs, dns> {
let InOperandList = !con(AddrIns,
@@ -471,7 +471,7 @@ class MIMG_NoSampler_nsa_gfx10<mimgopc op, string opcode,
}
class MIMG_NoSampler_gfx11<mimgopc op, string opcode,
- RegisterClass DataRC, RegisterClass AddrRC,
+ RegisterOperand DataRC, RegisterClass AddrRC,
string dns="">
: MIMG_gfx11<op.GFX11, (outs DataRC:$vdata), dns> {
let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask,
@@ -483,7 +483,7 @@ class MIMG_NoSampler_gfx11<mimgopc op, string opcode,
}
class MIMG_NoSampler_nsa_gfx11<mimgopc op, string opcode,
- RegisterClass DataRC, int num_addrs,
+ RegisterOperand DataRC, int num_addrs,
string dns="">
: MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdata), num_addrs, dns> {
let InOperandList = !con(AddrIns,
@@ -496,7 +496,7 @@ class MIMG_NoSampler_nsa_gfx11<mimgopc op, string opcode,
}
class VIMAGE_NoSampler_gfx12<mimgopc op, string opcode,
- RegisterClass DataRC, int num_addrs,
+ RegisterOperand DataRC, int num_addrs,
string dns="">
: VIMAGE_gfx12<op.GFX12, (outs DataRC:$vdata), num_addrs, dns> {
let InOperandList = !con(AddrIns,
@@ -507,7 +507,7 @@ class VIMAGE_NoSampler_gfx12<mimgopc op, string opcode,
#!if(BaseOpcode.HasD16, "$d16", "");
}
-class VSAMPLE_Sampler_gfx12<mimgopc op, string opcode, RegisterClass DataRC,
+class VSAMPLE_Sampler_gfx12<mimgopc op, string opcode, RegisterOperand DataRC,
int num_addrs, RegisterClass Addr3RC = VGPR_32,
string dns="">
: VSAMPLE_gfx12<op.GFX12, (outs DataRC:$vdata), num_addrs, dns, Addr3RC> {
@@ -544,7 +544,7 @@ class VSAMPLE_Sampler_nortn_gfx12<mimgopc op, string opcode,
}
multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
- RegisterClass dst_rc, bit enableDisasm,
+ RegisterOperand dst_rc, bit enableDisasm,
bit ExtendedImageInst = 1,
bit isVSample = 0> {
let VAddrDwords = 1 in {
@@ -578,7 +578,7 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
if op.HAS_GFX10M then {
def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>;
if !not(ExtendedImageInst) then
- def _V2_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_64>;
+ def _V2_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_64_Align2>;
def _V2_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_64>;
def _V2_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 2>;
}
@@ -602,7 +602,7 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
if op.HAS_GFX10M then {
def _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>;
if !not(ExtendedImageInst) then
- def _V3_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_96>;
+ def _V3_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_96_Align2>;
def _V3_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_96>;
def _V3_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 3>;
}
@@ -626,7 +626,7 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
if op.HAS_GFX10M then {
def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>;
if !not(ExtendedImageInst) then
- def _V4_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_128>;
+ def _V4_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_128_Align2>;
def _V4_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_128>;
def _V4_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 4,
!if(enableDisasm, "GFX10", "")>;
@@ -664,20 +664,20 @@ multiclass MIMG_NoSampler <mimgopc op, string asm, bit has_d16, bit mip = 0,
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME),
mayLoad = !not(isResInfo) in {
let VDataDwords = 1 in
- defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1, msaa>;
+ defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, AVLdSt_32, 1, msaa>;
let VDataDwords = 2 in
- defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 0, msaa>;
+ defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, AVLdSt_64, 0, msaa>;
let VDataDwords = 3 in
- defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0, msaa>;
+ defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, AVLdSt_96, 0, msaa>;
let VDataDwords = 4 in
- defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0, msaa>;
+ defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, AVLdSt_128, 0, msaa>;
let VDataDwords = 5 in
- defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, VReg_160, 0, msaa>;
+ defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, AVLdSt_160, 0, msaa>;
}
}
class MIMG_Store_Helper <mimgopc op, string asm,
- RegisterClass data_rc,
+ RegisterOperand data_rc,
RegisterClass addr_rc,
string dns = "">
: MIMG_gfx6789<op.GFX10M, (outs), dns> {
@@ -690,11 +690,11 @@ class MIMG_Store_Helper <mimgopc op, string asm,
}
class MIMG_Store_Helper_gfx90a <mimgopc op, string asm,
- RegisterClass data_rc,
+ RegisterOperand data_rc,
RegisterClass addr_rc,
string dns = "">
: MIMG_gfx90a<op.GFX10M, (outs), dns> {
- let InOperandList = !con((ins getLdStRegisterOperand<data_rc>.ret:$vdata,
+ let InOperandList = !con((ins getAlign2RegOp<data_rc>.ret:$vdata,
addr_rc:$vaddr, SReg_256_XNULL:$srsrc,
DMask:$dmask, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, LWE:$lwe, DA:$da),
@@ -704,7 +704,7 @@ class MIMG_Store_Helper_gfx90a <mimgopc op, string asm,
}
class MIMG_Store_gfx10<mimgopc op, string opcode,
- RegisterClass DataRC, RegisterClass AddrRC,
+ RegisterOperand DataRC, RegisterClass AddrRC,
string dns="">
: MIMG_gfx10<op.GFX10M, (outs), dns> {
let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc,
@@ -716,7 +716,7 @@ class MIMG_Store_gfx10<mimgopc op, string opcode,
}
class MIMG_Store_nsa_gfx10<mimgopc op, string opcode,
- RegisterClass DataRC, int num_addrs,
+ RegisterOperand DataRC, int num_addrs,
string dns="">
: MIMG_nsa_gfx10<op.GFX10M, (outs), num_addrs, dns> {
let InOperandList = !con((ins DataRC:$vdata),
@@ -730,7 +730,7 @@ class MIMG_Store_nsa_gfx10<mimgopc op, string opcode,
}
class MIMG_Store_gfx11<mimgopc op, string opcode,
- RegisterClass DataRC, RegisterClass AddrRC,
+ RegisterOperand DataRC, RegisterClass AddrRC,
string dns="">
: MIMG_gfx11<op.GFX11, (outs), dns> {
let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc,
@@ -742,7 +742,7 @@ class MIMG_Store_gfx11<mimgopc op, string opcode,
}
class MIMG_Store_nsa_gfx11<mimgopc op, string opcode,
- RegisterClass DataRC, int num_addrs,
+ RegisterOperand DataRC, int num_addrs,
string dns="">
: MIMG_nsa_gfx11<op.GFX11, (outs), num_addrs, dns> {
let InOperandList = !con((ins DataRC:$vdata),
@@ -756,7 +756,7 @@ class MIMG_Store_nsa_gfx11<mimgopc op, string opcode,
}
class VIMAGE_Store_gfx12<mimgopc op, string opcode,
- RegisterClass DataRC, int num_addrs,
+ RegisterOperand DataRC, int num_addrs,
string dns="">
: VIMAGE_gfx12<op.GFX12, (outs), num_addrs, dns> {
let InOperandList = !con((ins DataRC:$vdata),
@@ -769,7 +769,7 @@ class VIMAGE_Store_gfx12<mimgopc op, string opcode,
}
multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm,
- RegisterClass data_rc,
+ RegisterOperand data_rc,
bit enableDisasm> {
let mayLoad = 0, mayStore = 1, hasSideEffects = 0, hasPostISelHook = 0,
DisableWQM = 1 in {
@@ -797,7 +797,7 @@ multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm,
let ssamp = 0 in {
if op.HAS_GFX10M then {
def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>;
- def _V2_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_64>;
+ def _V2_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_64_Align2>;
def _V2_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_64>;
def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 2>;
}
@@ -814,7 +814,7 @@ multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm,
let ssamp = 0 in {
if op.HAS_GFX10M then {
def _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>;
- def _V3_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_96>;
+ def _V3_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_96_Align2>;
def _V3_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_96>;
def _V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 3>;
}
@@ -831,7 +831,7 @@ multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm,
let ssamp = 0 in {
if op.HAS_GFX10M then {
def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>;
- def _V4_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_128>;
+ def _V4_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_128_Align2>;
def _V4_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_128>;
def _V4_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 4,
!if(enableDisasm, "GFX10", "")>;
@@ -860,19 +860,19 @@ multiclass MIMG_Store <mimgopc op, string asm, bit has_d16, bit mip = 0> {
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in {
let VDataDwords = 1 in
- defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>;
+ defm _V1 : MIMG_Store_Addr_Helper <op, asm, AVLdSt_32, 1>;
let VDataDwords = 2 in
- defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 0>;
+ defm _V2 : MIMG_Store_Addr_Helper <op, asm, AVLdSt_64, 0>;
let VDataDwords = 3 in
- defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 0>;
+ defm _V3 : MIMG_Store_Addr_Helper <op, asm, AVLdSt_96, 0>;
let VDataDwords = 4 in
- defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 0>;
+ defm _V4 : MIMG_Store_Addr_Helper <op, asm, AVLdSt_128, 0>;
let VDataDwords = 5 in
- defm _V5 : MIMG_Store_Addr_Helper <op, asm, VReg_160, 0>;
+ defm _V5 : MIMG_Store_Addr_Helper <op, asm, AVLdSt_160, 0>;
}
}
-class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterClass data_rc,
+class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterOperand data_rc,
RegisterClass addr_rc, string dns="">
: MIMG_gfx6789 <op, (outs data_rc:$vdst), dns> {
let Constraints = "$vdst = $vdata";
@@ -883,33 +883,33 @@ class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterClass data_rc,
let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da";
}
-class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterClass data_rc,
+class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterOperand data_rc,
RegisterClass addr_rc, string dns="">
- : MIMG_gfx90a <op, (outs getLdStRegisterOperand<data_rc>.ret:$vdst), dns> {
+ : MIMG_gfx90a <op, (outs getAlign2RegOp<data_rc>.ret:$vdst), dns> {
let Constraints = "$vdst = $vdata";
- let InOperandList = (ins getLdStRegisterOperand<data_rc>.ret:$vdata,
+ let InOperandList = (ins getAlign2RegOp<data_rc>.ret:$vdata,
addr_rc:$vaddr, SReg_256_XNULL:$srsrc,
DMask:$dmask, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, LWE:$lwe, DA:$da);
let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da";
}
-class MIMG_Atomic_si<mimgopc op, string asm, RegisterClass data_rc,
+class MIMG_Atomic_si<mimgopc op, string asm, RegisterOperand data_rc,
RegisterClass addr_rc, bit enableDasm = 0>
: MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc,
!if(enableDasm, "GFX6GFX7", "")> {
let AssemblerPredicate = isGFX6GFX7;
}
-class MIMG_Atomic_vi<mimgopc op, string asm, RegisterClass data_rc,
+class MIMG_Atomic_vi<mimgopc op, string asm, RegisterOperand data_rc,
RegisterClass addr_rc, bit enableDasm = 0>
: MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX8", "")> {
let AssemblerPredicate = isGFX8GFX9NotGFX90A;
let MIMGEncoding = MIMGEncGfx8;
}
-class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterClass data_rc,
+class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterOperand data_rc,
RegisterClass addr_rc, bit enableDasm = 0>
: MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX90A", "")> {
let AssemblerPredicate = isGFX90APlus;
@@ -917,7 +917,7 @@ class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterClass data_rc,
}
class MIMG_Atomic_gfx10<mimgopc op, string opcode,
- RegisterClass DataRC, RegisterClass AddrRC,
+ RegisterOperand DataRC, RegisterClass AddrRC,
bit enableDisasm = 0>
: MIMG_gfx10<op.GFX10M, (outs DataRC:$vdst),
!if(enableDisasm, "GFX10", "")> {
@@ -930,7 +930,7 @@ class MIMG_Atomic_gfx10<mimgopc op, string opcode,
}
class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode,
- RegisterClass DataRC, int num_addrs,
+ RegisterOperand DataRC, int num_addrs,
bit enableDisasm = 0>
: MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdst), num_addrs,
!if(enableDisasm, "GFX10", "")> {
@@ -945,7 +945,7 @@ class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode,
}
class MIMG_Atomic_gfx11<mimgopc op, string opcode,
- RegisterClass DataRC, RegisterClass AddrRC,
+ RegisterOperand DataRC, RegisterClass AddrRC,
bit enableDisasm = 0>
: MIMG_gfx11<op.GFX11, (outs DataRC:$vdst),
!if(enableDisasm, "GFX11", "")> {
@@ -958,7 +958,7 @@ class MIMG_Atomic_gfx11<mimgopc op, string opcode,
}
class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode,
- RegisterClass DataRC, int num_addrs,
+ RegisterOperand DataRC, int num_addrs,
bit enableDisasm = 0>
: MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdst), num_addrs,
!if(enableDisasm, "GFX11", "")> {
@@ -972,7 +972,7 @@ class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode,
let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
}
-class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterClass DataRC,
+class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterOperand DataRC,
int num_addrs, string renamed, bit enableDisasm = 0>
: VIMAGE_gfx12<op.GFX12, (outs DataRC:$vdst), num_addrs,
!if(enableDisasm, "GFX12", "")> {
@@ -987,7 +987,7 @@ class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterClass DataRC,
}
multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
- RegisterClass data_rc,
+ RegisterOperand data_rc,
bit enableDasm = 0,
bit isFP = 0,
string renamed = ""> {
@@ -1022,7 +1022,7 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
}
if op.HAS_VI then {
def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>;
- def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64, 0>;
+ def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, 0>;
}
if op.HAS_GFX10M then {
def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>;
@@ -1044,7 +1044,7 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
}
if op.HAS_VI then {
def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>;
- def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96, 0>;
+ def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, 0>;
}
if op.HAS_GFX10M then {
def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>;
@@ -1066,7 +1066,7 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
}
if op.HAS_VI then {
def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>;
- def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128, 0>;
+ def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, 0>;
}
if op.HAS_GFX10M then {
def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>;
@@ -1105,19 +1105,19 @@ multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0,
// Other variants are reconstructed by disassembler using dmask and tfe.
if !not(isCmpSwap) then {
let VDataDwords = 1 in
- defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, VGPR_32, 1, isFP, renamed>;
+ defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, renamed>;
}
let VDataDwords = 2 in
- defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, VReg_64, isCmpSwap, isFP, renamed>;
+ defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, renamed>;
let VDataDwords = 3 in
- defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, VReg_96, 0, isFP, renamed>;
+ defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, renamed>;
if isCmpSwap then {
let VDataDwords = 4 in
- defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, VReg_128, 0, isFP, renamed>;
+ defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, renamed>;
let VDataDwords = 5 in
- defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, VReg_160, 0, isFP, renamed>;
+ defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, renamed>;
}
}
} // End IsAtomicRet = 1
@@ -1127,7 +1127,7 @@ multiclass MIMG_Atomic_Renamed <mimgopc op, string asm, string renamed,
bit isCmpSwap = 0, bit isFP = 0>
: MIMG_Atomic <op, asm, isCmpSwap, isFP, renamed>;
-class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterClass dst_rc,
+class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterOperand dst_rc,
RegisterClass src_rc, string dns="">
: MIMG_gfx6789 <op.VI, (outs dst_rc:$vdata), dns> {
let InOperandList = !con((ins src_rc:$vaddr, SReg_256_XNULL:$srsrc, SReg_128_XNULL:$ssamp,
@@ -1138,9 +1138,9 @@ class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterClass dst_rc,
#!if(BaseOpcode.HasD16, "$d16", "");
}
-class MIMG_Sampler_gfx90a<mimgopc op, string asm, RegisterClass dst_rc,
+class MIMG_Sampler_gfx90a<mimgopc op, string asm, RegisterOperand dst_rc,
RegisterClass src_rc, string dns="">
- : MIMG_gfx90a<op.GFX10M, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> {
+ : MIMG_gfx90a<op.GFX10M, (outs dst_rc:$vdata), dns> {
let InOperandList = !con((ins src_rc:$vaddr, SReg_256_XNULL:$srsrc, SReg_128_XNULL:$ssamp,
DMask:$dmask, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, LWE:$lwe, DA:$da),
@@ -1164,7 +1164,7 @@ class MIMG_Sampler_Asm_gfx10p<string opcode, string AsmPrefix, bit HasD16> {
}
class MIMG_Sampler_gfx10<mimgopc op, string opcode,
- RegisterClass DataRC, RegisterClass AddrRC,
+ RegisterOperand DataRC, RegisterClass AddrRC,
string dns="">
: MIMG_gfx10<op.GFX10M, (outs DataRC:$vdata), dns> {
let InOperandList = MIMG_Sampler_OpList_gfx10p<(ins AddrRC:$vaddr0), BaseOpcode.HasD16>.ret;
@@ -1172,7 +1172,7 @@ class MIMG_Sampler_gfx10<mimgopc op, string opcode,
}
class MIMG_Sampler_nsa_gfx10<mimgopc op, string opcode,
- RegisterClass DataRC, int num_addrs,
+ RegisterOperand DataRC, int num_addrs,
string dns="">
: MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdata), num_addrs, dns> {
let InOperandList = MIMG_Sampler_OpList_gfx10p<AddrIns, BaseOpcode.HasD16>.ret;
@@ -1200,7 +1200,7 @@ class MIMG_Sampler_nortn_nsa_gfx10<mimgopc op, string opcode,
}
class MIMG_Sampler_gfx11<mimgopc op, string opcode,
- RegisterClass DataRC, RegisterClass AddrRC,
+ RegisterOperand DataRC, RegisterClass AddrRC,
string dns="">
: MIMG_gfx11<op.GFX11, (outs DataRC:$vdata), dns> {
let InOperandList = MIMG_Sampler_OpList_gfx10p<(ins AddrRC:$vaddr0), BaseOpcode.HasD16>.ret;
@@ -1208,7 +1208,7 @@ class MIMG_Sampler_gfx11<mimgopc op, string opcode,
}
class MIMG_Sampler_nsa_gfx11<mimgopc op, string opcode,
- RegisterClass DataRC, int num_addrs,
+ RegisterOperand DataRC, int num_addrs,
RegisterClass LastVAddrSize, string dns="">
: MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdata), num_addrs, dns, [],
LastVAddrSize> {
@@ -1345,7 +1345,7 @@ class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample, bit isG16,
}
multiclass MIMG_Sampler_Src_Helper <mimgopc op, string asm,
- AMDGPUSampleVariant sample, RegisterClass dst_rc,
+ AMDGPUSampleVariant sample, RegisterOperand dst_rc,
bit enableDisasm = 0,
bit ExtendedImageInst = 1, bit isG16 = 0> {
foreach addr = MIMG_Sampler_AddrSizes<sample, isG16>.MachineInstrs in {
@@ -1473,15 +1473,15 @@ multiclass MIMG_Sampler <mimgopc op, AMDGPUSampleVariant sample, bit isPointSamp
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
mayLoad = !not(isGetLod) in {
let VDataDwords = 1 in
- defm _V1 : MIMG_Sampler_Src_Helper<op, asm, sample, VGPR_32, 1, ExtendedImageInst, isG16>;
+ defm _V1 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_32, 1, ExtendedImageInst, isG16>;
let VDataDwords = 2 in
- defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64, 0, ExtendedImageInst, isG16>;
+ defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_64, 0, ExtendedImageInst, isG16>;
let VDataDwords = 3 in
- defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96, 0, ExtendedImageInst, isG16>;
+ defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_96, 0, ExtendedImageInst, isG16>;
let VDataDwords = 4 in
- defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 0, ExtendedImageInst, isG16>;
+ defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_128, 0, ExtendedImageInst, isG16>;
let VDataDwords = 5 in
- defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160, 0, ExtendedImageInst, isG16>;
+ defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_160, 0, ExtendedImageInst, isG16>;
}
if !not(isGetLod) then
@@ -1501,11 +1501,11 @@ multiclass MIMG_Gather <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
Gather4 = 1 in {
let VDataDwords = 2 in
- defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64, /*enableDisasm*/ true>; /* for packed D16 only */
+ defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_64, /*enableDisasm*/ true>; /* for packed D16 only */
let VDataDwords = 4 in
- defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
+ defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_128>;
let VDataDwords = 5 in
- defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>;
+ defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_160>;
}
}
@@ -1632,13 +1632,13 @@ multiclass MIMG_MSAA_Load <mimgopc op, string asm> {
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME),
Gather4 = 1, hasPostISelHook = 0, mayLoad = 1 in {
let VDataDwords = 2 in
- defm _V2 : MIMG_NoSampler_Src_Helper<op, asm, VReg_64, 0, 0, 1>; /* packed D16 */
+ defm _V2 : MIMG_NoSampler_Src_Helper<op, asm, AVLdSt_64, 0, 0, 1>; /* packed D16 */
let VDataDwords = 3 in
- defm _V3 : MIMG_NoSampler_Src_Helper<op, asm, VReg_96, 0, 0, 1>; /* packed D16 + tfe */
+ defm _V3 : MIMG_NoSampler_Src_Helper<op, asm, AVLdSt_96, 0, 0, 1>; /* packed D16 + tfe */
let VDataDwords = 4 in
- defm _V4 : MIMG_NoSampler_Src_Helper<op, asm, VReg_128, 1, 0, 1>;
+ defm _V4 : MIMG_NoSampler_Src_Helper<op, asm, AVLdSt_128, 1, 0, 1>;
let VDataDwords = 5 in
- defm _V5 : MIMG_NoSampler_Src_Helper<op, asm, VReg_160, 0, 0, 1>;
+ defm _V5 : MIMG_NoSampler_Src_Helper<op, asm, AVLdSt_160, 0, 0, 1>;
}
}
diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
index 8d27153fcfcd..3e256cce97af 100644
--- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -29,7 +29,7 @@ using namespace llvm;
#include "R600GenInstrInfo.inc"
R600InstrInfo::R600InstrInfo(const R600Subtarget &ST)
- : R600GenInstrInfo(-1, -1), RI(), ST(ST) {}
+ : R600GenInstrInfo(ST, -1, -1), RI(), ST(ST) {}
bool R600InstrInfo::isVector(const MachineInstr &MI) const {
return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR;
diff --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td
index f82bd55beccc..dda0cf6a3218 100644
--- a/llvm/lib/Target/AMDGPU/R600Instructions.td
+++ b/llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -123,7 +123,6 @@ class R600_1OP <bits<11> inst, string opName, list<dag> pattern,
let HasNativeOperands = 1;
let Op1 = 1;
let ALUInst = 1;
- let DisableEncoding = "$literal";
let UseNamedOperandTable = 1;
let Inst{31-0} = Word0;
@@ -161,7 +160,6 @@ class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
let HasNativeOperands = 1;
let Op2 = 1;
let ALUInst = 1;
- let DisableEncoding = "$literal";
let UseNamedOperandTable = 1;
let Inst{31-0} = Word0;
@@ -201,7 +199,6 @@ class R600_3OP <bits<5> inst, string opName, list<dag> pattern,
R600ALU_Word1_OP3<inst>{
let HasNativeOperands = 1;
- let DisableEncoding = "$literal";
let Op3 = 1;
let UseNamedOperandTable = 1;
let ALUInst = 1;
@@ -1783,7 +1780,7 @@ def : DwordAddrPat <i32, R600_Reg32>;
def getLDSNoRetOp : InstrMapping {
let FilterClass = "R600_LDS_1A1D";
let RowFields = ["BaseOp"];
- let ColFields = ["DisableEncoding"];
- let KeyCol = ["$dst"];
- let ValueCols = [[""""]];
+ let ColFields = ["usesCustomInserter"];
+ let KeyCol = ["1"];
+ let ValueCols = [["0"]];
}
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 268b153c6c92..ecc4659ee0e8 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -237,16 +237,16 @@ enum OperandType : unsigned {
OPERAND_REG_INLINE_AC_FP32,
OPERAND_REG_INLINE_AC_FP64,
+ // Operand for AV_MOV_B64_IMM_PSEUDO, which is a pair of 32-bit inline
+ // constants. Does not accept registers.
+ OPERAND_INLINE_C_AV64_PSEUDO,
+
// Operand for source modifiers for VOP instructions
OPERAND_INPUT_MODS,
// Operand for SDWA instructions
OPERAND_SDWA_VOPC_DST,
- // Operand for AV_MOV_B64_IMM_PSEUDO, which is a pair of 32-bit inline
- // constants.
- OPERAND_INLINE_C_AV64_PSEUDO,
-
OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32,
OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_V2FP32,
@@ -254,7 +254,7 @@ enum OperandType : unsigned {
OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_AC_FP64,
OPERAND_REG_INLINE_AC_FIRST = OPERAND_REG_INLINE_AC_INT32,
- OPERAND_REG_INLINE_AC_LAST = OPERAND_REG_INLINE_AC_FP64,
+ OPERAND_REG_INLINE_AC_LAST = OPERAND_INLINE_C_AV64_PSEUDO,
OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32,
OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST,
@@ -354,10 +354,11 @@ enum : unsigned {
// Register codes as defined in the TableGen's HWEncoding field.
namespace HWEncoding {
enum : unsigned {
- REG_IDX_MASK = 0xff,
- IS_VGPR = 1 << 8,
- IS_AGPR = 1 << 9,
- IS_HI16 = 1 << 10,
+ REG_IDX_MASK = 0x3ff,
+ LO256_REG_IDX_MASK = 0xff,
+ IS_VGPR = 1 << 10,
+ IS_AGPR = 1 << 11,
+ IS_HI16 = 1 << 12,
};
} // namespace HWEncoding
@@ -457,6 +458,8 @@ enum Id { // Message ID, width(4) [3:0].
ID_RTN_GET_TBA_TO_PC = 134,
ID_RTN_GET_SE_AID_ID = 135,
+ ID_RTN_GET_CLUSTER_BARRIER_STATE = 136, // added in GFX1250
+
ID_MASK_PreGFX11_ = 0xF,
ID_MASK_GFX11Plus_ = 0xFF
};
@@ -572,7 +575,17 @@ enum ModeRegisterMasks : uint32_t {
GPR_IDX_EN_MASK = 1 << 27,
VSKIP_MASK = 1 << 28,
- CSP_MASK = 0x7u << 29 // Bits 29..31
+ CSP_MASK = 0x7u << 29, // Bits 29..31
+
+ // GFX1250
+ DST_VGPR_MSB = 1 << 12,
+ SRC0_VGPR_MSB = 1 << 13,
+ SRC1_VGPR_MSB = 1 << 14,
+ SRC2_VGPR_MSB = 1 << 15,
+ VGPR_MSB_MASK = 0xf << 12, // Bits 12..15
+
+ REPLAY_MODE = 1 << 25,
+ FLAT_SCRATCH_IS_NV = 1 << 26,
};
} // namespace Hwreg
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index dce4e6f99300..6533d4c8eca3 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -627,6 +627,9 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
TRI = ST.getRegisterInfo();
TII = ST.getInstrInfo();
+ // Instructions to re-legalize after changing register classes
+ SmallVector<MachineInstr *, 8> Relegalize;
+
for (MachineBasicBlock &MBB : MF) {
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
++I) {
@@ -634,6 +637,11 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
switch (MI.getOpcode()) {
default:
+ // scale_src has a register class restricted to low 256 VGPRs, changing
+ // registers to VGPR may not take it into account.
+ if (TII->isWMMA(MI) &&
+ AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::scale_src0))
+ Relegalize.push_back(&MI);
continue;
case AMDGPU::COPY: {
const TargetRegisterClass *SrcRC, *DstRC;
@@ -791,6 +799,9 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
for (auto *MI : PHINodes) {
processPHINode(*MI);
}
+ while (!Relegalize.empty())
+ TII->legalizeOperands(*Relegalize.pop_back_val(), MDT);
+
if (MF.getTarget().getOptLevel() > CodeGenOptLevel::None && EnableM0Merge)
hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 962c276bc212..5297816ec1f2 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -173,6 +173,7 @@ struct FoldCandidate {
class SIFoldOperandsImpl {
public:
+ MachineFunction *MF;
MachineRegisterInfo *MRI;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
@@ -705,6 +706,36 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
}
MachineOperand *New = Fold.Def.OpToFold;
+
+ // Verify the register is compatible with the operand.
+ if (const TargetRegisterClass *OpRC =
+ TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI, *MF)) {
+ const TargetRegisterClass *OldRC = MRI->getRegClass(Old.getReg());
+ const TargetRegisterClass *NewRC = MRI->getRegClass(New->getReg());
+ unsigned NewSubReg = New->getSubReg();
+ unsigned OldSubReg = Old.getSubReg();
+
+ const TargetRegisterClass *ConstrainRC = OpRC;
+ if (NewSubReg && OldSubReg) {
+ unsigned PreA, PreB;
+ ConstrainRC = TRI->getCommonSuperRegClass(OpRC, OldSubReg, NewRC,
+ NewSubReg, PreA, PreB);
+ } else if (OldSubReg) {
+ ConstrainRC = TRI->getMatchingSuperRegClass(OldRC, OpRC, OldSubReg);
+ } else if (NewSubReg) {
+ ConstrainRC = TRI->getMatchingSuperRegClass(NewRC, OpRC, NewSubReg);
+ }
+
+ if (!ConstrainRC)
+ return false;
+
+ if (!MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
+ LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI)
+ << TRI->getRegClassName(ConstrainRC) << '\n');
+ return false;
+ }
+ }
+
// Rework once the VS_16 register class is updated to include proper
// 16-bit SGPRs instead of 32-bit ones.
if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
@@ -1248,6 +1279,7 @@ void SIFoldOperandsImpl::foldOperand(
if (FoldingImmLike && UseMI->isCopy()) {
Register DestReg = UseMI->getOperand(0).getReg();
Register SrcReg = UseMI->getOperand(1).getReg();
+ unsigned UseSubReg = UseMI->getOperand(1).getSubReg();
assert(SrcReg.isVirtual());
const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
@@ -1259,63 +1291,74 @@ void SIFoldOperandsImpl::foldOperand(
return;
const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
- if (!DestReg.isPhysical() && DestRC == &AMDGPU::AGPR_32RegClass) {
- std::optional<int64_t> UseImmVal = OpToFold.getEffectiveImmVal();
- if (UseImmVal && TII->isInlineConstant(
- *UseImmVal, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
- UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
- UseMI->getOperand(1).ChangeToImmediate(*UseImmVal);
- CopiesToReplace.push_back(UseMI);
- return;
+ // In order to fold immediates into copies, we need to change the copy to a
+ // MOV. Find a compatible mov instruction with the value.
+ for (unsigned MovOp :
+ {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
+ AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64,
+ AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO,
+ AMDGPU::AV_MOV_B64_IMM_PSEUDO}) {
+ const MCInstrDesc &MovDesc = TII->get(MovOp);
+ assert(MovDesc.getNumDefs() > 0 && MovDesc.operands()[0].RegClass != -1);
+
+ const TargetRegisterClass *MovDstRC =
+ TRI->getRegClass(MovDesc.operands()[0].RegClass);
+
+ // Fold if the destination register class of the MOV instruction (MovDstRC)
+ // is a superclass of (or equal to) the destination register class of the
+ // COPY (DestRC). If this condition fails, folding would be illegal.
+ if (!DestRC->hasSuperClassEq(MovDstRC))
+ continue;
+
+ const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
+ const TargetRegisterClass *MovSrcRC =
+ TRI->getRegClass(MovDesc.operands()[SrcIdx].RegClass);
+ if (MovSrcRC) {
+ if (UseSubReg)
+ MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);
+ if (!MRI->constrainRegClass(SrcReg, MovSrcRC))
+ break;
+
+ // FIXME: This is mutating the instruction only and deferring the actual
+ // fold of the immediate
+ } else {
+ // For the _IMM_PSEUDO cases, there can be value restrictions on the
+ // immediate to verify. Technically we should always verify this, but it
+ // only matters for these concrete cases.
+ // TODO: Handle non-imm case if it's useful.
+ if (!OpToFold.isImm() ||
+ !TII->isImmOperandLegal(MovDesc, 1, *OpToFold.getEffectiveImmVal()))
+ break;
}
- }
- // Allow immediates COPYd into sgpr_lo16 to be further folded while
- // still being legal if not further folded
- if (DestRC == &AMDGPU::SGPR_LO16RegClass) {
- assert(ST->useRealTrue16Insts());
- MRI->setRegClass(DestReg, &AMDGPU::SGPR_32RegClass);
- DestRC = &AMDGPU::SGPR_32RegClass;
+ MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
+ MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
+ while (ImpOpI != ImpOpE) {
+ MachineInstr::mop_iterator Tmp = ImpOpI;
+ ImpOpI++;
+ UseMI->removeOperand(UseMI->getOperandNo(Tmp));
+ }
+ UseMI->setDesc(MovDesc);
+
+ if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
+ const auto &SrcOp = UseMI->getOperand(UseOpIdx);
+ MachineOperand NewSrcOp(SrcOp);
+ MachineFunction *MF = UseMI->getParent()->getParent();
+ UseMI->removeOperand(1);
+ UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
+ UseMI->addOperand(NewSrcOp); // src0
+ UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
+ UseOpIdx = SrcIdx;
+ UseOp = &UseMI->getOperand(UseOpIdx);
+ }
+ CopiesToReplace.push_back(UseMI);
+ break;
}
- // In order to fold immediates into copies, we need to change the
- // copy to a MOV.
-
- unsigned MovOp = TII->getMovOpcode(DestRC);
- if (MovOp == AMDGPU::COPY)
- return;
-
- // Fold if the destination register class of the MOV instruction (ResRC)
- // is a superclass of (or equal to) the destination register class of the
- // COPY (DestRC). If this condition fails, folding would be illegal.
- const MCInstrDesc &MovDesc = TII->get(MovOp);
- assert(MovDesc.getNumDefs() > 0 && MovDesc.operands()[0].RegClass != -1);
- const TargetRegisterClass *ResRC =
- TRI->getRegClass(MovDesc.operands()[0].RegClass);
- if (!DestRC->hasSuperClassEq(ResRC))
+ // We failed to replace the copy, so give up.
+ if (UseMI->getOpcode() == AMDGPU::COPY)
return;
- MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
- MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
- while (ImpOpI != ImpOpE) {
- MachineInstr::mop_iterator Tmp = ImpOpI;
- ImpOpI++;
- UseMI->removeOperand(UseMI->getOperandNo(Tmp));
- }
- UseMI->setDesc(TII->get(MovOp));
-
- if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
- const auto &SrcOp = UseMI->getOperand(UseOpIdx);
- MachineOperand NewSrcOp(SrcOp);
- MachineFunction *MF = UseMI->getParent()->getParent();
- UseMI->removeOperand(1);
- UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
- UseMI->addOperand(NewSrcOp); // src0
- UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
- UseOpIdx = 2;
- UseOp = &UseMI->getOperand(UseOpIdx);
- }
- CopiesToReplace.push_back(UseMI);
} else {
if (UseMI->isCopy() && OpToFold.isReg() &&
UseMI->getOperand(0).getReg().isVirtual() &&
@@ -1430,30 +1473,9 @@ void SIFoldOperandsImpl::foldOperand(
return;
}
- if (!FoldingImmLike) {
- if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
- // Don't fold if OpToFold doesn't hold an aligned register.
- const TargetRegisterClass *RC =
- TRI->getRegClassForReg(*MRI, OpToFold.getReg());
- assert(RC);
- if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
- unsigned SubReg = OpToFold.getSubReg();
- if (const TargetRegisterClass *SubRC =
- TRI->getSubRegisterClass(RC, SubReg))
- RC = SubRC;
- }
-
- if (!RC || !TRI->isProperlyAlignedRC(*RC))
- return;
- }
-
- tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
-
- // FIXME: We could try to change the instruction from 64-bit to 32-bit
- // to enable more folding opportunities. The shrink operands pass
- // already does this.
- return;
- }
+ // FIXME: We could try to change the instruction from 64-bit to 32-bit
+ // to enable more folding opportunities. The shrink operands pass
+ // already does this.
tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
}
@@ -1931,8 +1953,10 @@ bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
// Direct copy from SGPR to AGPR is not possible on gfx908. To avoid
// creation of exploded copies SGPR->VGPR->AGPR in the copyPhysReg()
// later, create a copy here and track if we already have such a copy.
- if (TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg) !=
- VGPRUseSubRC) {
+ const TargetRegisterClass *SubRC =
+ TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg);
+ if (!VGPRUseSubRC->hasSubClassEq(SubRC)) {
+ // TODO: Try to reconstrain class
VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);
BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), VGPRCopy).add(*Def);
B.addReg(VGPRCopy);
@@ -2748,6 +2772,7 @@ bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
}
bool SIFoldOperandsImpl::run(MachineFunction &MF) {
+ this->MF = &MF;
MRI = &MF.getRegInfo();
ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 9b348d46fec4..ce25bf499c41 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1132,9 +1132,18 @@ void SIFrameLowering::emitCSRSpillRestores(
RestoreWWMRegisters(WWMCalleeSavedRegs);
// The original EXEC is the first operand of the return instruction.
- const MachineInstr &Return = MBB.instr_back();
- assert(Return.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN &&
- "Unexpected return inst");
+ MachineInstr &Return = MBB.instr_back();
+ unsigned Opcode = Return.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
+ Opcode = AMDGPU::SI_RETURN;
+ break;
+ case AMDGPU::SI_TCRETURN_GFX_WholeWave:
+ Opcode = AMDGPU::SI_TCRETURN_GFX;
+ break;
+ default:
+ llvm_unreachable("Unexpected return inst");
+ }
Register OrigExec = Return.getOperand(0).getReg();
if (!WWMScratchRegs.empty()) {
@@ -1148,6 +1157,11 @@ void SIFrameLowering::emitCSRSpillRestores(
// Restore original EXEC.
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addReg(OrigExec);
+
+ // Drop the first operand and update the opcode.
+ Return.removeOperand(0);
+ Return.setDesc(TII->get(Opcode));
+
return;
}
@@ -1728,7 +1742,9 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
"Whole wave functions can use the reg mapped for their i1 argument");
// FIXME: Be more efficient!
- for (MCRegister Reg : AMDGPU::VGPR_32RegClass)
+ unsigned NumArchVGPRs = ST.has1024AddressableVGPRs() ? 1024 : 256;
+ for (MCRegister Reg :
+ AMDGPU::VGPR_32RegClass.getRegisters().take_front(NumArchVGPRs))
if (MF.getRegInfo().isPhysRegModified(Reg)) {
MFI->reserveWWMRegister(Reg);
MF.begin()->addLiveIn(Reg);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 66c1dfc71c2f..2a977247bc2c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1263,22 +1263,61 @@ MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
static unsigned getIntrMemWidth(unsigned IntrID) {
switch (IntrID) {
case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
case Intrinsic::amdgcn_global_store_async_from_lds_b8:
return 8;
case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
return 32;
case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
return 64;
case Intrinsic::amdgcn_global_load_async_to_lds_b128:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
case Intrinsic::amdgcn_global_store_async_from_lds_b128:
+ case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
return 128;
default:
llvm_unreachable("Unknown width");
}
}
+static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
+ TargetLoweringBase::IntrinsicInfo &Info) {
+ Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
+ unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
+ switch (AtomicOrderingCABI(Ord)) {
+ case AtomicOrderingCABI::acquire:
+ Info.order = AtomicOrdering::Acquire;
+ break;
+ case AtomicOrderingCABI::release:
+ Info.order = AtomicOrdering::Release;
+ break;
+ case AtomicOrderingCABI::seq_cst:
+ Info.order = AtomicOrdering::SequentiallyConsistent;
+ break;
+ default:
+ Info.order = AtomicOrdering::Monotonic;
+ break;
+ }
+
+ Info.flags =
+ (IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore);
+ Info.flags |= MOCooperative;
+
+ MDNode *ScopeMD = cast<MDNode>(
+ cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
+ StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
+ Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
+}
+
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
@@ -1506,6 +1545,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_global_load_monitor_b32:
case Intrinsic::amdgcn_global_load_monitor_b64:
case Intrinsic::amdgcn_global_load_monitor_b128:
+ case Intrinsic::amdgcn_cluster_load_b32:
+ case Intrinsic::amdgcn_cluster_load_b64:
+ case Intrinsic::amdgcn_cluster_load_b128:
case Intrinsic::amdgcn_ds_load_tr6_b96:
case Intrinsic::amdgcn_ds_load_tr4_b64:
case Intrinsic::amdgcn_ds_load_tr8_b64:
@@ -1525,6 +1567,26 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOLoad;
return true;
}
+ case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
+ case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
+ case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
+ Info.ptrVal = CI.getOperand(0);
+ Info.align.reset();
+ getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
+ return true;
+ }
+ case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
+ Info.ptrVal = CI.getArgOperand(0);
+ Info.align.reset();
+ getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
+ return true;
+ }
case Intrinsic::amdgcn_ds_gws_init:
case Intrinsic::amdgcn_ds_gws_barrier:
case Intrinsic::amdgcn_ds_gws_sema_v:
@@ -1553,7 +1615,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_global_load_async_to_lds_b8:
case Intrinsic::amdgcn_global_load_async_to_lds_b32:
case Intrinsic::amdgcn_global_load_async_to_lds_b64:
- case Intrinsic::amdgcn_global_load_async_to_lds_b128: {
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
Info.ptrVal = CI.getArgOperand(1);
@@ -1636,6 +1702,9 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
Value *Ptr = nullptr;
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_cluster_load_b128:
+ case Intrinsic::amdgcn_cluster_load_b64:
+ case Intrinsic::amdgcn_cluster_load_b32:
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume:
case Intrinsic::amdgcn_ds_load_tr8_b64:
@@ -1678,6 +1747,10 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
case Intrinsic::amdgcn_global_load_async_to_lds_b32:
case Intrinsic::amdgcn_global_load_async_to_lds_b64:
case Intrinsic::amdgcn_global_load_async_to_lds_b128:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
Ptr = II->getArgOperand(1);
break;
default:
@@ -4260,6 +4333,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
break;
}
+ // If the caller is a whole wave function, we need to use a special opcode
+ // so we can patch up EXEC.
+ if (Info->isWholeWaveFunction())
+ OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
+
return DAG.getNode(OPC, DL, MVT::Other, Ops);
}
@@ -5192,7 +5270,58 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
return LoopBB;
}
-static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
+/// Expands an S_ADD_U64_PSEUDO / S_SUB_U64_PSEUDO instruction \p MI in place.
+///
+/// If the subtarget reports hasScalarAddSub64(), a single S_ADD_U64 or
+/// S_SUB_U64 is emitted. Otherwise the operation is split into two 32-bit
+/// scalar instructions (S_ADD_U32/S_SUB_U32 for the low half, then
+/// S_ADDC_U32/S_SUBB_U32 for the high half) whose results are joined with a
+/// REG_SEQUENCE. In both cases \p MI is erased.
+///
+/// \returns \p BB, unchanged.
+static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
+                                                      MachineBasicBlock *BB) {
+  MachineFunction *MF = BB->getParent();
+  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineOperand &Dst = MI.getOperand(0);
+  MachineOperand &LHS = MI.getOperand(1);
+  MachineOperand &RHS = MI.getOperand(2);
+  const bool IsAdd = MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO;
+
+  if (ST.hasScalarAddSub64()) {
+    // A native 64-bit scalar add/sub exists: emit it directly.
+    const unsigned Opc64 = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
+    BuildMI(*BB, MI, DL, TII->get(Opc64), Dst.getReg()).add(LHS).add(RHS);
+    MI.eraseFromParent();
+    return BB;
+  }
+
+  // No 64-bit scalar add/sub: fall back to a pair of 32-bit operations.
+  const TargetRegisterClass *BoolRC = ST.getRegisterInfo()->getBoolRC();
+  const TargetRegisterClass *HalfRC = &AMDGPU::SReg_32RegClass;
+
+  Register DstLo = MRI.createVirtualRegister(HalfRC);
+  Register DstHi = MRI.createVirtualRegister(HalfRC);
+
+  // Yields the requested 32-bit half of a 64-bit register or immediate.
+  auto ExtractHalf = [&](MachineOperand &Op, unsigned SubIdx) {
+    return TII->buildExtractSubRegOrImm(MI, MRI, Op, BoolRC, SubIdx, HalfRC);
+  };
+  MachineOperand LHSLo = ExtractHalf(LHS, AMDGPU::sub0);
+  MachineOperand LHSHi = ExtractHalf(LHS, AMDGPU::sub1);
+  MachineOperand RHSLo = ExtractHalf(RHS, AMDGPU::sub0);
+  MachineOperand RHSHi = ExtractHalf(RHS, AMDGPU::sub1);
+
+  // The low half must be emitted first; the high-half opcode follows it.
+  const unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
+  const unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+  BuildMI(*BB, MI, DL, TII->get(LoOpc), DstLo).add(LHSLo).add(RHSLo);
+  BuildMI(*BB, MI, DL, TII->get(HiOpc), DstHi).add(LHSHi).add(RHSHi);
+
+  // Reassemble the 64-bit result from its two halves.
+  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dst.getReg())
+      .addReg(DstLo)
+      .addImm(AMDGPU::sub0)
+      .addReg(DstHi)
+      .addImm(AMDGPU::sub1);
+
+  MI.eraseFromParent();
+  return BB;
+}
+
+static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
switch (Opc) {
case AMDGPU::S_MIN_U32:
return std::numeric_limits<uint32_t>::max();
@@ -5210,10 +5339,42 @@ static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
case AMDGPU::S_AND_B32:
return std::numeric_limits<uint32_t>::max();
default:
- llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
+ llvm_unreachable(
+ "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
}
}
+/// Returns the 64-bit value used to seed the accumulator of a wave reduction
+/// identified by \p Opc. The opcode must be one of the cases listed below;
+/// any other opcode is treated as unreachable.
+static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
+  switch (Opc) {
+  // Unsigned minimum and bitwise AND start from all ones.
+  case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
+  case AMDGPU::S_AND_B64:
+    return std::numeric_limits<uint64_t>::max();
+
+  // Signed minimum starts from the largest signed value.
+  case AMDGPU::V_CMP_LT_I64_e64: // min.i64
+    return std::numeric_limits<int64_t>::max();
+
+  // Signed maximum starts from the smallest signed value.
+  case AMDGPU::V_CMP_GT_I64_e64: // max.i64
+    return std::numeric_limits<int64_t>::min();
+
+  // Unsigned maximum, add, sub, OR and XOR start from zero.
+  case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
+  case AMDGPU::S_ADD_U64_PSEUDO:
+  case AMDGPU::S_SUB_U64_PSEUDO:
+  case AMDGPU::S_OR_B64:
+  case AMDGPU::S_XOR_B64:
+    return std::numeric_limits<uint64_t>::min();
+
+  default:
+    llvm_unreachable(
+        "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
+  }
+}
+
+/// Returns true when \p Opc is one of the 32-bit scalar opcodes listed below
+/// (min/max, add/sub and the bitwise and/or/xor), false for anything else.
+static bool is32bitWaveReduceOperation(unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::S_MIN_U32:
+  case AMDGPU::S_MIN_I32:
+  case AMDGPU::S_MAX_U32:
+  case AMDGPU::S_MAX_I32:
+  case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_SUB_I32:
+  case AMDGPU::S_AND_B32:
+  case AMDGPU::S_OR_B32:
+  case AMDGPU::S_XOR_B32:
+    return true;
+  default:
+    return false;
+  }
+}
+
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
MachineBasicBlock &BB,
const GCNSubtarget &ST,
@@ -5241,53 +5402,99 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
RetBB = &BB;
break;
}
+ case AMDGPU::V_CMP_LT_U64_e64: // umin
+ case AMDGPU::V_CMP_LT_I64_e64: // min
+ case AMDGPU::V_CMP_GT_U64_e64: // umax
+ case AMDGPU::V_CMP_GT_I64_e64: // max
+ case AMDGPU::S_AND_B64:
+ case AMDGPU::S_OR_B64: {
+ // Idempotent operations.
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
+ RetBB = &BB;
+ break;
+ }
case AMDGPU::S_XOR_B32:
+ case AMDGPU::S_XOR_B64:
case AMDGPU::S_ADD_I32:
- case AMDGPU::S_SUB_I32: {
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_I32:
+ case AMDGPU::S_SUB_U64_PSEUDO: {
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
- Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+ Register NumActiveLanes =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
bool IsWave32 = ST.isWave32();
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- unsigned CountReg =
+ unsigned BitCountOpc =
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
- auto Exec =
- BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
+ BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
- auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
- .addReg(Exec->getOperand(0).getReg());
+ auto NewAccumulator =
+ BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
+ .addReg(ExecMask);
switch (Opc) {
- case AMDGPU::S_XOR_B32: {
+ case AMDGPU::S_XOR_B32:
+ case AMDGPU::S_XOR_B64: {
// Performing an XOR operation on a uniform value
// depends on the parity of the number of active lanes.
// For even parity, the result will be 0, for odd
// parity the result will be the same as the input value.
- Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
-
- auto ParityReg =
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
- .addReg(NewAccumulator->getOperand(0).getReg())
- .addImm(1);
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
- .addReg(SrcReg)
- .addReg(ParityReg->getOperand(0).getReg());
+ Register ParityRegister =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+ .addReg(NewAccumulator->getOperand(0).getReg())
+ .addImm(1)
+ .setOperandDead(3); // Dead scc
+ if (Opc == AMDGPU::S_XOR_B32) {
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+ .addReg(SrcReg)
+ .addReg(ParityRegister);
+ } else {
+ Register DestSub0 =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register DestSub1 =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+ const TargetRegisterClass *SrcSubRC =
+ TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+
+ MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+ MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+ MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+ MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+ .add(Op1L)
+ .addReg(ParityRegister);
+
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
+ .add(Op1H)
+ .addReg(ParityRegister);
+
+ BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+ }
break;
}
case AMDGPU::S_SUB_I32: {
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
// Take the negation of the source operand.
- auto InvertedValReg =
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
- .addImm(-1)
- .addReg(SrcReg);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
+ .addImm(0)
+ .addReg(SrcReg);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
- .addReg(InvertedValReg->getOperand(0).getReg())
+ .addReg(NegatedVal)
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
@@ -5297,6 +5504,75 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_U64_PSEUDO: {
+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register Op1H_Op0L_Reg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register Op1L_Op0H_Reg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register NegatedValLo =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register NegatedValHi =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
+ const TargetRegisterClass *Src1SubRC =
+ TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
+
+ MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+ MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
+ MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+ MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
+
+ if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
+ .addImm(0)
+ .addReg(NewAccumulator->getOperand(0).getReg())
+ .setOperandDead(3); // Dead scc
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
+ .addReg(NegatedValLo)
+ .addImm(31)
+ .setOperandDead(3); // Dead scc
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
+ .add(Op1L)
+ .addReg(NegatedValHi);
+ }
+ Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
+ ? NegatedValLo
+ : NewAccumulator->getOperand(0).getReg();
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+ .add(Op1L)
+ .addReg(LowOpcode);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
+ .add(Op1L)
+ .addReg(LowOpcode);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
+ .add(Op1H)
+ .addReg(LowOpcode);
+
+ Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
+ .addReg(CarryReg)
+ .addReg(Op1H_Op0L_Reg)
+ .setOperandDead(3); // Dead scc
+
+ if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
+ .addReg(HiVal)
+ .addReg(Op1L_Op0H_Reg)
+ .setOperandDead(3); // Dead scc
+ }
+ BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+ break;
+ }
}
RetBB = &BB;
}
@@ -5313,6 +5589,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
// so that we will get the next active lane for next iteration.
MachineBasicBlock::iterator I = BB.end();
Register SrcReg = MI.getOperand(1).getReg();
+ bool is32BitOpc = is32bitWaveReduceOperation(Opc);
// Create Control flow for loop
// Split MI's Machine Basic block into For loop
@@ -5322,73 +5599,160 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
- Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
-
+ Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
-
- Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
- Register LaneValueReg =
- MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
bool IsWave32 = ST.isWave32();
- unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
// Create initial values of induction variable from Exec, Accumulator and
// insert branch instr to newly created ComputeBlock
- uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
- auto TmpSReg =
- BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
- BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
- .addImm(InitalValue);
+ BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
+ if (is32BitOpc) {
+ uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
+ BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
+ .addImm(IdentityValue);
+ } else {
+ uint64_t IdentityValue = getIdentityValueFor64BitWaveReduction(Opc);
+ BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
+ .addImm(IdentityValue);
+ }
// clang-format off
BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
.addMBB(ComputeLoop);
// clang-format on
// Start constructing ComputeLoop
- I = ComputeLoop->end();
+ I = ComputeLoop->begin();
auto Accumulator =
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
- .addReg(InitalValReg)
+ .addReg(IdentityValReg)
.addMBB(&BB);
auto ActiveBits =
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
- .addReg(TmpSReg->getOperand(0).getReg())
+ .addReg(LoopIterator)
.addMBB(&BB);
+ I = ComputeLoop->end();
+ MachineInstr *NewAccumulator;
// Perform the computations
unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
- auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
- .addReg(ActiveBits->getOperand(0).getReg());
- auto LaneValue = BuildMI(*ComputeLoop, I, DL,
- TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
- .addReg(SrcReg)
- .addReg(FF1->getOperand(0).getReg());
- auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
- .addReg(Accumulator->getOperand(0).getReg())
- .addReg(LaneValue->getOperand(0).getReg());
-
+ BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+ .addReg(ActiveBitsReg);
+ if (is32BitOpc) {
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+ LaneValueReg)
+ .addReg(SrcReg)
+ .addReg(FF1Reg);
+ NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+ .addReg(Accumulator->getOperand(0).getReg())
+ .addReg(LaneValueReg);
+ } else {
+ Register LaneValueLoReg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register LaneValueHiReg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+ const TargetRegisterClass *SrcSubRC =
+ TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+ MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+ MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+ MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+ MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+ // lane value input should be in an sgpr
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+ LaneValueLoReg)
+ .add(Op1L)
+ .addReg(FF1Reg);
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+ LaneValueHiReg)
+ .add(Op1H)
+ .addReg(FF1Reg);
+ auto LaneValue = BuildMI(*ComputeLoop, I, DL,
+ TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
+ .addReg(LaneValueLoReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(LaneValueHiReg)
+ .addImm(AMDGPU::sub1);
+ switch (Opc) {
+ case AMDGPU::S_OR_B64:
+ case AMDGPU::S_AND_B64:
+ case AMDGPU::S_XOR_B64: {
+ NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+ .addReg(Accumulator->getOperand(0).getReg())
+ .addReg(LaneValue->getOperand(0).getReg())
+ .setOperandDead(3); // Dead scc
+ break;
+ }
+ case AMDGPU::V_CMP_GT_I64_e64:
+ case AMDGPU::V_CMP_GT_U64_e64:
+ case AMDGPU::V_CMP_LT_I64_e64:
+ case AMDGPU::V_CMP_LT_U64_e64: {
+ Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register ComparisonResultReg =
+ MRI.createVirtualRegister(WaveMaskRegClass);
+ const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
+ const TargetRegisterClass *VSubRegClass =
+ TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
+ Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
+ MachineOperand SrcReg0Sub0 =
+ TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
+ VregClass, AMDGPU::sub0, VSubRegClass);
+ MachineOperand SrcReg0Sub1 =
+ TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
+ VregClass, AMDGPU::sub1, VSubRegClass);
+ BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
+ AccumulatorVReg)
+ .add(SrcReg0Sub0)
+ .addImm(AMDGPU::sub0)
+ .add(SrcReg0Sub1)
+ .addImm(AMDGPU::sub1);
+ BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
+ .addReg(LaneValue->getOperand(0).getReg())
+ .addReg(AccumulatorVReg);
+
+ unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+ BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
+ .addReg(LaneMaskReg)
+ .addReg(ActiveBitsReg);
+
+ NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+ TII->get(AMDGPU::S_CSELECT_B64), DstReg)
+ .addReg(LaneValue->getOperand(0).getReg())
+ .addReg(Accumulator->getOperand(0).getReg());
+ break;
+ }
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_U64_PSEUDO: {
+ NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+ .addReg(Accumulator->getOperand(0).getReg())
+ .addReg(LaneValue->getOperand(0).getReg());
+ ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
+ break;
+ }
+ }
+ }
// Manipulate the iterator to get the next active lane
unsigned BITSETOpc =
IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
- auto NewActiveBits =
- BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
- .addReg(FF1->getOperand(0).getReg())
- .addReg(ActiveBits->getOperand(0).getReg());
+ BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
+ .addReg(FF1Reg)
+ .addReg(ActiveBitsReg);
// Add phi nodes
- Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
- .addMBB(ComputeLoop);
- ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
- .addMBB(ComputeLoop);
+ Accumulator.addReg(DstReg).addMBB(ComputeLoop);
+ ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
// Creating branching
unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
- .addReg(NewActiveBits->getOperand(0).getReg())
+ .addReg(NewActiveBitsReg)
.addImm(0);
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
.addMBB(ComputeLoop);
@@ -5410,22 +5774,40 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
switch (MI.getOpcode()) {
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
+ case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
+ case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+ case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+ case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+ case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+ case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+ case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+ case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
+ case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
const DebugLoc &DL = MI.getDebugLoc();
@@ -5452,55 +5834,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
}
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO: {
- // For targets older than GFX12, we emit a sequence of 32-bit operations.
- // For GFX12, we emit s_add_u64 and s_sub_u64.
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- const DebugLoc &DL = MI.getDebugLoc();
- MachineOperand &Dest = MI.getOperand(0);
- MachineOperand &Src0 = MI.getOperand(1);
- MachineOperand &Src1 = MI.getOperand(2);
- bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
- if (Subtarget->hasScalarAddSub64()) {
- unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
- // clang-format off
- BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
- .add(Src0)
- .add(Src1);
- // clang-format on
- } else {
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- const TargetRegisterClass *BoolRC = TRI->getBoolRC();
-
- Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-
- MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
- MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
- MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
- MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
-
- MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
- MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
- MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
- MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
-
- unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
- unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
- BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
- .add(Src0Sub0)
- .add(Src1Sub0);
- BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
- .add(Src0Sub1)
- .add(Src1Sub1);
- BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
- .addReg(DestSub0)
- .addImm(AMDGPU::sub0)
- .addReg(DestSub1)
- .addImm(AMDGPU::sub1);
- }
- MI.eraseFromParent();
- return BB;
+ return Expand64BitScalarArithmetic(MI, BB);
}
case AMDGPU::V_ADD_U64_PSEUDO:
case AMDGPU::V_SUB_U64_PSEUDO: {
@@ -6023,14 +6357,15 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent();
return SplitBB;
}
+ case AMDGPU::SI_TCRETURN_GFX_WholeWave:
case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
assert(MFI->isWholeWaveFunction());
// During ISel, it's difficult to propagate the original EXEC mask to use as
// an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
- Register OriginalExec = Setup->getOperand(0).getReg();
assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
+ Register OriginalExec = Setup->getOperand(0).getReg();
MF->getRegInfo().clearKillFlags(OriginalExec);
MI.getOperand(0).setReg(OriginalExec);
return BB;
@@ -10246,6 +10581,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
return SDValue(NewMI, 0);
}
+ case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
+ case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
+ case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
+ MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
+ SDValue Chain = Op->getOperand(0);
+ SDValue Ptr = Op->getOperand(2);
+ EVT VT = Op->getValueType(0);
+ return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
+ Chain, Ptr, MII->getMemOperand());
+ }
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
@@ -10421,41 +10766,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
}
- case Intrinsic::amdgcn_s_barrier:
- case Intrinsic::amdgcn_s_barrier_signal:
- case Intrinsic::amdgcn_s_barrier_wait: {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
- unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
- if (WGSize <= ST.getWavefrontSize()) {
- // If the workgroup fits in a wave, remove s_barrier_signal and lower
- // s_barrier/s_barrier_wait to wave_barrier.
- if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
- return Op.getOperand(0);
- else
- return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL,
- MVT::Other, Op.getOperand(0)),
- 0);
- }
- }
-
- if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
- // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
- SDValue K =
- DAG.getSignedTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
- SDValue BarSignal =
- SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
- MVT::Other, K, Op.getOperand(0)),
- 0);
- SDValue BarWait =
- SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
- BarSignal.getValue(0)),
- 0);
- return BarWait;
- }
-
- return SDValue();
- };
case Intrinsic::amdgcn_struct_tbuffer_store:
case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
@@ -10913,6 +11223,16 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op->getVTList(), Ops, M->getMemoryVT(),
M->getMemOperand());
}
+ case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
+ MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
+ SDValue Chain = Op->getOperand(0);
+ SDValue Ptr = Op->getOperand(2);
+ SDValue Val = Op->getOperand(3);
+ return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
+ Ptr, MII->getMemOperand());
+ }
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -16933,10 +17253,12 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
switch (BitWidth) {
case 16:
RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
- : &AMDGPU::VGPR_32RegClass;
+ : &AMDGPU::VGPR_32_Lo256RegClass;
break;
default:
- RC = TRI->getVGPRClassForBitWidth(BitWidth);
+ RC = Subtarget->has1024AddressableVGPRs()
+ ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
+ : TRI->getVGPRClassForBitWidth(BitWidth);
if (!RC)
return std::pair(0U, nullptr);
break;
@@ -16980,7 +17302,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
if (Kind != '\0') {
if (Kind == 'v') {
- RC = &AMDGPU::VGPR_32RegClass;
+ RC = &AMDGPU::VGPR_32_Lo256RegClass;
} else if (Kind == 's') {
RC = &AMDGPU::SGPR_32RegClass;
} else if (Kind == 'a') {
@@ -17022,6 +17344,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
return std::pair(0U, nullptr);
if (Idx < RC->getNumRegs())
return std::pair(RC->getRegister(Idx), RC);
+ return std::pair(0U, nullptr);
}
}
@@ -17808,11 +18131,19 @@ static bool flatInstrMayAccessPrivate(const Instruction *I) {
!AMDGPU::hasValueInRangeLikeMetadata(*MD, AMDGPUAS::PRIVATE_ADDRESS);
}
+static TargetLowering::AtomicExpansionKind
+getPrivateAtomicExpansionKind(const GCNSubtarget &STI) {
+ // For GAS, lower to flat atomic.
+ return STI.hasGloballyAddressableScratch()
+ ? TargetLowering::AtomicExpansionKind::CustomExpand
+ : TargetLowering::AtomicExpansionKind::NotAtomic;
+}
+
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
unsigned AS = RMW->getPointerAddressSpace();
if (AS == AMDGPUAS::PRIVATE_ADDRESS)
- return AtomicExpansionKind::NotAtomic;
+ return getPrivateAtomicExpansionKind(*getSubtarget());
// 64-bit flat atomics that dynamically reside in private memory will silently
// be dropped.
@@ -17823,7 +18154,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
if (AS == AMDGPUAS::FLAT_ADDRESS &&
DL.getTypeSizeInBits(RMW->getType()) == 64 &&
flatInstrMayAccessPrivate(RMW))
- return AtomicExpansionKind::Expand;
+ return AtomicExpansionKind::CustomExpand;
auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
OptimizationRemarkEmitter ORE(RMW->getFunction());
@@ -17898,7 +18229,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// does. InstCombine transforms these with 0 to or, so undo that.
if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
ConstVal && ConstVal->isNullValue())
- return AtomicExpansionKind::Expand;
+ return AtomicExpansionKind::CustomExpand;
}
// If the allocation could be in remote, fine-grained memory, the rmw
@@ -18027,9 +18358,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// fadd.
if (Subtarget->hasLDSFPAtomicAddF32()) {
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
- return AtomicExpansionKind::Expand;
+ return AtomicExpansionKind::CustomExpand;
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
- return AtomicExpansionKind::Expand;
+ return AtomicExpansionKind::CustomExpand;
}
}
}
@@ -18083,14 +18414,14 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
- ? AtomicExpansionKind::NotAtomic
+ ? getPrivateAtomicExpansionKind(*getSubtarget())
: AtomicExpansionKind::None;
}
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
- ? AtomicExpansionKind::NotAtomic
+ ? getPrivateAtomicExpansionKind(*getSubtarget())
: AtomicExpansionKind::None;
}
@@ -18098,7 +18429,7 @@ TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
unsigned AddrSpace = CmpX->getPointerAddressSpace();
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
- return AtomicExpansionKind::NotAtomic;
+ return getPrivateAtomicExpansionKind(*getSubtarget());
if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
return AtomicExpansionKind::None;
@@ -18109,7 +18440,7 @@ SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
// If a 64-bit flat atomic may alias private, we need to avoid using the
// atomic in the private case.
- return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
+ return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
: AtomicExpansionKind::None;
}
@@ -18468,9 +18799,24 @@ void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
Builder.CreateBr(ExitBB);
}
+static void convertScratchAtomicToFlatAtomic(Instruction *I,
+ unsigned PtrOpIdx) {
+ Value *PtrOp = I->getOperand(PtrOpIdx);
+ assert(PtrOp->getType()->getPointerAddressSpace() ==
+ AMDGPUAS::PRIVATE_ADDRESS);
+
+ Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
+ Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
+ I->getIterator());
+ I->setOperand(PtrOpIdx, ASCast);
+}
+
void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
AtomicRMWInst::BinOp Op = AI->getOperation();
+ if (AI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ return convertScratchAtomicToFlatAtomic(AI, AI->getPointerOperandIndex());
+
if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
Op == AtomicRMWInst::Xor) {
if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
@@ -18493,9 +18839,28 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
}
void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+ if (CI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ return convertScratchAtomicToFlatAtomic(CI, CI->getPointerOperandIndex());
+
emitExpandAtomicAddrSpacePredicate(CI);
}
+void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
+ if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ return convertScratchAtomicToFlatAtomic(LI, LI->getPointerOperandIndex());
+
+ llvm_unreachable(
+ "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
+}
+
+void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
+ if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
+
+ llvm_unreachable(
+ "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
+}
+
LoadInst *
SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
IRBuilder<> Builder(AI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index dedd9ae17077..728c6490bdfd 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -14,8 +14,8 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H
#define LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H
-#include "AMDGPUISelLowering.h"
#include "AMDGPUArgumentUsageInfo.h"
+#include "AMDGPUISelLowering.h"
#include "llvm/CodeGen/MachineFunction.h"
namespace llvm {
@@ -562,6 +562,8 @@ public:
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const;
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override;
+ void emitExpandAtomicLoad(LoadInst *LI) const override;
+ void emitExpandAtomicStore(StoreInst *SI) const override;
LoadInst *
lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index e3a2efdd3856..b163a274396f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -152,7 +152,7 @@ static constexpr StringLiteral WaitEventTypeName[] = {
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
- SQ_MAX_PGM_VGPRS = 1024, // Maximum programmable VGPRs across all targets.
+ SQ_MAX_PGM_VGPRS = 2048, // Maximum programmable VGPRs across all targets.
AGPR_OFFSET = 512, // Maximum programmable ArchVGPRs across all targets.
SQ_MAX_PGM_SGPRS = 128, // Maximum programmable SGPRs across all targets.
// Artificial register slots to track LDS writes into specific LDS locations
@@ -831,7 +831,6 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST);
unsigned RegIdx = TRI->getHWRegIndex(MCReg);
- assert(isUInt<8>(RegIdx));
const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
unsigned Size = TRI->getRegSizeInBits(*RC);
@@ -839,7 +838,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
// AGPRs/VGPRs are tracked every 16 bits, SGPRs by 32 bits
if (TRI->isVectorRegister(*MRI, Op.getReg())) {
unsigned Reg = RegIdx << 1 | (AMDGPU::isHi16Reg(MCReg, *TRI) ? 1 : 0);
- assert(Reg < AGPR_OFFSET);
+ assert(!Context->ST->hasMAIInsts() || Reg < AGPR_OFFSET);
Result.first = Reg;
if (TRI->isAGPR(*MRI, Op.getReg()))
Result.first += AGPR_OFFSET;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 69708c47f6c9..398c99b3bd12 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -62,8 +62,8 @@ static cl::opt<bool> Fix16BitCopies(
cl::ReallyHidden);
SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
- : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
- RI(ST), ST(ST) {
+ : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
+ RI(ST), ST(ST) {
SchedModel.init(&ST);
}
@@ -2493,7 +2493,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
- case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
case AMDGPU::SI_RETURN: {
const MachineFunction *MF = MBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
@@ -3444,12 +3443,8 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
case AMDGPU::V_ACCVGPR_READ_B32_e64:
case AMDGPU::V_ACCVGPR_MOV_B32:
case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
- return true;
case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
- // TODO: We could fold this, but it's a strange case. The immediate value
- // can't be directly folded into any real use. We would have to spread new
- // immediate legality checks around and only accept subregister extracts for
- // profitability.
+ return true;
default:
return false;
}
@@ -3559,13 +3554,12 @@ static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Register Reg, MachineRegisterInfo *MRI) const {
- if (!MRI->hasOneNonDBGUse(Reg))
- return false;
-
int64_t Imm;
if (!getConstValDefinedInReg(DefMI, Reg, Imm))
return false;
+ const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
+
assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
unsigned Opc = UseMI.getOpcode();
@@ -3577,6 +3571,25 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
+ if (HasMultipleUses) {
+ // TODO: This should fold in more cases with multiple use, but we need to
+ // more carefully consider what those uses are.
+ unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
+
+ // Avoid breaking up a 64-bit inline immediate into a subregister extract.
+ if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
+ return false;
+
+ // Most of the time folding a 32-bit inline constant is free (though this
+ // might not be true if we can't later fold it into a real user).
+ //
+ // FIXME: This isInlineConstant check is imprecise if
+ // getConstValDefinedInReg handled the tricky non-mov cases.
+ if (ImmDefSize == 32 &&
+ !isInlineConstant(Imm, AMDGPU::OPERAND_REG_IMM_INT32))
+ return false;
+ }
+
bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
RI.getSubRegIdxSize(UseSubReg) == 16;
@@ -3664,6 +3677,9 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
return true;
}
+ if (HasMultipleUses)
+ return false;
+
if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
@@ -4572,34 +4588,43 @@ static bool compareMachineOp(const MachineOperand &Op0,
}
}
-bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
- const MachineOperand &MO) const {
- const MCInstrDesc &InstDesc = MI.getDesc();
- const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
-
- assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
-
+bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
+ const MCOperandInfo &OpInfo) const {
if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
return true;
- if (OpInfo.RegClass < 0)
+ if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
return false;
- if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
- if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
- OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
- AMDGPU::OpName::src2))
+ if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
+ return true;
+
+ return ST.hasVOP3Literal();
+}
+
+bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
+ int64_t ImmVal) const {
+ const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
+ if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
+ if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
+ OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
+ AMDGPU::OpName::src2))
return false;
return RI.opCanUseInlineConstant(OpInfo.OperandType);
}
- if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
- return false;
+ return isLiteralOperandLegal(InstDesc, OpInfo);
+}
- if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
- return true;
+bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
+ const MachineOperand &MO) const {
+ if (MO.isImm())
+ return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
- return ST.hasVOP3Literal();
+ assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
+ "unexpected imm-like operand kind");
+ const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
+ return isLiteralOperandLegal(InstDesc, OpInfo);
}
bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
@@ -4759,6 +4784,31 @@ MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
return Inst32;
}
+bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
+ // Null is free
+ Register Reg = RegOp.getReg();
+ if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
+ return false;
+
+ // SGPRs use the constant bus
+
+ // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
+ // physical register operands should also count, except for exec.
+ if (RegOp.isImplicit())
+ return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
+
+ // SGPRs use the constant bus
+ return AMDGPU::SReg_32RegClass.contains(Reg) ||
+ AMDGPU::SReg_64RegClass.contains(Reg);
+}
+
+bool SIInstrInfo::regUsesConstantBus(const MachineOperand &RegOp,
+ const MachineRegisterInfo &MRI) const {
+ Register Reg = RegOp.getReg();
+ return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
+ : physRegUsesConstantBus(RegOp);
+}
+
bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
const MachineOperand &MO,
const MCOperandInfo &OpInfo) const {
@@ -4766,23 +4816,9 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
if (!MO.isReg())
return !isInlineConstant(MO, OpInfo);
- if (!MO.isUse())
- return false;
-
- if (MO.getReg().isVirtual())
- return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
-
- // Null is free
- if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
- return false;
-
- // SGPRs use the constant bus
- if (MO.isImplicit()) {
- return MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
- MO.getReg() == AMDGPU::VCC_LO;
- }
- return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
- AMDGPU::SReg_64RegClass.contains(MO.getReg());
+ Register Reg = MO.getReg();
+ return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
+ : physRegUsesConstantBus(MO);
}
static Register findImplicitSGPRRead(const MachineInstr &MI) {
@@ -4933,7 +4969,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
int RegClass = Desc.operands()[i].RegClass;
- switch (Desc.operands()[i].OperandType) {
+ const MCOperandInfo &OpInfo = Desc.operands()[i];
+ switch (OpInfo.OperandType) {
case MCOI::OPERAND_REGISTER:
if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
ErrInfo = "Illegal immediate value for operand.";
@@ -4941,15 +4978,31 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
break;
case AMDGPU::OPERAND_REG_IMM_INT32:
+ case AMDGPU::OPERAND_REG_IMM_INT64:
+ case AMDGPU::OPERAND_REG_IMM_INT16:
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_IMM_BF16:
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_IMM_FP64:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_V2INT32:
+ case AMDGPU::OPERAND_REG_IMM_V2BF16:
+ break;
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
+ break;
break;
+ case AMDGPU::OPERAND_REG_INLINE_C_INT16:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
- case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
- case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_BF16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP64: {
@@ -4965,6 +5018,10 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
return false;
}
break;
+ case AMDGPU::OPERAND_INPUT_MODS:
+ case AMDGPU::OPERAND_SDWA_VOPC_DST:
+ case AMDGPU::OPERAND_KIMM16:
+ break;
case MCOI::OPERAND_IMMEDIATE:
case AMDGPU::OPERAND_KIMM32:
case AMDGPU::OPERAND_KIMM64:
@@ -4976,9 +5033,15 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
ErrInfo = "Expected immediate, but got non-immediate";
return false;
}
- [[fallthrough]];
+ break;
+ case MCOI::OPERAND_UNKNOWN:
+ case MCOI::OPERAND_MEMORY:
+ case MCOI::OPERAND_PCREL:
+ break;
default:
- continue;
+ if (OpInfo.isGenericType())
+ continue;
+ break;
}
if (!MO.isReg())
@@ -4991,7 +5054,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
// aligned register constraint.
// FIXME: We do not verify inline asm operands, but custom inline asm
// verification is broken anyway
- if (ST.needsAlignedVGPRs()) {
+ if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
if (const TargetRegisterClass *SubRC =
@@ -5912,13 +5975,12 @@ SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {
static const TargetRegisterClass *
adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
- const MachineRegisterInfo &MRI,
const MCInstrDesc &TID, unsigned RCID,
bool IsAllocatable) {
- if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
+ if ((IsAllocatable || !ST.hasGFX90AInsts()) &&
(((TID.mayLoad() || TID.mayStore()) &&
!(TID.TSFlags & SIInstrFlags::Spill)) ||
- (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
+ (TID.TSFlags & SIInstrFlags::MIMG))) {
switch (RCID) {
case AMDGPU::AV_32RegClassID:
RCID = AMDGPU::VGPR_32RegClassID;
@@ -5953,44 +6015,31 @@ const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
if (OpNum >= TID.getNumOperands())
return nullptr;
auto RegClass = TID.operands()[OpNum].RegClass;
- bool IsAllocatable = false;
- if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
- // vdst and vdata should be both VGPR or AGPR, same for the DS instructions
- // with two data operands. Request register class constrained to VGPR only
- // of both operands present as Machine Copy Propagation can not check this
- // constraint and possibly other passes too.
- //
- // The check is limited to FLAT and DS because atomics in non-flat encoding
- // have their vdst and vdata tied to be the same register.
- const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
- AMDGPU::OpName::vdst);
- const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
- (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
- : AMDGPU::OpName::vdata);
- if (DataIdx != -1) {
- IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
- TID.Opcode, AMDGPU::OpName::data1);
- }
+ if (TID.getOpcode() == AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
+ // Special pseudos have no alignment requirement
+ return RI.getRegClass(RegClass);
}
- return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass,
- IsAllocatable);
+
+ return adjustAllocatableRegClass(ST, RI, TID, RegClass, false);
}
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
unsigned OpNo) const {
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
const MCInstrDesc &Desc = get(MI.getOpcode());
if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
Desc.operands()[OpNo].RegClass == -1) {
Register Reg = MI.getOperand(OpNo).getReg();
- if (Reg.isVirtual())
+ if (Reg.isVirtual()) {
+ const MachineRegisterInfo &MRI =
+ MI.getParent()->getParent()->getRegInfo();
return MRI.getRegClass(Reg);
+ }
return RI.getPhysRegBaseClass(Reg);
}
unsigned RCID = Desc.operands()[OpNo].RegClass;
- return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true);
+ return adjustAllocatableRegClass(ST, RI, Desc, RCID, true);
}
void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
@@ -6224,15 +6273,14 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
continue;
const MachineOperand &Op = MI.getOperand(i);
if (Op.isReg()) {
- RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
- if (!SGPRsUsed.count(SGPR) &&
- // FIXME: This can access off the end of the operands() array.
- usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) {
- if (--ConstantBusLimit <= 0)
- return false;
- SGPRsUsed.insert(SGPR);
+ if (Op.isUse()) {
+ RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
+ if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
+ if (--ConstantBusLimit <= 0)
+ return false;
+ }
}
- } else if (AMDGPU::isSISrcOperand(InstDesc, i) &&
+ } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
!isInlineConstant(Op, InstDesc.operands()[i])) {
// The same literal may be used multiple times.
if (!UsedLiteral)
@@ -6526,6 +6574,21 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
!RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
legalizeOpWithMove(MI, VOP3Idx[2]);
+ if (isWMMA(MI)) {
+ // scale_src has a register class restricted to low 256 VGPRs, we may need
+ // to insert a copy to the restricted VGPR class.
+ int ScaleSrc0Idx =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src0);
+ if (ScaleSrc0Idx != -1) {
+ int ScaleSrc1Idx =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src1);
+ if (!isOperandLegal(MI, ScaleSrc0Idx))
+ legalizeOpWithMove(MI, ScaleSrc0Idx);
+ if (!isOperandLegal(MI, ScaleSrc1Idx))
+ legalizeOpWithMove(MI, ScaleSrc1Idx);
+ }
+ }
+
// Fix the register class of packed FP32 instructions on gfx12+. See
// SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
@@ -8036,12 +8099,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
MRI.replaceRegWith(DstReg, NewDstReg);
MRI.clearKillFlags(NewDstReg);
Inst.getOperand(0).setReg(DstReg);
- // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
- // these are deleted later, but at -O0 it would leave a suspicious
- // looking illegal copy of an undef register.
- for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
- Inst.removeOperand(I);
- Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
+ Inst.eraseFromParent();
// Legalize t16 operand since replaceReg is called after addUsersToVALU
for (MachineOperand &MO :
make_early_inc_range(MRI.use_operands(NewDstReg))) {
@@ -9235,6 +9293,9 @@ Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
AMDGPU::OpName OperandName) const {
+ if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
+ return nullptr;
+
int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
if (Idx == -1)
return nullptr;
@@ -9532,6 +9593,7 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
{
{MONoClobber, "amdgpu-noclobber"},
{MOLastUse, "amdgpu-last-use"},
+ {MOCooperative, "amdgpu-cooperative"},
};
return ArrayRef(TargetFlags);
@@ -10219,7 +10281,7 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
InstructionUniformity
SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
- unsigned opcode = MI.getOpcode();
+ unsigned Opcode = MI.getOpcode();
auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
@@ -10239,7 +10301,7 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
// If the target supports globally addressable scratch, the mapping from
// scratch memory to the flat aperture changes therefore an address space cast
// is no longer uniform.
- if (opcode == TargetOpcode::G_ADDRSPACE_CAST)
+ if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
return HandleAddrSpaceCast(MI);
if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
@@ -10267,7 +10329,8 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
//
// All other loads are not divergent, because if threads issue loads with the
// same arguments, they will always get the same result.
- if (opcode == AMDGPU::G_LOAD) {
+ if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
+ Opcode == AMDGPU::G_SEXTLOAD) {
if (MI.memoperands_empty())
return InstructionUniformity::NeverUniform; // conservative assumption
@@ -10281,10 +10344,10 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
return InstructionUniformity::Default;
}
- if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
- opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
- opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
- AMDGPU::isGenericAtomic(opcode)) {
+ if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
+ Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
+ Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
+ AMDGPU::isGenericAtomic(Opcode)) {
return InstructionUniformity::NeverUniform;
}
return InstructionUniformity::Default;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index fdbd9ce4a66b..f7dde2b90b68 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -48,6 +48,10 @@ static const MachineMemOperand::Flags MONoClobber =
static const MachineMemOperand::Flags MOLastUse =
MachineMemOperand::MOTargetFlag2;
+/// Mark the MMO of cooperative load/store atomics.
+static const MachineMemOperand::Flags MOCooperative =
+ MachineMemOperand::MOTargetFlag3;
+
/// Utility to store machine instructions worklist.
struct SIInstrWorklist {
SIInstrWorklist() = default;
@@ -533,13 +537,13 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::VOP2;
}
- static bool isVOP3(const MachineInstr &MI) {
- return MI.getDesc().TSFlags & SIInstrFlags::VOP3;
+ static bool isVOP3(const MCInstrDesc &Desc) {
+ return Desc.TSFlags & SIInstrFlags::VOP3;
}
- bool isVOP3(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::VOP3;
- }
+ static bool isVOP3(const MachineInstr &MI) { return isVOP3(MI.getDesc()); }
+
+ bool isVOP3(uint16_t Opcode) const { return isVOP3(get(Opcode)); }
static bool isSDWA(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::SDWA;
@@ -841,13 +845,13 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::VINTRP;
}
- static bool isMAI(const MachineInstr &MI) {
- return MI.getDesc().TSFlags & SIInstrFlags::IsMAI;
+ static bool isMAI(const MCInstrDesc &Desc) {
+ return Desc.TSFlags & SIInstrFlags::IsMAI;
}
- bool isMAI(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::IsMAI;
- }
+ static bool isMAI(const MachineInstr &MI) { return isMAI(MI.getDesc()); }
+
+ bool isMAI(uint16_t Opcode) const { return isMAI(get(Opcode)); }
static bool isMFMA(const MachineInstr &MI) {
return isMAI(MI) && MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
@@ -983,13 +987,19 @@ public:
return MI.getDesc().TSFlags & SIInstrFlags::IsNeverUniform;
}
- bool isBarrier(unsigned Opcode) const {
+ // Check to see if opcode is for a barrier start. Pre gfx12 this is just the
+ // S_BARRIER, but after support for S_BARRIER_SIGNAL* / S_BARRIER_WAIT we want
+ // to check for the barrier start (S_BARRIER_SIGNAL*)
+ bool isBarrierStart(unsigned Opcode) const {
return Opcode == AMDGPU::S_BARRIER ||
Opcode == AMDGPU::S_BARRIER_SIGNAL_M0 ||
Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0 ||
Opcode == AMDGPU::S_BARRIER_SIGNAL_IMM ||
- Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
- Opcode == AMDGPU::S_BARRIER_WAIT ||
+ Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM;
+ }
+
+ bool isBarrier(unsigned Opcode) const {
+ return isBarrierStart(Opcode) || Opcode == AMDGPU::S_BARRIER_WAIT ||
Opcode == AMDGPU::S_BARRIER_INIT_M0 ||
Opcode == AMDGPU::S_BARRIER_INIT_IMM ||
Opcode == AMDGPU::S_BARRIER_JOIN_IMM ||
@@ -1045,6 +1055,8 @@ public:
return AMDGPU::S_WAIT_DSCNT;
case AMDGPU::S_WAIT_KMCNT_soft:
return AMDGPU::S_WAIT_KMCNT;
+ case AMDGPU::S_WAIT_XCNT_soft:
+ return AMDGPU::S_WAIT_XCNT;
default:
return Opcode;
}
@@ -1174,9 +1186,20 @@ public:
return isInlineConstant(*MO.getParent(), MO.getOperandNo());
}
- bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
+ bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
const MachineOperand &MO) const;
+ bool isLiteralOperandLegal(const MCInstrDesc &InstDesc,
+ const MCOperandInfo &OpInfo) const;
+
+ bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
+ int64_t ImmVal) const;
+
+ bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
+ const MachineOperand &MO) const {
+ return isImmOperandLegal(MI.getDesc(), OpNo, MO);
+ }
+
/// Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isLegalAV64PseudoImm(uint64_t Imm) const;
@@ -1184,6 +1207,10 @@ public:
/// This function will return false if you pass it a 32-bit instruction.
bool hasVALU32BitEncoding(unsigned Opcode) const;
+ bool physRegUsesConstantBus(const MachineOperand &Reg) const;
+ bool regUsesConstantBus(const MachineOperand &Reg,
+ const MachineRegisterInfo &MRI) const;
+
/// Returns true if this operand uses the constant bus.
bool usesConstantBus(const MachineRegisterInfo &MRI,
const MachineOperand &MO,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 0374526e35c4..aa5dae09ca18 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1806,15 +1806,15 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> {
VOPDstOperand_t16Lo128),
VOPDstOperand<VGPR_32>);
RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VOPDstOperand<VReg_1024>,
- !eq(VT.Size, 512) : VOPDstOperand<VReg_512>,
- !eq(VT.Size, 256) : VOPDstOperand<VReg_256>,
- !eq(VT.Size, 192) : VOPDstOperand<VReg_192>,
- !eq(VT.Size, 128) : VOPDstOperand<VReg_128>,
+ !eq(VT.Size, 512) : VOPDstOperand<VReg_512>,
+ !eq(VT.Size, 256) : VOPDstOperand<VReg_256>,
+ !eq(VT.Size, 192) : VOPDstOperand<VReg_192>,
+ !eq(VT.Size, 128) : VOPDstOperand<VReg_128>,
!eq(VT.Size, 96) : VOPDstOperand<VReg_96>,
- !eq(VT.Size, 64) : VOPDstOperand<VReg_64>,
- !eq(VT.Size, 32) : VOPDstOperand<VGPR_32>,
- !eq(VT.Size, 16) : op16,
- 1 : VOPDstS64orS32); // else VT == i1
+ !eq(VT.Size, 64) : VOPDstOperand<VReg_64>,
+ !eq(VT.Size, 32) : VOPDstOperand<VGPR_32>,
+ !eq(VT.Size, 16) : op16,
+ 1 : VOPDstS64orS32); // else VT == i1
}
class getVALUDstForVT_fake16<ValueType VT> {
@@ -1898,7 +1898,7 @@ class getVregSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 1> {
!eq(VT.Size, 64) : RegisterOperand<VReg_64>,
!eq(VT.Size, 48) : RegisterOperand<VReg_64>,
!eq(VT.Size, 16) : !if(IsTrue16,
- !if(IsFake16, VGPRSrc_32_Lo128, VGPRSrc_16_Lo128),
+ !if(IsFake16, VGPROp_32_Lo128, VGPROp_16_Lo128),
RegisterOperand<VGPR_32>),
1 : RegisterOperand<VGPR_32>);
}
@@ -1950,6 +1950,20 @@ class getVOP3VRegSrcForVT<ValueType VT> {
1 : VRegSrc_32);
}
+// VGPR only VOP3 src with 8 bit encoding e.g. VOP3DPP src0.
+class getVGPRSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
+ RegisterOperand ret =
+ !cond(!eq(VT.Size, 128) : VGPROp_128,
+ !eq(VT.Size, 96) : VGPROp_96,
+ !eq(VT.Size, 64) : VGPROp_64,
+ !eq(VT.Size, 48) : VGPROp_64,
+ !eq(VT.Size, 16) : !if(IsTrue16,
+ !if(IsFake16, VGPROp_32,
+ VGPROp_16),
+ VGPROp_32),
+ 1 : VGPROp_32);
+}
+
// Src2 of VOP3 DPP instructions cannot be a literal
class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> {
RegisterOperand ret =
@@ -2578,22 +2592,50 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
getHasSDWA<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
}
-// Return an AGPR+VGPR operand class for the given VGPR register class.
-class getLdStRegisterOperand<RegisterClass RC> {
- // This type of operands is only used in pseudo instructions helping
- // code generation and thus doesn't need encoding and decoding methods.
- // It also doesn't need to support AGPRs, because GFX908/A/40 do not
- // support True16.
- defvar VLdSt_16 = RegisterOperand<VGPR_16>;
+class getAlign2RegOp<RegisterOperand RC> {
+ RegisterOperand ret =
+ !cond(!eq(RC, VGPROp_16) : VGPROp_16,
+ !eq(RC, VGPROp_32) : VGPROp_32,
+ !eq(RC, VGPROp_64) : VGPROp_64_Align2,
+ !eq(RC, VGPROp_64_Align1) : VGPROp_64_Align2,
+ !eq(RC, VGPROp_96) : VGPROp_96_Align2,
+ !eq(RC, VGPROp_96_Align1) : VGPROp_96_Align2,
+ !eq(RC, VGPROp_128) : VGPROp_128_Align2,
+ !eq(RC, VGPROp_128_Align1) : VGPROp_128_Align2,
+ !eq(RC, VGPROp_160) : VGPROp_160_Align2,
+ !eq(RC, VGPROp_160_Align1) : VGPROp_160_Align2,
+ !eq(RC, VGPROp_1024) : VGPROp_1024_Align2,
+ !eq(RC, VGPROp_1024_Align1) : VGPROp_1024_Align2,
+ !eq(RC, AVLdSt_32) : AVLdSt_32,
+ !eq(RC, AVLdSt_64) : AVLdSt_64_Align2,
+ !eq(RC, AVLdSt_96) : AVLdSt_96_Align2,
+ !eq(RC, AVLdSt_96_Align1) : AVLdSt_96_Align2,
+ !eq(RC, AVLdSt_128) : AVLdSt_128_Align2,
+ !eq(RC, AVLdSt_128_Align1) : AVLdSt_128_Align2,
+ !eq(RC, AVLdSt_160) : AVLdSt_160_Align2,
+ !eq(RC, AVLdSt_160_Align1) : AVLdSt_160_Align2);
+}
+
+class getEquivalentAGPROperand<RegisterOperand RC> {
+ defvar Size = RC.RegClass.Size;
+ RegisterOperand ret =
+ !cond(!eq(Size, 32) : RegisterOperand<AGPR_32>,
+ !eq(Size, 64) : RegisterOperand<AReg_64>,
+ !eq(Size, 96) : RegisterOperand<AReg_96>,
+ !eq(Size, 128) : RegisterOperand<AReg_128>,
+ !eq(Size, 160) : RegisterOperand<AReg_160>,
+ !eq(Size, 1024) : RegisterOperand<AReg_1024>);
+}
+class getEquivalentVGPROperand<RegisterOperand RC> {
+ defvar Size = RC.RegClass.Size;
RegisterOperand ret =
- !cond(!eq(RC.Size, 16) : VLdSt_16,
- !eq(RC.Size, 32) : AVLdSt_32,
- !eq(RC.Size, 64) : AVLdSt_64,
- !eq(RC.Size, 96) : AVLdSt_96,
- !eq(RC.Size, 128) : AVLdSt_128,
- !eq(RC.Size, 160) : AVLdSt_160,
- !eq(RC.Size, 1024) : AVLdSt_1024);
+ !cond(!eq(Size, 32) : RegisterOperand<VGPR_32>,
+ !eq(Size, 64) : RegisterOperand<VReg_64>,
+ !eq(Size, 96) : RegisterOperand<VReg_96>,
+ !eq(Size, 128) : RegisterOperand<VReg_128>,
+ !eq(Size, 160) : RegisterOperand<VReg_160>,
+ !eq(Size, 1024) : RegisterOperand<VReg_1024>);
}
class getHasVOP3DPP <ValueType DstVT = i32, ValueType Src0VT = i32,
@@ -2643,7 +2685,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field RegisterOperand Src0DPP = getVregSrcForVT<Src0VT>.ret;
field RegisterOperand Src1DPP = getVregSrcForVT<Src1VT>.ret;
field RegisterOperand Src2DPP = getVregSrcForVT<Src2VT>.ret;
- field RegisterOperand Src0VOP3DPP = VGPRSrc_32;
+ field RegisterOperand Src0VOP3DPP = getVGPRSrcForVT<Src0VT>.ret;
field RegisterOperand Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT>.ret;
field RegisterOperand Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT>.ret;
field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret;
@@ -2859,7 +2901,7 @@ class VOPProfile_True16<VOPProfile P> : VOPProfile<P.ArgVT> {
let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0 /*IsFake16*/>.ret;
let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0 /*IsFake16*/>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0 /*IsFake16*/>.ret;
- let Src0VOP3DPP = !if (!eq(Src0VT.Size, 16), VGPRSrc_16, VGPRSrc_32);
+ let Src0VOP3DPP = !if (!eq(Src0VT.Size, 16), VGPROp_16, VGPROp_32);
let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0 /*IsFake16*/>.ret;
let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 0 /*IsFake16*/>.ret;
let Src0ModVOP3DPP = getSrc0ModVOP3DPP<Src0VT, DstVT, 0/*IsFake16*/>.ret;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index e8b450122673..1f7951258c21 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -66,7 +66,7 @@ defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;
// Constraints = "@earlyclobber $vdst", isAsmParserOnly=1
let OtherPredicates = [isNotGFX90APlus] in {
-let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {
+let Constraints = "$src0 = $vdst" in {
defm V_INTERP_P2_F32 : VINTRP_m <
0x00000001,
@@ -77,7 +77,7 @@ defm V_INTERP_P2_F32 : VINTRP_m <
[(set f32:$vdst, (int_amdgcn_interp_p2 f32:$src0, f32:$vsrc,
(i32 timm:$attrchan), (i32 timm:$attr), M0))]>;
-} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst"
+} // End Constraints = "$src0 = $vdst"
defm V_INTERP_MOV_F32 : VINTRP_m <
0x00000002,
@@ -326,28 +326,57 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
// clang-format off
-defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_";
+
multiclass
- AMDGPUWaveReducePseudoGenerator<string Op, string DataType> {
+ AMDGPUWaveReducePseudoGenerator<string Op, string DataType, ValueType ty, RegisterClass RetReg, SrcRegOrImm9 Reg> {
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
def !toupper(Op) #"_PSEUDO_" #DataType
- : VPseudoInstSI<(outs SGPR_32 : $sdst),
- (ins VSrc_b32 : $src, VSrc_b32 : $strategy),
- [(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {}
+ : VPseudoInstSI<(outs RetReg : $sdst),
+ (ins Reg : $src, VSrc_b32 : $strategy),
+ [(set ty : $sdst, (!cast<AMDGPUWaveReduce>("int_amdgcn_wave_reduce_" #Op) ty : $src, i32 : $strategy))]> {}
}
}
// clang-format on
+class WaveReduceOp<string OpName, string TypeStr, ValueType Ty,
+ RegisterClass ReturnRegisterClass, SrcRegOrImm9 RC> {
+ string Name = OpName;
+ string TypeString = TypeStr;
+ ValueType VT = Ty;
+ RegisterClass RetReg = ReturnRegisterClass;
+ SrcRegOrImm9 Reg = RC;
+}
+
// Input list : [Operation_name,
-// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B)]
+// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
+// value type (bit-width),
+// output register class,
+// input register class]
defvar Operations = [
- ["umin", "U32"], ["min", "I32"], ["umax", "U32"], ["max", "I32"],
- ["add", "I32"], ["sub", "I32"], ["and", "B32"], ["or", "B32"],
- ["xor", "B32"]
+ WaveReduceOp<"umin", "U32", i32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"min", "I32", i32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"umax", "U32", i32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"max", "I32", i32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"add", "I32", i32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"sub", "I32", i32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"and", "B32", i32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"or", "B32", i32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"xor", "B32", i32, SGPR_32, VSrc_b32>,
+
+ WaveReduceOp<"umin", "U64", i64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"min", "I64", i64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"umax", "U64", i64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"add", "U64", i64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"sub", "U64", i64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"and", "B64", i64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"or", "B64", i64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"xor", "B64", i64, SGPR_64, VSrc_b64>,
];
foreach Op = Operations in {
- defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1]>;
+ defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op.Name, Op.TypeString,
+ Op.VT, Op.RetReg, Op.Reg>;
}
let usesCustomInserter = 1, Defs = [VCC] in {
@@ -692,6 +721,33 @@ def SI_WHOLE_WAVE_FUNC_RETURN : SPseudoInstSI <
def : GCNPat<
(AMDGPUwhole_wave_return), (SI_WHOLE_WAVE_FUNC_RETURN (i1 (IMPLICIT_DEF)))>;
+// Restores the previous EXEC and otherwise behaves entirely like a SI_TCRETURN.
+// This is used for tail calls *from* a whole wave function. Tail calls to
+// a whole wave function may use the usual opcodes, depending on the calling
+// convention of the caller.
+def SI_TCRETURN_GFX_WholeWave : SPseudoInstSI <
+ (outs),
+ (ins SReg_1:$orig_exec, Gfx_CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff)> {
+ let isCall = 1;
+ let isTerminator = 1;
+ let isReturn = 1;
+ let isBarrier = 1;
+ let UseNamedOperandTable = 1;
+ let SchedRW = [WriteBranch];
+ let isConvergent = 1;
+
+ // We're going to use custom handling to set the $orig_exec to the correct value.
+ let usesCustomInserter = 1;
+}
+
+// Generate a SI_TCRETURN_GFX_WholeWave pseudo with a placeholder for its
+// argument. It will be filled in by the custom inserter.
+def : GCNPat<
+ (AMDGPUtc_return_gfx_ww i64:$src0, tglobaladdr:$callee, i32:$fpdiff),
+ (SI_TCRETURN_GFX_WholeWave (i1 (IMPLICIT_DEF)), Gfx_CCR_SGPR_64:$src0,
+ tglobaladdr:$callee, i32:$fpdiff)>;
+
+
// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
(outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
@@ -2174,7 +2230,8 @@ def : GCNPat <
}
foreach fp16vt = [f16, bf16] in {
-
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat <
(fcopysign fp16vt:$src0, fp16vt:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
@@ -2205,6 +2262,42 @@ def : GCNPat <
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
(V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
>;
+}
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+ (fcopysign fp16vt:$src0, fp16vt:$src1),
+ (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16),
+ (REG_SEQUENCE VGPR_32, $src1, lo16, (i16 (IMPLICIT_DEF)), hi16)), lo16)
+>;
+
+def : GCNPat <
+ (fcopysign f32:$src0, fp16vt:$src1),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
+ (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16))
+>;
+
+def : GCNPat <
+ (fcopysign f64:$src0, fp16vt:$src1),
+ (REG_SEQUENCE VReg_64,
+ (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
+ (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16)), sub1)
+>;
+
+def : GCNPat <
+ (fcopysign fp16vt:$src0, f32:$src1),
+ (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fff0000)),
+ (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src0, hi16), $src1), hi16)
+>;
+
+def : GCNPat <
+ (fcopysign fp16vt:$src0, f64:$src1),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16),
+ (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
+>;
+}
} // End foreach fp16vt = [f16, bf16]
@@ -2480,6 +2573,38 @@ def : AMDGPUPatIgnoreCopies <
(i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
>;
+// (z & ~x)
+def : AMDGPUPatIgnoreCopies <
+ (DivergentBinFrag<and> i32:$z, (not_oneuse i32:$x)),
+ (V_BFI_B32_e64 VSrc_b32:$x, (i32 0), VSrc_b32:$z)
+>;
+
+// 64-bit version
+def : AMDGPUPatIgnoreCopies <
+ (DivergentBinFrag<and> i64:$z, (not_oneuse i64:$x)),
+ (REG_SEQUENCE VReg_64,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), (i32 0),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), (i32 0),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
+>;
+
+// (y | ~x)
+def : AMDGPUPatIgnoreCopies <
+ (DivergentBinFrag<or> i32:$y, (not_oneuse i32:$x)),
+ (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, (i32 -1))
+>;
+
+// 64-bit version
+def : AMDGPUPatIgnoreCopies <
+ (DivergentBinFrag<or> i64:$y, (not_oneuse i64:$x)),
+ (REG_SEQUENCE VReg_64,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), (i32 -1)), sub0,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), (i32 -1)), sub1)
+>;
+
// SHA-256 Ch function
// z ^ (x & (y ^ z))
def : AMDGPUPatIgnoreCopies <
@@ -3096,6 +3221,11 @@ def : GCNPat<
(i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
(COPY VSrc_b16:$src)
>;
+
+def : GCNPat <
+ (i1 (DivergentUnaryFrag<trunc> i16:$a)),
+ (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
+>;
}
let True16Predicate = UseRealTrue16Insts in {
@@ -3106,15 +3236,18 @@ def : GCNPat<
def : GCNPat<
(i64 (DivergentUnaryFrag<zext> i16:$src)),
- (REG_SEQUENCE VReg_64,
- (INSERT_SUBREG (i32 (V_MOV_B32_e32 (i32 0))), VGPR_16:$src, lo16), sub0,
- (S_MOV_B32 (i32 0)), sub1)
+ (REG_SEQUENCE VReg_64, $src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16, (V_MOV_B32_e32 (i32 0)), sub1)
>;
def : GCNPat<
(i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
(REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16)
>;
+
+def : GCNPat <
+ (i1 (DivergentUnaryFrag<trunc> i16:$a)),
+ (V_CMP_EQ_U16_t16_e64 (i32 0), (V_AND_B16_t16_e64 (i32 0), (i16 1), (i32 0), $a), (i32 0), (i16 1), (i32 0))
+>;
}
def : GCNPat <
@@ -3143,11 +3276,6 @@ def : GCNPat <
(V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
>;
-def : GCNPat <
- (i1 (DivergentUnaryFrag<trunc> i16:$a)),
- (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
->;
-
def IMMBitSelConst : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(1ULL << N->getZExtValue(), SDLoc(N),
MVT::i32);
@@ -3637,13 +3765,24 @@ def : GCNPat <
>;
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
-let True16Predicate = p in
+let True16Predicate = p in {
// Take the lower 16 bits from each VGPR_32 and concat them
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))),
(V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100)))
>;
+// Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
+// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
+def : GCNPat <
+ (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a),
+ (Ty !if(!eq(Ty, i16),
+ (Ty (trunc (srl VGPR_32:$b, (i32 16)))),
+ (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)), VGPR_32:$a, VGPR_32:$b)
+>;
+}
+
let True16Predicate = UseRealTrue16Insts in {
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$a), (Ty VGPR_16:$b))),
@@ -3669,18 +3808,6 @@ def : GCNPat <
(V_AND_B32_e64 (S_MOV_B32 (i32 0xffff0000)), VGPR_32:$b)
>;
-
-// Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
-// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
-def : GCNPat <
- (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a),
- (Ty !if(!eq(Ty, i16),
- (Ty (trunc (srl VGPR_32:$b, (i32 16)))),
- (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
- (V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)), VGPR_32:$a, VGPR_32:$b)
->;
-
-
// Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
// Special case, can use V_ALIGNBIT (always uses encoded literal)
let True16Predicate = NotHasTrue16BitInsts in {
@@ -3752,7 +3879,8 @@ def : GCNPat <
(v2i16 (S_PACK_HL_B32_B16 SReg_32:$src0, SReg_32:$src1))
>;
-
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat <
(v2f16 (scalar_to_vector f16:$src0)),
(COPY $src0)
@@ -3772,6 +3900,29 @@ def : GCNPat <
(v4f16 (scalar_to_vector f16:$src0)),
(INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
>;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+ (v2f16 (scalar_to_vector f16:$src0)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16)
+>;
+
+def : GCNPat <
+ (v2i16 (scalar_to_vector i16:$src0)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16)
+>;
+
+def : GCNPat <
+ (v4i16 (scalar_to_vector i16:$src0)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
+>;
+
+def : GCNPat <
+ (v4f16 (scalar_to_vector f16:$src0)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
+>;
+}
def : GCNPat <
(i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 6f2ea8ad1ff0..69d02e7c2934 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -119,7 +119,7 @@ class SILoadStoreOptimizer {
unsigned DMask;
InstClassEnum InstClass;
unsigned CPol = 0;
- bool IsAGPR;
+ const TargetRegisterClass *DataRC;
bool UseST64;
int AddrIdx[MaxAddressRegs];
const MachineOperand *AddrReg[MaxAddressRegs];
@@ -203,6 +203,7 @@ class SILoadStoreOptimizer {
using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
private:
+ MachineFunction *MF = nullptr;
const GCNSubtarget *STM = nullptr;
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
@@ -245,6 +246,8 @@ private:
unsigned write2Opcode(unsigned EltSize) const;
unsigned write2ST64Opcode(unsigned EltSize) const;
+ unsigned getWrite2Opcode(const CombineInfo &CI) const;
+
MachineBasicBlock::iterator
mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore);
@@ -846,7 +849,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
if (InstClass == UNKNOWN)
return;
- IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
+ DataRC = LSO.getDataRegClass(*MI);
switch (InstClass) {
case DS_READ:
@@ -1313,6 +1316,50 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
// have already been confirmed to be mergeable.
if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
offsetsCanBeCombined(CI, *STM, Paired, true);
+
+ if (CI.InstClass == DS_WRITE) {
+ // Both data operands must be AGPR or VGPR, so the data registers need to
+ // be constrained to one or the other. We expect to only emit the VGPR form
+ // here for now.
+ //
+ // FIXME: There is currently a hack in getRegClass to report that the write2
+ // operands are VGPRs. In the future we should have separate agpr
+ // instruction definitions.
+ const MachineOperand *Data0 =
+ TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
+ const MachineOperand *Data1 =
+ TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
+
+ const MCInstrDesc &Write2Opc = TII->get(getWrite2Opcode(CI));
+ int Data0Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
+ AMDGPU::OpName::data0);
+ int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
+ AMDGPU::OpName::data1);
+
+ const TargetRegisterClass *DataRC0 =
+ TII->getRegClass(Write2Opc, Data0Idx, TRI, *MF);
+
+ const TargetRegisterClass *DataRC1 =
+ TII->getRegClass(Write2Opc, Data1Idx, TRI, *MF);
+
+ if (unsigned SubReg = Data0->getSubReg()) {
+ DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()),
+ DataRC0, SubReg);
+ }
+
+ if (unsigned SubReg = Data1->getSubReg()) {
+ DataRC1 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data1->getReg()),
+ DataRC1, SubReg);
+ }
+
+ if (!MRI->constrainRegClass(Data0->getReg(), DataRC0) ||
+ !MRI->constrainRegClass(Data1->getReg(), DataRC1))
+ return nullptr;
+
+ // TODO: If one register can be constrained, and not the other, insert a
+ // copy.
+ }
+
return Where;
}
@@ -1462,6 +1509,10 @@ unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
: AMDGPU::DS_WRITE2ST64_B64_gfx9;
}
+unsigned SILoadStoreOptimizer::getWrite2Opcode(const CombineInfo &CI) const {
+ return CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
+}
+
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
@@ -1478,8 +1529,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
unsigned NewOffset0 = CI.Offset;
unsigned NewOffset1 = Paired.Offset;
- unsigned Opc =
- CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
+ unsigned Opc = getWrite2Opcode(CI);
if (NewOffset0 > NewOffset1) {
// Canonicalize the merged instruction so the smaller offset comes first.
@@ -2032,6 +2082,8 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
}
}
+ // FIXME: This should compute the instruction to use, and then use the result
+ // of TII->getRegClass.
unsigned BitWidth = 32 * (CI.Width + Paired.Width);
return TRI->isAGPRClass(getDataRegClass(*CI.I))
? TRI->getAGPRClassForBitWidth(BitWidth)
@@ -2400,7 +2452,6 @@ void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
std::list<std::list<CombineInfo> > &MergeableInsts) const {
for (std::list<CombineInfo> &AddrList : MergeableInsts) {
if (AddrList.front().InstClass == CI.InstClass &&
- AddrList.front().IsAGPR == CI.IsAGPR &&
AddrList.front().hasSameBaseAddress(CI)) {
AddrList.emplace_back(CI);
return;
@@ -2465,16 +2516,6 @@ SILoadStoreOptimizer::collectMergeableInsts(
if (!CI.hasMergeableAddress(*MRI))
continue;
- if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
- // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
- // operands. However we are reporting that ds_write2 shall have
- // only VGPR data so that machine copy propagation does not
- // create an illegal instruction with a VGPR and AGPR sources.
- // Consequenctially if we create such instruction the verifier
- // will complain.
- continue;
- }
-
LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
addInstToMergeableList(CI, MergeableInsts);
@@ -2647,6 +2688,7 @@ bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
}
bool SILoadStoreOptimizer::run(MachineFunction &MF) {
+ this->MF = &MF;
STM = &MF.getSubtarget<GCNSubtarget>();
if (!STM->loadStoreOptEnabled())
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 53f554eccb1f..1637c06936f9 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -63,6 +63,7 @@ enum class SIAtomicScope {
SINGLETHREAD,
WAVEFRONT,
WORKGROUP,
+ CLUSTER, // Promoted to AGENT on targets without workgroup clusters.
AGENT,
SYSTEM
};
@@ -103,8 +104,10 @@ private:
bool IsVolatile = false;
bool IsNonTemporal = false;
bool IsLastUse = false;
+ bool IsCooperative = false;
SIMemOpInfo(
+ const GCNSubtarget &ST,
AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
SIAtomicScope Scope = SIAtomicScope::SYSTEM,
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
@@ -112,14 +115,15 @@ private:
bool IsCrossAddressSpaceOrdering = true,
AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
bool IsVolatile = false, bool IsNonTemporal = false,
- bool IsLastUse = false)
+ bool IsLastUse = false, bool IsCooperative = false)
: Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
- IsLastUse(IsLastUse) {
+ IsLastUse(IsLastUse), IsCooperative(IsCooperative) {
if (Ordering == AtomicOrdering::NotAtomic) {
+ assert(!IsCooperative && "Cannot be cooperative & non-atomic!");
assert(Scope == SIAtomicScope::NONE &&
OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
!IsCrossAddressSpaceOrdering &&
@@ -154,6 +158,11 @@ private:
SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
this->Scope = std::min(Scope, SIAtomicScope::AGENT);
}
+
+ // On targets that have no concept of a workgroup cluster, use
+ // AGENT scope as a conservatively correct alternative.
+ if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
+ this->Scope = SIAtomicScope::AGENT;
}
public:
@@ -209,6 +218,9 @@ public:
/// create this SIMemOpInfo is last use, false otherwise.
bool isLastUse() const { return IsLastUse; }
+ /// \returns True if this is a cooperative load or store atomic.
+ bool isCooperative() const { return IsCooperative; }
+
/// \returns True if ordering constraint of the machine instruction used to
/// create this SIMemOpInfo is unordered or higher, false otherwise.
bool isAtomic() const {
@@ -220,6 +232,7 @@ public:
class SIMemOpAccess final {
private:
const AMDGPUMachineModuleInfo *MMI = nullptr;
+ const GCNSubtarget &ST;
/// Reports unsupported message \p Msg for \p MI to LLVM context.
void reportUnsupported(const MachineBasicBlock::iterator &MI,
@@ -243,7 +256,7 @@ private:
public:
/// Construct class to support accessing the machine memory operands
/// of instructions in the machine function \p MF.
- SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI);
+ SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST);
/// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
std::optional<SIMemOpInfo>
@@ -325,6 +338,12 @@ public:
return false;
};
+ /// Handle cooperative load/store atomics.
+ virtual bool handleCooperativeAtomic(MachineInstr &MI) const {
+ llvm_unreachable(
+ "cooperative atomics are not available on this architecture");
+ }
+
/// Inserts any necessary instructions at position \p Pos relative
/// to instruction \p MI to ensure memory instructions before \p Pos of kind
/// \p Op associated with address spaces \p AddrSpace have completed. Used
@@ -359,6 +378,12 @@ public:
bool IsCrossAddrSpaceOrdering,
Position Pos) const = 0;
+ /// Inserts any necessary instructions before the barrier start instruction
+ /// \p MI in order to support pairing of barriers and fences.
+ virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const {
+ return false;
+ };
+
/// Virtual destructor to allow derivations to be deleted.
virtual ~SICacheControl() = default;
};
@@ -547,6 +572,8 @@ public:
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const override;
+
+ bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override;
};
class SIGfx11CacheControl : public SIGfx10CacheControl {
@@ -587,7 +614,11 @@ protected:
SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
public:
- SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
+ SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {
+ // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
+ // the behavior is the same if assuming GFX12.0 in CU mode.
+ assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
+ }
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
@@ -604,6 +635,8 @@ public:
bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
+ virtual bool handleCooperativeAtomic(MachineInstr &MI) const override;
+
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
Position Pos) const override;
@@ -748,6 +781,8 @@ SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
if (SSID == MMI->getAgentSSID())
return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
+ if (SSID == MMI->getClusterSSID())
+ return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true);
if (SSID == MMI->getWorkgroupSSID())
return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
true);
@@ -763,6 +798,9 @@ SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
if (SSID == MMI->getAgentOneAddressSpaceSSID())
return std::tuple(SIAtomicScope::AGENT,
SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
+ if (SSID == MMI->getClusterOneAddressSpaceSSID())
+ return std::tuple(SIAtomicScope::CLUSTER,
+ SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
return std::tuple(SIAtomicScope::WORKGROUP,
SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
@@ -790,8 +828,9 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
return SIAtomicAddrSpace::OTHER;
}
-SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_)
- : MMI(&MMI_) {}
+SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
+ const GCNSubtarget &ST)
+ : MMI(&MMI_), ST(ST) {}
std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
const MachineBasicBlock::iterator &MI) const {
@@ -804,6 +843,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
bool IsNonTemporal = true;
bool IsVolatile = false;
bool IsLastUse = false;
+ bool IsCooperative = false;
// Validator should check whether or not MMOs cover the entire set of
// locations accessed by the memory instruction.
@@ -811,6 +851,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
IsNonTemporal &= MMO->isNonTemporal();
IsVolatile |= MMO->isVolatile();
IsLastUse |= MMO->getFlags() & MOLastUse;
+ IsCooperative |= MMO->getFlags() & MOCooperative;
InstrAddrSpace |=
toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
@@ -850,9 +891,9 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
return std::nullopt;
}
}
- return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
+ return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
- IsNonTemporal, IsLastUse);
+ IsNonTemporal, IsLastUse, IsCooperative);
}
std::optional<SIMemOpInfo>
@@ -864,7 +905,7 @@ SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
// Be conservative if there are no memory operands.
if (MI->getNumMemOperands() == 0)
- return SIMemOpInfo();
+ return SIMemOpInfo(ST);
return constructFromMIWithMMO(MI);
}
@@ -878,7 +919,7 @@ SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
// Be conservative if there are no memory operands.
if (MI->getNumMemOperands() == 0)
- return SIMemOpInfo();
+ return SIMemOpInfo(ST);
return constructFromMIWithMMO(MI);
}
@@ -919,8 +960,9 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
if (SynchronizeAS)
OrderingAddrSpace = *SynchronizeAS;
- return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
- IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
+ return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
+ SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
+ AtomicOrdering::NotAtomic);
}
std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
@@ -932,7 +974,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
// Be conservative if there are no memory operands.
if (MI->getNumMemOperands() == 0)
- return SIMemOpInfo();
+ return SIMemOpInfo(ST);
return constructFromMIWithMMO(MI);
}
@@ -2169,6 +2211,22 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
return Changed;
}
+bool SIGfx10CacheControl::insertBarrierStart(
+ MachineBasicBlock::iterator &MI) const {
+ // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU
+ // mode. This is because a CU mode release fence does not emit any wait, which
+ // is fine when only dealing with vmem, but isn't sufficient in the presence
+ // of barriers which do not go through vmem.
+ // GFX12.5 does not require this additional wait.
+ if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts())
+ return false;
+
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
+ return true;
+}
+
bool SIGfx11CacheControl::enableLoadCacheBypass(
const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {
@@ -2334,18 +2392,23 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
+ case SIAtomicScope::CLUSTER:
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
LOADCnt |= true;
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
STORECnt |= true;
break;
case SIAtomicScope::WORKGROUP:
- // In WGP mode the waves of a work-group can be executing on either CU of
- // the WGP. Therefore need to wait for operations to complete to ensure
- // they are visible to waves in the other CU as the L0 is per CU.
- // Otherwise in CU mode and all waves of a work-group are on the same CU
- // which shares the same L0.
- if (!ST.isCuModeEnabled()) {
+ // GFX12.0:
+ // In WGP mode the waves of a work-group can be executing on either CU
+ // of the WGP. Therefore need to wait for operations to complete to
+ // ensure they are visible to waves in the other CU as the L0 is per CU.
+ // Otherwise in CU mode and all waves of a work-group are on the same CU
+ // which shares the same L0.
+ //
+ // GFX12.5:
+ // TODO DOCS
+ if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) {
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
LOADCnt |= true;
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -2366,6 +2429,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
+ case SIAtomicScope::CLUSTER:
case SIAtomicScope::WORKGROUP:
// If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
// not needed as LDS operations for all waves are executed in a total
@@ -2397,7 +2461,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
//
// This also applies to fences. Fences cannot pair with an instruction
// tracked with bvh/samplecnt as we don't have any atomics that do that.
- if (Order != AtomicOrdering::Acquire) {
+ if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
}
@@ -2448,11 +2512,18 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
case SIAtomicScope::AGENT:
ScopeImm = AMDGPU::CPol::SCOPE_DEV;
break;
+ case SIAtomicScope::CLUSTER:
+ ScopeImm = AMDGPU::CPol::SCOPE_SE;
+ break;
case SIAtomicScope::WORKGROUP:
- // In WGP mode the waves of a work-group can be executing on either CU of
- // the WGP. Therefore we need to invalidate the L0 which is per CU.
- // Otherwise in CU mode all waves of a work-group are on the same CU, and so
- // the L0 does not need to be invalidated.
+ // GFX12.0:
+ // In WGP mode the waves of a work-group can be executing on either CU of
+ // the WGP. Therefore we need to invalidate the L0 which is per CU.
+ // Otherwise in CU mode all waves of a work-group are on the same CU, and
+ // so the L0 does not need to be invalidated.
+ //
+ // GFX12.5
+ // TODO DOCS
if (ST.isCuModeEnabled())
return false;
@@ -2497,7 +2568,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
++MI;
- // global_wb is only necessary at system scope for gfx120x targets.
+ // global_wb is only necessary at system scope for GFX12.0;
+ // it is also necessary at device scope for GFX12.5.
//
// Emitting it for lower scopes is a slow no-op, so we omit it
// for performance.
@@ -2507,6 +2579,13 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
.addImm(AMDGPU::CPol::SCOPE_SYS);
break;
case SIAtomicScope::AGENT:
+ // TODO DOCS
+ if (ST.hasGFX1250Insts()) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
+ .addImm(AMDGPU::CPol::SCOPE_DEV);
+ }
+ break;
+ case SIAtomicScope::CLUSTER:
case SIAtomicScope::WORKGROUP:
// No WB necessary, but we still have to wait.
break;
@@ -2569,26 +2648,44 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
}
bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
- MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
- if (!CPol)
- return false;
+ assert(MI.mayStore() && "Not a Store inst");
+ const bool IsRMW = (MI.mayLoad() && MI.mayStore());
+ bool Changed = false;
+ // GFX12.5 only: an xcnt wait is needed before flat and global atomic
+ // stores/rmw.
+ if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
+ Changed = true;
+ }
+
+ // Remaining fixes do not apply to RMWs.
+ if (IsRMW)
+ return Changed;
+
+ MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
+ if (!CPol) // Some vmem operations do not have a scope and are not concerned.
+ return Changed;
const unsigned Scope = CPol->getImm() & CPol::SCOPE;
// GFX12.0 only: Extra waits needed before system scope stores.
- if (!ST.hasGFX1250Insts()) {
- if (!Atomic && Scope == CPol::SCOPE_SYS)
- return insertWaitsBeforeSystemScopeStore(MI);
- return false;
- }
+ if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS)
+ Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());
- // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
- // space.
- // We also require SCOPE_SE minimum if we not have the "cu-stores" feature.
- if (Scope == CPol::SCOPE_CU &&
- (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI)))
- return setScope(MI, CPol::SCOPE_SE);
+ return Changed;
+}
+bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
+ if (!ST.hasGFX1250Insts())
+ return false;
+
+ // Cooperative atomics need to be SCOPE_DEV or higher.
+ MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
+ assert(CPol && "No CPol operand?");
+ const unsigned Scope = CPol->getImm() & CPol::SCOPE;
+ if (Scope < CPol::SCOPE_DEV)
+ return setScope(MI, CPol::SCOPE_DEV);
return false;
}
@@ -2605,6 +2702,9 @@ bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
case SIAtomicScope::AGENT:
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
break;
+ case SIAtomicScope::CLUSTER:
+ Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
+ break;
case SIAtomicScope::WORKGROUP:
// In workgroup mode, SCOPE_SE is needed as waves can execute on
// different CUs that access different L0s.
@@ -2656,6 +2756,11 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
MOI.getOrderingAddrSpace());
}
+ // Handle cooperative atomics after cache bypass step, as it may override
+ // the scope of the instruction to a greater scope.
+ if (MOI.isCooperative())
+ Changed |= CC->handleCooperativeAtomic(*MI);
+
if (Order == AtomicOrdering::SequentiallyConsistent)
Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
SIMemOp::LOAD | SIMemOp::STORE,
@@ -2701,6 +2806,11 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
MOI.getOrderingAddrSpace());
}
+ // Handle cooperative atomics after cache bypass step, as it may override
+ // the scope of the instruction to a greater scope.
+ if (MOI.isCooperative())
+ Changed |= CC->handleCooperativeAtomic(*MI);
+
if (MOI.getOrdering() == AtomicOrdering::Release ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
Changed |= CC->insertRelease(MI, MOI.getScope(),
@@ -2778,6 +2888,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
assert(MI->mayLoad() && MI->mayStore());
bool Changed = false;
+ MachineInstr &RMWMI = *MI;
if (MOI.isAtomic()) {
const AtomicOrdering Order = MOI.getOrdering();
@@ -2812,6 +2923,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
Position::AFTER);
}
+ Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true);
return Changed;
}
@@ -2839,8 +2951,9 @@ SIMemoryLegalizerPass::run(MachineFunction &MF,
bool SIMemoryLegalizer::run(MachineFunction &MF) {
bool Changed = false;
- SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>());
- CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST);
+ CC = SICacheControl::create(ST);
for (auto &MBB : MF) {
for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
@@ -2860,6 +2973,11 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
MI = II->getIterator();
}
+ if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) {
+ Changed |= CC->insertBarrierStart(MI);
+ continue;
+ }
+
if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
continue;
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index d0cba30a442b..857cb91a977f 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -291,21 +291,7 @@ static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
if (!Reg->isReg() || !Reg->isDef())
return nullptr;
- MachineOperand *ResMO = nullptr;
- for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
- // If there exist use of subreg of Reg then return nullptr
- if (!isSameReg(UseMO, *Reg))
- return nullptr;
-
- // Check that there is only one instruction that uses Reg
- if (!ResMO) {
- ResMO = &UseMO;
- } else if (ResMO->getParent() != UseMO.getParent()) {
- return nullptr;
- }
- }
-
- return ResMO;
+ return MRI->getOneNonDBGUse(Reg->getReg());
}
static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
@@ -313,17 +299,7 @@ static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
if (!Reg->isReg())
return nullptr;
- MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
- if (!DefInstr)
- return nullptr;
-
- for (auto &DefMO : DefInstr->defs()) {
- if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
- return &DefMO;
- }
-
- // Ignore implicit defs.
- return nullptr;
+ return MRI->getOneDef(Reg->getReg());
}
/// Combine an SDWA instruction's existing SDWA selection \p Sel with
diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index efdc55b8e68b..5720b978aada 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -184,9 +184,11 @@ bool SIPostRABundler::run(MachineFunction &MF) {
if (I->getNumExplicitDefs() != 0)
Defs.insert(I->defs().begin()->getReg());
++ClauseLength;
- } else if (!I->isMetaInstruction()) {
- // Allow meta instructions in between bundle candidates, but do not
- // start or end a bundle on one.
+ } else if (!I->isMetaInstruction() ||
+ I->getOpcode() == AMDGPU::SCHED_BARRIER) {
+ // SCHED_BARRIER is not bundled so that the scheduler can honor it later.
+ // Allow other meta instructions in between bundle candidates, but do
+ // not start or end a bundle on one.
//
// TODO: It may be better to move meta instructions like dbg_value
// after the bundle. We're relying on the memory legalizer to unbundle
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ae0f304ea304..22488384759b 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3273,6 +3273,10 @@ StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
return AMDGPUInstPrinter::getRegisterName(Reg);
}
+unsigned SIRegisterInfo::getHWRegIndex(MCRegister Reg) const {
+ return getEncodingValue(Reg) & AMDGPU::HWEncoding::REG_IDX_MASK;
+}
+
unsigned AMDGPU::getRegBitWidth(const TargetRegisterClass &RC) {
return getRegBitWidth(RC.getID());
}
@@ -3353,6 +3357,40 @@ SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
: getAnyVGPRClassForBitWidth(BitWidth);
}
+const TargetRegisterClass *
+SIRegisterInfo::getAlignedLo256VGPRClassForBitWidth(unsigned BitWidth) const {
+ if (BitWidth <= 32)
+ return &AMDGPU::VGPR_32_Lo256RegClass;
+ if (BitWidth <= 64)
+ return &AMDGPU::VReg_64_Lo256_Align2RegClass;
+ if (BitWidth <= 96)
+ return &AMDGPU::VReg_96_Lo256_Align2RegClass;
+ if (BitWidth <= 128)
+ return &AMDGPU::VReg_128_Lo256_Align2RegClass;
+ if (BitWidth <= 160)
+ return &AMDGPU::VReg_160_Lo256_Align2RegClass;
+ if (BitWidth <= 192)
+ return &AMDGPU::VReg_192_Lo256_Align2RegClass;
+ if (BitWidth <= 224)
+ return &AMDGPU::VReg_224_Lo256_Align2RegClass;
+ if (BitWidth <= 256)
+ return &AMDGPU::VReg_256_Lo256_Align2RegClass;
+ if (BitWidth <= 288)
+ return &AMDGPU::VReg_288_Lo256_Align2RegClass;
+ if (BitWidth <= 320)
+ return &AMDGPU::VReg_320_Lo256_Align2RegClass;
+ if (BitWidth <= 352)
+ return &AMDGPU::VReg_352_Lo256_Align2RegClass;
+ if (BitWidth <= 384)
+ return &AMDGPU::VReg_384_Lo256_Align2RegClass;
+ if (BitWidth <= 512)
+ return &AMDGPU::VReg_512_Lo256_Align2RegClass;
+ if (BitWidth <= 1024)
+ return &AMDGPU::VReg_1024_Lo256_Align2RegClass;
+
+ return nullptr;
+}
+
static const TargetRegisterClass *
getAnyAGPRClassForBitWidth(unsigned BitWidth) {
if (BitWidth == 64)
@@ -3547,7 +3585,17 @@ bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
const TargetRegisterClass *
SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
unsigned Size = getRegSizeInBits(*SRC);
- const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
+
+ switch (SRC->getID()) {
+ default:
+ break;
+ case AMDGPU::VS_32_Lo256RegClassID:
+ case AMDGPU::VS_64_Lo256RegClassID:
+ return getAllocatableClass(getAlignedLo256VGPRClassForBitWidth(Size));
+ }
+
+ const TargetRegisterClass *VRC =
+ getAllocatableClass(getVGPRClassForBitWidth(Size));
assert(VRC && "Invalid register class size");
return VRC;
}
@@ -3708,14 +3756,15 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
unsigned Idx) const {
- if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
- Idx == AMDGPU::RegisterPressureSets::AGPR_32)
+ switch (static_cast<AMDGPU::RegisterPressureSets>(Idx)) {
+ case AMDGPU::RegisterPressureSets::VGPR_32:
+ case AMDGPU::RegisterPressureSets::AGPR_32:
return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
const_cast<MachineFunction &>(MF));
-
- if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
+ case AMDGPU::RegisterPressureSets::SReg_32:
return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
const_cast<MachineFunction &>(MF));
+ }
llvm_unreachable("Unexpected register pressure set!");
}
@@ -3944,6 +3993,8 @@ bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
return RC.hasSuperClassEq(
getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));
+ assert(&RC != &AMDGPU::VS_64RegClass);
+
return true;
}
@@ -3956,6 +4007,9 @@ SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const {
if (Size <= 32)
return RC;
+ if (RC == &AMDGPU::VS_64RegClass)
+ return &AMDGPU::VS_64_Align2RegClass;
+
if (isVGPRClass(RC))
return getAlignedVGPRClassForBitWidth(Size);
if (isAGPRClass(RC))
@@ -4000,7 +4054,12 @@ SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
const TargetRegisterClass &RC,
bool IncludeCalls) const {
- for (MCPhysReg Reg : reverse(RC.getRegisters()))
+ unsigned NumArchVGPRs = ST.has1024AddressableVGPRs() ? 1024 : 256;
+ ArrayRef<MCPhysReg> Registers =
+ (RC.getID() == AMDGPU::VGPR_32RegClassID)
+ ? RC.getRegisters().take_front(NumArchVGPRs)
+ : RC.getRegisters();
+ for (MCPhysReg Reg : reverse(Registers))
if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
return getHWRegIndex(Reg) + 1;
return 0;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 5508f07b1b5f..eeefef1116aa 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -200,13 +200,14 @@ public:
StringRef getRegAsmName(MCRegister Reg) const override;
// Pseudo regs are not allowed
- unsigned getHWRegIndex(MCRegister Reg) const {
- return getEncodingValue(Reg) & 0xff;
- }
+ unsigned getHWRegIndex(MCRegister Reg) const;
LLVM_READONLY
const TargetRegisterClass *getVGPRClassForBitWidth(unsigned BitWidth) const;
+ LLVM_READONLY const TargetRegisterClass *
+ getAlignedLo256VGPRClassForBitWidth(unsigned BitWidth) const;
+
LLVM_READONLY
const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth) const;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 0293d4018770..5f5eec49bab0 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -76,17 +76,17 @@ class SIRegisterTuples<list<SubRegIndex> Indices, RegisterClass RC,
//===----------------------------------------------------------------------===//
// Declarations that describe the SI registers
//===----------------------------------------------------------------------===//
-class SIReg <string n, bits<8> regIdx = 0, bit isVGPR = 0,
+class SIReg <string n, bits<10> regIdx = 0, bit isVGPR = 0,
bit isAGPR = 0, bit isHi16 = 0> : Register<n> {
let Namespace = "AMDGPU";
// These are generic helper values we use to form actual register
// codes. They should not be assumed to match any particular register
// encodings on any particular subtargets.
- let HWEncoding{7-0} = regIdx;
- let HWEncoding{8} = isVGPR;
- let HWEncoding{9} = isAGPR;
- let HWEncoding{10} = isHi16;
+ let HWEncoding{9-0} = regIdx;
+ let HWEncoding{10} = isVGPR;
+ let HWEncoding{11} = isAGPR;
+ let HWEncoding{12} = isHi16;
int Index = !cast<int>(regIdx);
}
@@ -110,17 +110,17 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList>
let TSFlags{3} = HasAGPR;
let TSFlags{4} = HasSGPR;
- // RA will use RegisterClass AllocationPriority amongst other info (e.g. ordering in the basic block)
+ // RA will use RegisterClass AllocationPriority amongst other info (e.g. ordering in the basic block)
// to decide which registers to try to assign first. Usually, this RegisterClass priority is given
// very high priority, if not the highest priority, when considering which VirtReg to allocate next.
//
- // We have 5 bits to assign AllocationPriorities to RegisterClasses. Generally, it is beneficial to
- // assign more constrained RegisterClasses first. As a result, we prioritize register classes with
- // more 32 bit tuples (e.g. VReg_512) over registers with fewer tuples (e.g. VGPR_32).
- //
+ // We have 5 bits to assign AllocationPriorities to RegisterClasses. Generally, it is beneficial to
+ // assign more constrained RegisterClasses first. As a result, we prioritize register classes with
+ // more 32 bit tuples (e.g. VReg_512) over registers with fewer tuples (e.g. VGPR_32).
+ //
// The interesting case is the vector register case on architectures which have ARegs, VRegs, AVRegs.
// In this case, we would like to assign ARegs and VRegs before AVRegs, as AVRegs are less constrained
- // and can be assigned to both AGPRs and VGPRs. We use the 5th bit to encode this into the
+ // and can be assigned to both AGPRs and VGPRs. We use the 5th bit to encode this into the
// RegisterClass AllocationPriority. BaseClassPriority is used to turn the bit on, and BaseClassScaleFactor
// is used for scaling of the bit (i.e. 1 << 4).
field int BaseClassPriority = 1;
@@ -128,7 +128,7 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList>
}
-multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1,
+multiclass SIRegLoHi16 <string n, bits<10> regIdx, bit ArtificialHigh = 1,
bit isVGPR = 0, bit isAGPR = 0,
list<int> DwarfEncodings = [-1, -1]> {
def _LO16 : SIReg<n#".l", regIdx, isVGPR, isAGPR>;
@@ -142,9 +142,10 @@ multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1,
let Namespace = "AMDGPU";
let SubRegIndices = [lo16, hi16];
let CoveredBySubRegs = !not(ArtificialHigh);
- let HWEncoding{7-0} = regIdx;
- let HWEncoding{8} = isVGPR;
- let HWEncoding{9} = isAGPR;
+
+ let HWEncoding{9-0} = regIdx;
+ let HWEncoding{10} = isVGPR;
+ let HWEncoding{11} = isAGPR;
int Index = !cast<int>(regIdx);
}
@@ -225,7 +226,7 @@ def SGPR_NULL64 :
// the high 32 bits. The lower 32 bits are always zero (for base) or
// -1 (for limit). Since we cannot access the high 32 bits, when we
// need them, we need to do a 64 bit load and extract the bits manually.
-multiclass ApertureRegister<string name, bits<8> regIdx> {
+multiclass ApertureRegister<string name, bits<10> regIdx> {
let isConstant = true in {
// FIXME: We shouldn't need to define subregisters for these (nor add them to any 16 bit
// register classes), but if we don't it seems to confuse the TableGen
@@ -313,7 +314,7 @@ foreach Index = 0...15 in {
defm TTMP#Index : SIRegLoHi16<"ttmp"#Index, 0>;
}
-multiclass FLAT_SCR_LOHI_m <string n, bits<8> ci_e, bits<8> vi_e> {
+multiclass FLAT_SCR_LOHI_m <string n, bits<10> ci_e, bits<10> vi_e> {
defm _ci : SIRegLoHi16<n, ci_e>;
defm _vi : SIRegLoHi16<n, vi_e>;
defm "" : SIRegLoHi16<n, 0>;
@@ -343,11 +344,12 @@ foreach Index = 0...105 in {
}
// VGPR registers
-foreach Index = 0...255 in {
+foreach Index = 0...1023 in {
defm VGPR#Index :
SIRegLoHi16 <"v"#Index, Index, /*ArtificialHigh=*/ 0,
/*isVGPR=*/ 1, /*isAGPR=*/ 0, /*DwarfEncodings=*/
- [!add(Index, 2560), !add(Index, 1536)]>;
+ [!if(!le(Index, 511), !add(Index, 2560), -1),
+ !if(!le(Index, 511), !add(Index, 1536), !add(Index, !sub(3584, 512)))]>;
}
// AccVGPR registers
@@ -604,15 +606,15 @@ def Reg512Types : RegisterTypes<[v16i32, v16f32, v8i64, v8f64, v32i16, v32f16, v
def Reg1024Types : RegisterTypes<[v32i32, v32f32, v16i64, v16f64]>;
let HasVGPR = 1 in {
-// VOP3 and VINTERP can access 256 lo and 256 hi registers.
+// VOP3 and VINTERP can access 1024 lo and 1024 hi registers.
def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
- (add (interleave (sequence "VGPR%u_LO16", 0, 255),
- (sequence "VGPR%u_HI16", 0, 255)))> {
+ (add (interleave (sequence "VGPR%u_LO16", 0, 1023),
+ (sequence "VGPR%u_HI16", 0, 1023)))> {
let AllocationPriority = !add(2, !mul(BaseClassPriority, BaseClassScaleFactor));
let Size = 16;
let GeneratePressureSet = 0;
- // This is the base class for VGPR{128..255}_{LO16,HI16}.
+ // This is the base class for VGPR{128..1023}_{LO16,HI16}.
let BaseClassOrder = 17;
}
@@ -633,7 +635,7 @@ def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
// VGPR 32-bit registers
// i16/f16 only on VI+
def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
- (add (sequence "VGPR%u", 0, 255))> {
+ (add (sequence "VGPR%u", 0, 1023))> {
let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
let Size = 32;
let Weight = 1;
@@ -648,46 +650,55 @@ def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg1
let Size = 32;
let Weight = 1;
}
+
+// Identical to VGPR_32 except it only contains the low 256 (Lo256) registers.
+def VGPR_32_Lo256 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
+ (add (sequence "VGPR%u", 0, 255))> {
+ let AllocationPriority = 0;
+ let GeneratePressureSet = 0;
+ let Size = 32;
+ let Weight = 1;
+}
} // End HasVGPR = 1
// VGPR 64-bit registers
-def VGPR_64 : SIRegisterTuples<getSubRegs<2>.ret, VGPR_32, 255, 1, 2, "v">;
+def VGPR_64 : SIRegisterTuples<getSubRegs<2>.ret, VGPR_32, 1023, 1, 2, "v">;
// VGPR 96-bit registers
-def VGPR_96 : SIRegisterTuples<getSubRegs<3>.ret, VGPR_32, 255, 1, 3, "v">;
+def VGPR_96 : SIRegisterTuples<getSubRegs<3>.ret, VGPR_32, 1023, 1, 3, "v">;
// VGPR 128-bit registers
-def VGPR_128 : SIRegisterTuples<getSubRegs<4>.ret, VGPR_32, 255, 1, 4, "v">;
+def VGPR_128 : SIRegisterTuples<getSubRegs<4>.ret, VGPR_32, 1023, 1, 4, "v">;
// VGPR 160-bit registers
-def VGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, VGPR_32, 255, 1, 5, "v">;
+def VGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, VGPR_32, 1023, 1, 5, "v">;
// VGPR 192-bit registers
-def VGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, VGPR_32, 255, 1, 6, "v">;
+def VGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, VGPR_32, 1023, 1, 6, "v">;
// VGPR 224-bit registers
-def VGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, VGPR_32, 255, 1, 7, "v">;
+def VGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, VGPR_32, 1023, 1, 7, "v">;
// VGPR 256-bit registers
-def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 255, 1, 8, "v">;
+def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 1023, 1, 8, "v">;
// VGPR 288-bit registers
-def VGPR_288 : SIRegisterTuples<getSubRegs<9>.ret, VGPR_32, 255, 1, 9, "v">;
+def VGPR_288 : SIRegisterTuples<getSubRegs<9>.ret, VGPR_32, 1023, 1, 9, "v">;
// VGPR 320-bit registers
-def VGPR_320 : SIRegisterTuples<getSubRegs<10>.ret, VGPR_32, 255, 1, 10, "v">;
+def VGPR_320 : SIRegisterTuples<getSubRegs<10>.ret, VGPR_32, 1023, 1, 10, "v">;
// VGPR 352-bit registers
-def VGPR_352 : SIRegisterTuples<getSubRegs<11>.ret, VGPR_32, 255, 1, 11, "v">;
+def VGPR_352 : SIRegisterTuples<getSubRegs<11>.ret, VGPR_32, 1023, 1, 11, "v">;
// VGPR 384-bit registers
-def VGPR_384 : SIRegisterTuples<getSubRegs<12>.ret, VGPR_32, 255, 1, 12, "v">;
+def VGPR_384 : SIRegisterTuples<getSubRegs<12>.ret, VGPR_32, 1023, 1, 12, "v">;
// VGPR 512-bit registers
-def VGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, VGPR_32, 255, 1, 16, "v">;
+def VGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, VGPR_32, 1023, 1, 16, "v">;
// VGPR 1024-bit registers
-def VGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, VGPR_32, 255, 1, 32, "v">;
+def VGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, VGPR_32, 1023, 1, 32, "v">;
let HasAGPR = 1 in {
def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
@@ -976,14 +987,14 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> :
// Requires n v_mov_b32 to copy
let CopyCost = numRegs;
- // Since we only have 5 bits for the RegisterClass Allocation Priorty, and since we use the
- // 5th bit for BaseClassPriority, we need to encode the SizePriority into 4 bits. As a result
- // of this encoding, for registers with numRegs 15 or 16, we give SizePriority of 14, and for
- // regsters with numRegs 17+ we give SizePriority of 15. In practice, there is only one
- // RegClass per Vector Register type in each of these groups (i.e. numRegs = 15,16 : {VReg_512},
- // and numRegs = 17+ : {VReg_1024}). Therefore, we have not lost any info by compressing.
+ // Since we only have 5 bits for the RegisterClass Allocation Priorty, and since we use the
+ // 5th bit for BaseClassPriority, we need to encode the SizePriority into 4 bits. As a result
+ // of this encoding, for registers with numRegs 15 or 16, we give SizePriority of 14, and for
+ // regsters with numRegs 17+ we give SizePriority of 15. In practice, there is only one
+ // RegClass per Vector Register type in each of these groups (i.e. numRegs = 15,16 : {VReg_512},
+ // and numRegs = 17+ : {VReg_1024}). Therefore, we have not lost any info by compressing.
defvar SizePrioriity = !if(!le(numRegs, 14), !sub(numRegs, 1), !if(!le(numRegs, 16), 14, 15));
-
+
let AllocationPriority = !add(SizePrioriity, !mul(BaseClassPriority, BaseClassScaleFactor));
let Weight = numRegs;
}
@@ -1003,6 +1014,10 @@ multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
let BaseClassOrder = !sub(!mul(numRegs, 32), 1);
let RegTupleAlignUnits = 2;
}
+
+ // Aligned register tuples starting with low 256 vgprs
+ def _Lo256_Align2 : VRegClassBase<numRegs, regTypes,
+ (trunc (decimate regList, 2), !div(!sub(258, numRegs), 2))>;
}
}
@@ -1100,6 +1115,14 @@ def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2
let Size = 32;
}
+def VS_32_Lo256 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
+ (add VGPR_32_Lo256, SReg_32, LDS_DIRECT_CLASS)> {
+ let isAllocatable = 0;
+ let HasVGPR = 1;
+ let HasSGPR = 1;
+ let Size = 32;
+}
+
def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_64)> {
let isAllocatable = 0;
let HasVGPR = 1;
@@ -1107,12 +1130,27 @@ def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_6
let Size = 64;
}
+def VS_64_Align2 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32,
+ (add VReg_64_Align2, SReg_64)> {
+ let isAllocatable = 0;
+ let HasVGPR = 1;
+ let HasSGPR = 1;
+ let Size = 64;
+}
+
def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_32)> {
let HasVGPR = 1;
let HasAGPR = 1;
let BaseClassPriority = 0;
let Size = 32;
}
+
+def VS_64_Lo256 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64_Lo256_Align2, SReg_64)> {
+ let isAllocatable = 0;
+ let HasVGPR = 1;
+ let HasSGPR = 1;
+ let Size = 64;
+}
} // End GeneratePressureSet = 0
// Define a register tuple class, along with one requiring an even
@@ -1249,15 +1287,15 @@ class SrcReg9<RegisterClass regClass> : RegisterOperand<regClass> {
let DecoderMethod = "decodeSrcReg9<" # regClass.Size # ">";
}
-def VRegSrc_32 : SrcReg9<VGPR_32>;
-def VRegSrc_64 : SrcReg9<VReg_64>;
-def VRegSrc_96 : SrcReg9<VReg_96>;
-def VRegSrc_128: SrcReg9<VReg_128>;
-def VRegSrc_192: SrcReg9<VReg_192>;
-def VRegSrc_256: SrcReg9<VReg_256>;
-def VRegSrc_384: SrcReg9<VReg_384>;
-def VRegSrc_512: SrcReg9<VReg_512>;
-def VRegSrc_1024: SrcReg9<VReg_1024>;
+def VRegSrc_32 : SrcReg9<VGPR_32>;
+def VRegSrc_64 : SrcReg9<VReg_64>;
+def VRegSrc_96 : SrcReg9<VReg_96>;
+def VRegSrc_128 : SrcReg9<VReg_128>;
+def VRegSrc_192 : SrcReg9<VReg_192>;
+def VRegSrc_256 : SrcReg9<VReg_256>;
+def VRegSrc_384 : SrcReg9<VReg_384>;
+def VRegSrc_512 : SrcReg9<VReg_512>;
+def VRegSrc_1024 : SrcReg9<VReg_1024>;
def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32>;
// True 16 Operands
@@ -1269,30 +1307,41 @@ def VRegSrc_fake16: SrcReg9<VGPR_32> {
let EncoderMethod = "getMachineOpValueT16";
}
//===----------------------------------------------------------------------===//
-// VGPRSrc_*
+// VGPROp_* An 8-bit RegisterOperand wrapper for a VGPR
//===----------------------------------------------------------------------===//
-// An 8-bit RegisterOperand wrapper for a VGPR
-def VGPRSrc_32 : RegisterOperand<VGPR_32> {
- let DecoderMethod = "DecodeVGPR_32RegisterClass";
+class VGPROp<RegisterClass regClass> : RegisterOperand<regClass> {
+ let DecoderMethod = "Decode" # regClass # "RegisterClass";
}
-def VGPRSrc_32_Lo128 : RegisterOperand<VGPR_32_Lo128> {
- let DecoderMethod = "DecodeVGPR_32RegisterClass";
+class VGPROp_Align2<RegisterClass regClass> : RegisterOperand<!cast<RegisterClass>(regClass#_Align2)> {
+ let DecoderMethod = "Decode" # regClass # "RegisterClass";
+}
+multiclass VGPROp_Aligned<RegisterClass regClass> {
+ def _Align1 : VGPROp<regClass>;
+ def _Align2 : VGPROp_Align2<regClass>;
}
-def VGPRSrc_96 : RegisterOperand<VReg_96> {
- let DecoderMethod = "DecodeVReg_96RegisterClass";
+// TODO: These cases should use default target alignment
+def VGPROp_16 : VGPROp<VGPR_16> {
+ let EncoderMethod = "getMachineOpValueT16";
}
+def VGPROp_32 : VGPROp<VGPR_32>;
-def VGPRSrc_16_Lo128 : RegisterOperand<VGPR_16_Lo128> {
+foreach size = ["64", "96", "128", "160", "192", "224", "256", "288", "512", "1024"] in {
+ def VGPROp_#size : VGPROp<!cast<RegisterClass>("VReg_"#size)>;
+}
+
+foreach size = ["64", "96", "128", "160", "256", "1024"] in {
+ defm VGPROp_#size : VGPROp_Aligned<!cast<RegisterClass>("VReg_"#size)>;
+}
+
+def VGPROp_16_Lo128 : RegisterOperand<VGPR_16_Lo128> {
let DecoderMethod = "DecodeVGPR_16_Lo128RegisterClass";
let EncoderMethod = "getMachineOpValueT16Lo128";
}
-// True 16 operands.
-def VGPRSrc_16 : RegisterOperand<VGPR_16> {
- let DecoderMethod = "DecodeVGPR_16RegisterClass";
- let EncoderMethod = "getMachineOpValueT16";
+def VGPROp_32_Lo128 : RegisterOperand<VGPR_32_Lo128> {
+ let DecoderMethod = "DecodeVGPR_32RegisterClass";
}
//===----------------------------------------------------------------------===//
@@ -1321,7 +1370,9 @@ def VCSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_FP64">;
def VCSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2INT16">;
def VCSrc_v2bf16: SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2BF16">;
def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2FP16">;
+def VCSrc_b32_Lo256 : SrcRegOrImm9 <VS_32_Lo256, "OPERAND_REG_INLINE_C_INT32">;
def VCSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_V2INT32">;
+def VCSrc_b64_Lo256 : SrcRegOrImm9 <VS_64_Lo256, "OPERAND_REG_INLINE_C_INT64">;
// True 16 Operands
def VCSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_INT16">;
@@ -1372,11 +1423,14 @@ class AVLdStOperand<RegisterClass regClass>
: AVOperand<regClass, "decodeAVLdSt">;
def AVLdSt_32 : AVLdStOperand<AV_32>;
-def AVLdSt_64 : AVLdStOperand<AV_64>;
-def AVLdSt_96 : AVLdStOperand<AV_96>;
-def AVLdSt_128 : AVLdStOperand<AV_128>;
-def AVLdSt_160 : AVLdStOperand<AV_160>;
-def AVLdSt_1024 : AVLdStOperand<AV_1024>;
+
+foreach size = ["64", "96", "128", "160", "256", "1024" ] in {
+ // TODO: These cases should use target align variant
+ def AVLdSt_#size : AVLdStOperand<!cast<RegisterClass>("AV_"#size)>;
+
+ def AVLdSt_#size#_Align1 : AVLdStOperand<!cast<RegisterClass>("AV_"#size)>;
+ def AVLdSt_#size#_Align2 : AVLdStOperand<!cast<RegisterClass>("AV_"#size#_Align2)>;
+}
//===----------------------------------------------------------------------===//
// ACSrc_* Operands with an AGPR or an inline constant
@@ -1395,3 +1449,59 @@ def AISrc_512_f32 : SrcRegOrImmA9 <AReg_512, "OPERAND_REG_INLINE_AC_FP32">;
def AISrc_512_b32 : SrcRegOrImmA9 <AReg_512, "OPERAND_REG_INLINE_AC_INT32">;
def AISrc_1024_f32 : SrcRegOrImmA9 <AReg_1024, "OPERAND_REG_INLINE_AC_FP32">;
def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024, "OPERAND_REG_INLINE_AC_INT32">;
+
+//===----------------------------------------------------------------------===//
+// Tablegen programming utilities
+//===----------------------------------------------------------------------===//
+
+/// Helper to extract the register class from a single instruction operand
+/// \p Op, which may be a RegisterOperand (in which case its RegClass field
+/// is used) or a direct RegisterClass reference. The result, `ret`, is cast
+/// to SIRegisterClass in both cases.
+class getRegClassFromOp<DAGOperand Op> {
+ SIRegisterClass ret = !if(
+ !isa<RegisterOperand>(Op),
+ !cast<SIRegisterClass>(!cast<RegisterOperand>(Op).RegClass),
+ !cast<SIRegisterClass>(Op));
+}
+
+/// Check if the operand will use an AV_* class, i.e. the operand's register
+/// class has both HasAGPR and HasVGPR set.
+class OperandIsAV<DAGOperand Op> {
+ defvar reg_class = getRegClassFromOp<Op>.ret;
+ bit ret = !and(reg_class.HasAGPR, reg_class.HasVGPR);
+}
+
+/// Check if the operand will use an AGPR class, i.e. the operand's register
+/// class has HasAGPR set and HasVGPR clear.
+class OperandIsAGPR<DAGOperand Op> {
+ defvar reg_class = getRegClassFromOp<Op>.ret;
+ bit ret = !and(reg_class.HasAGPR, !not(reg_class.HasVGPR));
+}
+
+/// Check if the operand will use a VGPR class, i.e. the operand's register
+/// class has HasVGPR set and HasAGPR clear.
+class OperandIsVGPR<DAGOperand Op> {
+ defvar reg_class = getRegClassFromOp<Op>.ret;
+ bit ret = !and(reg_class.HasVGPR, !not(reg_class.HasAGPR));
+}
+
+/// OperandIsAV applied to the operand named "vdst" in \p OperandList.
+class VDstOperandIsAV<dag OperandList> {
+ bit ret = OperandIsAV<!getdagarg<DAGOperand>(OperandList, "vdst")>.ret;
+}
+
+/// OperandIsAGPR applied to the operand named "vdst" in \p OperandList.
+class VDstOperandIsAGPR<dag OperandList> {
+ bit ret = OperandIsAGPR<!getdagarg<DAGOperand>(OperandList, "vdst")>.ret;
+}
+
+/// OperandIsAV applied to the operand named "data0" in \p OperandList.
+class Data0OperandIsAV<dag OperandList> {
+ bit ret = OperandIsAV<!getdagarg<DAGOperand>(OperandList, "data0")>.ret;
+}
+
+/// OperandIsAGPR applied to the operand named "data0" in \p OperandList.
+class Data0OperandIsAGPR<dag OperandList> {
+ bit ret = OperandIsAGPR<!getdagarg<DAGOperand>(OperandList, "data0")>.ret;
+}
+
+/// OperandIsAV applied to the operand named "vdata" in \p OperandList.
+class VDataOperandIsAV<dag OperandList> {
+ bit ret = OperandIsAV<!getdagarg<DAGOperand>(OperandList, "vdata")>.ret;
+}
+
+/// OperandIsAGPR applied to the operand named "vdata" in \p OperandList.
+class VDataOperandIsAGPR<dag OperandList> {
+ bit ret = OperandIsAGPR<!getdagarg<DAGOperand>(OperandList, "vdata")>.ret;
+}
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 4bda51d1e959..781c61b073db 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -295,7 +295,6 @@ class SM_Pseudo_Atomic<string opName,
let has_soffset = offsets.HasSOffset;
let Constraints = !if(isRet, "$sdst = $sdata", "");
- let DisableEncoding = !if(isRet, "$sdata", "");
}
multiclass SM_Pseudo_Atomics<RegisterClass baseClass,
@@ -678,7 +677,6 @@ class SMEM_Atomic_Real_vi <bits<8> op, SM_Atomic_Pseudo ps>
bits<7> sdata;
let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
let cpol{CPolBit.GLC} = ps.glc;
let Inst{12-6} = !if(ps.glc, sdst{6-0}, sdata{6-0});
@@ -1295,7 +1293,6 @@ class SMEM_Atomic_Real_gfx10 <bits<8> op, SM_Atomic_Pseudo ps>
bits<7> sdata;
let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
let cpol{CPolBit.GLC} = ps.glc;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index a003a46191a8..12a27db241c4 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -583,7 +583,6 @@ class SOP2_Real<SOP_Pseudo ps, string name = ps.Mnemonic> :
let mayLoad = ps.mayLoad;
let mayStore = ps.mayStore;
let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
let Uses = ps.Uses;
let Defs = ps.Defs;
let isConvergent = ps.isConvergent;
@@ -934,7 +933,7 @@ let SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1,
>;
} // End isReMaterializable = 1
- let Constraints = "$sdst = $src2", DisableEncoding="$src2",
+ let Constraints = "$sdst = $src2",
isCommutable = 1, AddedComplexity = 20 in {
def S_FMAC_F32 : SOP2_Pseudo<
"s_fmac_f32", (outs SReg_32:$sdst),
@@ -949,7 +948,7 @@ let SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1,
"$sdst, $src0, $src1",
[(set f16:$sdst, (UniformTernaryFrag<any_fma> SSrc_f16:$src0, SSrc_f16:$src1, SReg_32:$src2))]
>;
- } // End Constraints = "$sdst = $src2", DisableEncoding="$src2",
+ } // End Constraints = "$sdst = $src2",
// isCommutable = 1, AddedComplexity = 20
} // End SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1,
// Uses = [MODE], SchedRW = [WriteSFPU]
@@ -994,7 +993,6 @@ class SOPK_Real<SOPK_Pseudo ps, string name = ps.Mnemonic> :
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
let AsmMatchConverter = ps.AsmMatchConverter;
- let DisableEncoding = ps.DisableEncoding;
let Constraints = ps.Constraints;
let SchedRW = ps.SchedRW;
let mayLoad = ps.mayLoad;
@@ -1116,8 +1114,7 @@ def S_CMPK_LT_U32 : SOPK_SCC <"s_cmpk_lt_u32", "s_cmp_lt_u32", 0>;
def S_CMPK_LE_U32 : SOPK_SCC <"s_cmpk_le_u32", "s_cmp_le_u32", 0>;
} // End isCompare = 1
-let isCommutable = 1, DisableEncoding = "$src0",
- Constraints = "$sdst = $src0" in {
+let isCommutable = 1, Constraints = "$sdst = $src0" in {
let Defs = [SCC] in
def S_ADDK_I32 : SOPK_32TIE <"s_addk_i32">;
def S_MULK_I32 : SOPK_32TIE <"s_mulk_i32">;
@@ -1656,6 +1653,11 @@ let OtherPredicates = [HasImageInsts] in {
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
}
+
+// SOPP pseudo with a single 16-bit immediate operand, only available under
+// the HasWaitXcnt predicate; presumably the "soft" form of s_wait_xcnt.
+// NOTE(review): the mnemonic string is empty, unlike the neighbouring *_soft
+// pseudos (e.g. "s_soft_wait_kmcnt") - confirm this is intentional.
+let SubtargetPredicate = HasWaitXcnt in {
+ def S_WAIT_XCNT_soft : SOPP_Pseudo<"", (ins s16imm:$simm16), "$simm16">;
+}
+
// Represents the point at which a wave must wait for all outstanding direct loads to LDS.
// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts.
@@ -1847,6 +1849,13 @@ let SubtargetPredicate = HasWaitXcnt, hasSideEffects = 1 in {
SOPP_Pseudo<"s_wait_xcnt", (ins s16imm:$simm16), "$simm16">;
} // End SubtargetPredicate = HasWaitXcnt, hasSideEffects = 1
+// s_set_vgpr_msb takes a 16-bit immediate. It is modelled as having side
+// effects and as defining MODE, and is only available on subtargets with
+// Has1024AddressableVGPRs.
+let SubtargetPredicate = Has1024AddressableVGPRs in {
+ def S_SET_VGPR_MSB : SOPP_Pseudo<"s_set_vgpr_msb" , (ins i16imm:$simm16), "$simm16"> {
+ let hasSideEffects = 1;
+ let Defs = [MODE];
+ }
+}
+
//===----------------------------------------------------------------------===//
// SOP1 Patterns
//===----------------------------------------------------------------------===//
@@ -2694,6 +2703,7 @@ defm S_WAIT_STORECNT_DSCNT : SOPP_Real_32_gfx12<0x049>;
//===----------------------------------------------------------------------===//
// SOPP - GFX1250 only.
//===----------------------------------------------------------------------===//
+defm S_SET_VGPR_MSB : SOPP_Real_32_gfx12<0x006>;
defm S_SETPRIO_INC_WG : SOPP_Real_32_gfx12<0x03e>;
defm S_WAIT_XCNT : SOPP_Real_32_gfx12<0x045>;
defm S_WAIT_ASYNCCNT : SOPP_Real_32_gfx12<0x04a>;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index c740b5e0f09d..14ebbf8e9c92 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -116,6 +116,8 @@ static constexpr CustomOperand MsgOperands[] = {
{{"MSG_RTN_GET_TBA"}, ID_RTN_GET_TBA, isGFX11Plus},
{{"MSG_RTN_GET_TBA_TO_PC"}, ID_RTN_GET_TBA_TO_PC, isGFX11Plus},
{{"MSG_RTN_GET_SE_AID_ID"}, ID_RTN_GET_SE_AID_ID, isGFX12Plus},
+ {{"MSG_RTN_GET_CLUSTER_BARRIER_STATE"}, ID_RTN_GET_CLUSTER_BARRIER_STATE,
+ isGFX1250},
};
static constexpr CustomOperand SysMsgOperands[] = {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 18ee9c16b3ff..9f4f42185d9a 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -934,6 +934,10 @@ std::optional<unsigned> InstInfo::getInvalidCompOperandIndex(
if (!OpXRegs[CompOprIdx] || !OpYRegs[CompOprIdx])
continue;
+ if (getVGPREncodingMSBs(OpXRegs[CompOprIdx], MRI) !=
+ getVGPREncodingMSBs(OpYRegs[CompOprIdx], MRI))
+ return CompOprIdx;
+
if (SkipSrc && CompOprIdx >= Component::DST_NUM)
continue;
@@ -1376,6 +1380,9 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
? *EnableWavefrontSize32
: STI->getFeatureBits().test(FeatureWavefrontSize32);
+ if (STI->getFeatureBits().test(Feature1024AddressableVGPRs))
+ return IsWave32 ? 16 : 8;
+
return IsWave32 ? 8 : 4;
}
@@ -1396,7 +1403,10 @@ unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; }
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI,
unsigned DynamicVGPRBlockSize) {
- if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+ const auto &Features = STI->getFeatureBits();
+ if (Features.test(FeatureGFX1250Insts))
+ return Features.test(FeatureWavefrontSize32) ? 1024 : 512;
+ if (Features.test(FeatureGFX90AInsts))
return 512;
// Temporarily check the subtarget feature, until we fully switch to using
@@ -2720,13 +2730,6 @@ bool isInlineValue(unsigned Reg) {
#undef CASE_GFXPRE11_GFX11PLUS_TO
#undef MAP_REG2REG
-bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) {
- assert(OpNo < Desc.NumOperands);
- unsigned OpType = Desc.operands()[OpNo].OperandType;
- return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
- OpType <= AMDGPU::OPERAND_SRC_LAST;
-}
-
bool isKImmOperand(const MCInstrDesc &Desc, unsigned OpNo) {
assert(OpNo < Desc.NumOperands);
unsigned OpType = Desc.operands()[OpNo].OperandType;
@@ -2776,6 +2779,7 @@ unsigned getRegBitWidth(unsigned RCID) {
return 16;
case AMDGPU::SGPR_32RegClassID:
case AMDGPU::VGPR_32RegClassID:
+ case AMDGPU::VGPR_32_Lo256RegClassID:
case AMDGPU::VRegOrLds_32RegClassID:
case AMDGPU::AGPR_32RegClassID:
case AMDGPU::VS_32RegClassID:
@@ -2794,6 +2798,8 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::AReg_64_Align2RegClassID:
case AMDGPU::AV_64RegClassID:
case AMDGPU::AV_64_Align2RegClassID:
+ case AMDGPU::VReg_64_Lo256_Align2RegClassID:
+ case AMDGPU::VS_64_Lo256RegClassID:
return 64;
case AMDGPU::SGPR_96RegClassID:
case AMDGPU::SReg_96RegClassID:
@@ -2803,6 +2809,7 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::AReg_96_Align2RegClassID:
case AMDGPU::AV_96RegClassID:
case AMDGPU::AV_96_Align2RegClassID:
+ case AMDGPU::VReg_96_Lo256_Align2RegClassID:
return 96;
case AMDGPU::SGPR_128RegClassID:
case AMDGPU::SReg_128RegClassID:
@@ -2813,6 +2820,7 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::AV_128RegClassID:
case AMDGPU::AV_128_Align2RegClassID:
case AMDGPU::SReg_128_XNULLRegClassID:
+ case AMDGPU::VReg_128_Lo256_Align2RegClassID:
return 128;
case AMDGPU::SGPR_160RegClassID:
case AMDGPU::SReg_160RegClassID:
@@ -2822,6 +2830,7 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::AReg_160_Align2RegClassID:
case AMDGPU::AV_160RegClassID:
case AMDGPU::AV_160_Align2RegClassID:
+ case AMDGPU::VReg_160_Lo256_Align2RegClassID:
return 160;
case AMDGPU::SGPR_192RegClassID:
case AMDGPU::SReg_192RegClassID:
@@ -2831,6 +2840,7 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::AReg_192_Align2RegClassID:
case AMDGPU::AV_192RegClassID:
case AMDGPU::AV_192_Align2RegClassID:
+ case AMDGPU::VReg_192_Lo256_Align2RegClassID:
return 192;
case AMDGPU::SGPR_224RegClassID:
case AMDGPU::SReg_224RegClassID:
@@ -2840,6 +2850,7 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::AReg_224_Align2RegClassID:
case AMDGPU::AV_224RegClassID:
case AMDGPU::AV_224_Align2RegClassID:
+ case AMDGPU::VReg_224_Lo256_Align2RegClassID:
return 224;
case AMDGPU::SGPR_256RegClassID:
case AMDGPU::SReg_256RegClassID:
@@ -2850,6 +2861,7 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::AV_256RegClassID:
case AMDGPU::AV_256_Align2RegClassID:
case AMDGPU::SReg_256_XNULLRegClassID:
+ case AMDGPU::VReg_256_Lo256_Align2RegClassID:
return 256;
case AMDGPU::SGPR_288RegClassID:
case AMDGPU::SReg_288RegClassID:
@@ -2859,6 +2871,7 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::AReg_288_Align2RegClassID:
case AMDGPU::AV_288RegClassID:
case AMDGPU::AV_288_Align2RegClassID:
+ case AMDGPU::VReg_288_Lo256_Align2RegClassID:
return 288;
case AMDGPU::SGPR_320RegClassID:
case AMDGPU::SReg_320RegClassID:
@@ -2868,6 +2881,7 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::AReg_320_Align2RegClassID:
case AMDGPU::AV_320RegClassID:
case AMDGPU::AV_320_Align2RegClassID:
+ case AMDGPU::VReg_320_Lo256_Align2RegClassID:
return 320;
case AMDGPU::SGPR_352RegClassID:
case AMDGPU::SReg_352RegClassID:
@@ -2877,6 +2891,7 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::AReg_352_Align2RegClassID:
case AMDGPU::AV_352RegClassID:
case AMDGPU::AV_352_Align2RegClassID:
+ case AMDGPU::VReg_352_Lo256_Align2RegClassID:
return 352;
case AMDGPU::SGPR_384RegClassID:
case AMDGPU::SReg_384RegClassID:
@@ -2886,6 +2901,7 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::AReg_384_Align2RegClassID:
case AMDGPU::AV_384RegClassID:
case AMDGPU::AV_384_Align2RegClassID:
+ case AMDGPU::VReg_384_Lo256_Align2RegClassID:
return 384;
case AMDGPU::SGPR_512RegClassID:
case AMDGPU::SReg_512RegClassID:
@@ -2895,6 +2911,7 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::AReg_512_Align2RegClassID:
case AMDGPU::AV_512RegClassID:
case AMDGPU::AV_512_Align2RegClassID:
+ case AMDGPU::VReg_512_Lo256_Align2RegClassID:
return 512;
case AMDGPU::SGPR_1024RegClassID:
case AMDGPU::SReg_1024RegClassID:
@@ -2904,6 +2921,7 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::AReg_1024_Align2RegClassID:
case AMDGPU::AV_1024RegClassID:
case AMDGPU::AV_1024_Align2RegClassID:
+ case AMDGPU::VReg_1024_Lo256_Align2RegClassID:
return 1024;
default:
llvm_unreachable("Unexpected register class");
@@ -3206,8 +3224,11 @@ bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
+// Check whether \p EncodedOffset is a legal signed SMRD immediate offset.
+// On GFX12+ the offset must fit in a signed 24-bit field, and buffer accesses
+// (\p IsBuffer) additionally reject negative offsets. On other targets only
+// non-buffer accesses on subtargets with hasSMRDSignedImmOffset() are
+// accepted, and the offset must fit in a signed 21-bit field.
bool isLegalSMRDEncodedSignedOffset(const MCSubtargetInfo &ST,
int64_t EncodedOffset, bool IsBuffer) {
- if (isGFX12Plus(ST))
+ if (isGFX12Plus(ST)) {
+ if (IsBuffer && EncodedOffset < 0)
+ return false;
return isInt<24>(EncodedOffset);
+ }
return !IsBuffer && hasSMRDSignedImmOffset(ST) && isInt<21>(EncodedOffset);
}
@@ -3321,6 +3342,112 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,
: getGfx9BufferFormatInfo(Format);
}
+/// Find the VGPR register class that the physical register \p Reg belongs to.
+///
+/// The candidate classes are probed in the order listed below (VGPR_16,
+/// VGPR_32, then the VReg_* classes by increasing width) and the first class
+/// that contains \p Reg is returned.
+///
+/// \returns the matching class, or nullptr if none of the listed classes
+/// contains \p Reg.
+const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg,
+                                           const MCRegisterInfo &MRI) {
+  // Constant lookup table: keep it in static storage rather than rebuilding
+  // it on the stack on every call.
+  static constexpr unsigned VGPRClasses[] = {
+      AMDGPU::VGPR_16RegClassID,  AMDGPU::VGPR_32RegClassID,
+      AMDGPU::VReg_64RegClassID,  AMDGPU::VReg_96RegClassID,
+      AMDGPU::VReg_128RegClassID, AMDGPU::VReg_160RegClassID,
+      AMDGPU::VReg_192RegClassID, AMDGPU::VReg_224RegClassID,
+      AMDGPU::VReg_256RegClassID, AMDGPU::VReg_288RegClassID,
+      AMDGPU::VReg_320RegClassID, AMDGPU::VReg_352RegClassID,
+      AMDGPU::VReg_384RegClassID, AMDGPU::VReg_512RegClassID,
+      AMDGPU::VReg_1024RegClassID};
+
+  for (unsigned RCID : VGPRClasses) {
+    const MCRegisterClass &RC = MRI.getRegClass(RCID);
+    if (RC.contains(Reg))
+      return &RC;
+  }
+
+  return nullptr;
+}
+
+/// Compute the high register-index bits of \p Reg: its hardware encoding is
+/// masked with HWEncoding::REG_IDX_MASK and the low 8 bits are shifted out.
+unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI) {
+  const unsigned RegIdx =
+      MRI.getEncodingValue(Reg) & AMDGPU::HWEncoding::REG_IDX_MASK;
+  return RegIdx >> 8;
+}
+
+/// Map \p Reg to the register of the same VGPR class whose index has \p MSBs
+/// placed above the low 8 index bits.
+///
+/// \returns AMDGPU::NoRegister if the register index of \p Reg is already
+/// 0x100 or above, or if getVGPRPhysRegClass() finds no class for \p Reg.
+MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs,
+                          const MCRegisterInfo &MRI) {
+  const unsigned LowIdx =
+      MRI.getEncodingValue(Reg) & AMDGPU::HWEncoding::REG_IDX_MASK;
+  // Only registers whose index fits in the low 8 bits can be remapped.
+  if (LowIdx > 0xff)
+    return AMDGPU::NoRegister;
+
+  if (const MCRegisterClass *VGPRClass = getVGPRPhysRegClass(Reg, MRI))
+    return VGPRClass->getRegister(LowIdx | (MSBs << 8));
+  return AMDGPU::NoRegister;
+}
+
+/// Select the operand-name table(s) for the instruction described by \p Desc.
+///
+/// Each table has 4 entries; AMDGPU::OpName::NUM_OPERAND_NAMES appears to be
+/// used as the placeholder for a slot with no operand (TODO confirm with the
+/// consumers of these tables).
+///
+/// \returns {table, nullptr} for VOP1/VOP2/VOP3/VOP3P/VOPC/DPP, DS, FLAT,
+/// MUBUF/MTBUF and VIMAGE instructions, {X table, Y table} for VOPD, and
+/// {nullptr, nullptr} otherwise (including the WMMA LD_SCALE opcodes).
+std::pair<const AMDGPU::OpName *, const AMDGPU::OpName *>
+getVGPRLoweringOperandTables(const MCInstrDesc &Desc) {
+  static const AMDGPU::OpName VOPOps[4] = {
+      AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2,
+      AMDGPU::OpName::vdst};
+  static const AMDGPU::OpName VDSOps[4] = {
+      AMDGPU::OpName::addr, AMDGPU::OpName::data0, AMDGPU::OpName::data1,
+      AMDGPU::OpName::vdst};
+  static const AMDGPU::OpName FLATOps[4] = {
+      AMDGPU::OpName::vaddr, AMDGPU::OpName::vdata,
+      AMDGPU::OpName::NUM_OPERAND_NAMES, AMDGPU::OpName::vdst};
+  static const AMDGPU::OpName BUFOps[4] = {
+      AMDGPU::OpName::vaddr, AMDGPU::OpName::NUM_OPERAND_NAMES,
+      AMDGPU::OpName::NUM_OPERAND_NAMES, AMDGPU::OpName::vdata};
+  static const AMDGPU::OpName VIMGOps[4] = {
+      AMDGPU::OpName::vaddr0, AMDGPU::OpName::vaddr1, AMDGPU::OpName::vaddr2,
+      AMDGPU::OpName::vdata};
+
+  // For VOPD instructions MSB of a corresponding Y component operand VGPR
+  // address is supposed to match X operand, otherwise VOPD shall not be
+  // combined.
+  static const AMDGPU::OpName VOPDOpsX[4] = {
+      AMDGPU::OpName::src0X, AMDGPU::OpName::vsrc1X, AMDGPU::OpName::vsrc2X,
+      AMDGPU::OpName::vdstX};
+  static const AMDGPU::OpName VOPDOpsY[4] = {
+      AMDGPU::OpName::src0Y, AMDGPU::OpName::vsrc1Y, AMDGPU::OpName::vsrc2Y,
+      AMDGPU::OpName::vdstY};
+
+  // Keep the full 64-bit width of MCInstrDesc::TSFlags instead of implicitly
+  // narrowing it.
+  uint64_t TSFlags = Desc.TSFlags;
+
+  // The order of the checks below is significant: the first matching
+  // encoding class decides which table is returned.
+  if (TSFlags &
+      (SIInstrFlags::VOP1 | SIInstrFlags::VOP2 | SIInstrFlags::VOP3 |
+       SIInstrFlags::VOP3P | SIInstrFlags::VOPC | SIInstrFlags::DPP)) {
+    // LD_SCALE operands ignore MSB.
+    if (Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32 ||
+        Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32_gfx1250 ||
+        Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64 ||
+        Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64_gfx1250)
+      return {};
+    return {VOPOps, nullptr};
+  }
+
+  if (TSFlags & SIInstrFlags::DS)
+    return {VDSOps, nullptr};
+
+  if (TSFlags & SIInstrFlags::FLAT)
+    return {FLATOps, nullptr};
+
+  if (TSFlags & (SIInstrFlags::MUBUF | SIInstrFlags::MTBUF))
+    return {BUFOps, nullptr};
+
+  if (TSFlags & SIInstrFlags::VIMAGE)
+    return {VIMGOps, nullptr};
+
+  if (AMDGPU::isVOPD(Desc.getOpcode()))
+    return {VOPDOpsX, VOPDOpsY};
+
+  assert(!(TSFlags & SIInstrFlags::MIMG));
+
+  if (TSFlags & (SIInstrFlags::VSAMPLE | SIInstrFlags::EXP))
+    llvm_unreachable("Sample and export VGPR lowering is not implemented and"
+                     " these instructions are not expected on gfx1250");
+
+  return {};
+}
+
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode) {
uint64_t TSFlags = MII.get(Opcode).TSFlags;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 70dfb63cbe04..3fcd16f9290b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1517,6 +1517,7 @@ constexpr bool mayTailCallThisCC(CallingConv::ID CC) {
switch (CC) {
case CallingConv::C:
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_Gfx_WholeWave:
return true;
default:
return canGuaranteeTCO(CC);
@@ -1590,7 +1591,14 @@ bool isInlineValue(unsigned Reg);
/// Is this an AMDGPU specific source operand? These include registers,
/// inline constants, literals and mandatory literals (KImm).
-bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo);
+constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo) {
+  // A source operand has a type inside [OPERAND_SRC_FIRST, OPERAND_SRC_LAST].
+  return !(OpInfo.OperandType < AMDGPU::OPERAND_SRC_FIRST ||
+           OpInfo.OperandType > AMDGPU::OPERAND_SRC_LAST);
+}
+
+/// Overload that looks up operand number \p OpNo of \p Desc and applies the
+/// same check to it.
+inline bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) {
+  const MCOperandInfo &OpInfo = Desc.operands()[OpNo];
+  return isSISrcOperand(OpInfo);
+}
/// Is this a KImm operand?
bool isKImmOperand(const MCInstrDesc &Desc, unsigned OpNo);
@@ -1778,6 +1786,25 @@ bool isIntrinsicSourceOfDivergence(unsigned IntrID);
/// \returns true if the intrinsic is uniform
bool isIntrinsicAlwaysUniform(unsigned IntrID);
+/// \returns a register class for the physical register \p Reg if it is a VGPR
+/// or nullptr otherwise.
+const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg,
+ const MCRegisterInfo &MRI);
+
+/// \returns the MODE bits which have to be set by the S_SET_VGPR_MSB for the
+/// physical register \p Reg.
+unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI);
+
+/// If \p Reg is a low VGPR return a corresponding high VGPR with \p MSBs set.
+/// \returns AMDGPU::NoRegister if the register index of \p Reg is 0x100 or
+/// above, or if \p Reg is not contained in any VGPR register class.
+MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs,
+ const MCRegisterInfo &MRI);
+
+/// \returns a table for the opcode with a given \p Desc to map the VGPR MSB
+/// set by the S_SET_VGPR_MSB to one of 4 sources. In case of VOPD returns 2
+/// maps, one for X and one for Y component. Both pointers are null when no
+/// table applies to the instruction.
+std::pair<const AMDGPU::OpName *, const AMDGPU::OpName *>
+getVGPRLoweringOperandTables(const MCInstrDesc &Desc);
+
/// \returns true if a memory instruction supports scale_offset modifier.
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index fd6253daa327..a7a0e33da5e4 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -1061,6 +1061,17 @@ VersionTuple AMDGPUPALMetadata::getPALVersion() {
return VersionTuple(getPALVersion(0), getPALVersion(1));
}
+// Raise the field in a given .hardware_stages entry to at least \p Val: an
+// empty field is set to \p Val, otherwise it becomes the larger of its
+// current unsigned value and \p Val.
+void AMDGPUPALMetadata::updateHwStageMaximum(unsigned CC, StringRef field,
+                                             unsigned Val) {
+  msgpack::MapDocNode StageMap = getHwStage(CC);
+  auto &Entry = StageMap[field];
+  Entry = Entry.isEmpty() ? Val : std::max<unsigned>(Entry.getUInt(), Val);
+}
+
// Set the field in a given .hardware_stages entry
void AMDGPUPALMetadata::setHwStage(unsigned CC, StringRef field, unsigned Val) {
getHwStage(CC)[field] = Val;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
index 4830db5fda50..e50150cc8de9 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -156,6 +156,7 @@ public:
unsigned getPALMinorVersion();
VersionTuple getPALVersion();
+ void updateHwStageMaximum(unsigned CC, StringRef field, unsigned Val);
void setHwStage(unsigned CC, StringRef field, unsigned Val);
void setHwStage(unsigned CC, StringRef field, bool Val);
void setHwStage(unsigned CC, StringRef field, msgpack::Type Type,
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 11c72751dde5..f816d7de27ee 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -71,7 +71,6 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo
let isCodeGenOnly = 0;
let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
@@ -80,7 +79,6 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo
let AsmMatchConverter = ps.AsmMatchConverter;
let AsmVariantName = ps.AsmVariantName;
let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
@@ -584,7 +582,6 @@ def VOP_SWAP_I32 : VOPProfile<[i32, i32, untyped, untyped]> {
let SubtargetPredicate = isGFX9Plus in {
def V_SWAP_B32 : VOP1_Pseudo<"v_swap_b32", VOP_SWAP_I32, [], 1> {
let Constraints = "$vdst = $src1, $vdst1 = $src0";
- let DisableEncoding = "$vdst1,$src1";
let SchedRW = [Write64Bit, Write64Bit];
}
@@ -802,7 +799,6 @@ let SubtargetPredicate = isGFX10Plus in {
def V_SWAPREL_B32 : VOP1_Pseudo<"v_swaprel_b32", VOP_SWAP_I32, [], 1> {
let Constraints = "$vdst = $src1, $vdst1 = $src0";
- let DisableEncoding = "$vdst1,$src1";
let SchedRW = [Write64Bit, Write64Bit];
}
} // End Uses = [M0]
@@ -831,7 +827,6 @@ def VOP_SWAP_I16 : VOPProfile_True16<VOP_I16_I16> {
let SubtargetPredicate = isGFX11Plus in {
def V_SWAP_B16 : VOP1_Pseudo<"v_swap_b16", VOP_SWAP_I16, [], /* VOP1Only= */true> {
let Constraints = "$vdst = $src1, $vdst1 = $src0";
- let DisableEncoding = "$vdst1, $src1";
let SchedRW = [Write64Bit, Write64Bit];
let True16Predicate = UseRealTrue16Insts;
}
@@ -849,7 +844,6 @@ let SubtargetPredicate = HasPrngInst in
defm V_PRNG_B32 : VOP1Inst <"v_prng_b32", VOP_I32_I32, int_amdgcn_prng_b32>;
let Constraints = "$vdst = $vdst_in, $src0_out = $src0",
- DisableEncoding="$vdst_in,$src0_out",
SchedRW = [Write32Bit, Write32Bit],
isConvergent = 1 in {
let SubtargetPredicate = HasPermlane16Swap in {
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 9de7d6d009fe..cff66aaedb11 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -105,7 +105,6 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo
let isCodeGenOnly = 0;
let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
@@ -114,7 +113,6 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo
let AsmMatchConverter = ps.AsmMatchConverter;
let AsmVariantName = ps.AsmVariantName;
let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
@@ -418,12 +416,12 @@ def VOP_MADAK_F16_t16 : VOP_MADAK <f16> {
let IsTrue16 = 1;
let IsRealTrue16 = 1;
let DstRC = getVALUDstForVT<DstVT, 1/*IsTrue16*/, 0/*IsVOP3Encoding*/>.ret;
- let Ins32 = (ins VSrcT_f16_Lo128:$src0, VGPRSrc_16_Lo128:$src1, ImmOpType:$imm);
+ let Ins32 = (ins VSrcT_f16_Lo128:$src0, VGPROp_16_Lo128:$src1, ImmOpType:$imm);
}
def VOP_MADAK_F16_fake16 : VOP_MADAK <f16> {
let IsTrue16 = 1;
let DstRC = getVALUDstForVT_fake16<DstVT>.ret;
- let Ins32 = (ins VSrcFake16_f16_Lo128:$src0, VGPRSrc_32_Lo128:$src1, ImmOpType:$imm);
+ let Ins32 = (ins VSrcFake16_f16_Lo128:$src0, VGPROp_32_Lo128:$src1, ImmOpType:$imm);
}
def VOP_MADAK_F32 : VOP_MADAK <f32>;
def VOP_MADAK_F64 : VOP_MADAK <f64>;
@@ -454,12 +452,12 @@ def VOP_MADMK_F16_t16 : VOP_MADMK <f16> {
let IsTrue16 = 1;
let IsRealTrue16 = 1;
let DstRC = getVALUDstForVT<DstVT, 1/*IsTrue16*/, 0/*IsVOP3Encoding*/>.ret;
- let Ins32 = (ins VSrcT_f16_Lo128:$src0, ImmOpType:$imm, VGPRSrc_16_Lo128:$src1);
+ let Ins32 = (ins VSrcT_f16_Lo128:$src0, ImmOpType:$imm, VGPROp_16_Lo128:$src1);
}
def VOP_MADMK_F16_fake16 : VOP_MADMK <f16> {
let IsTrue16 = 1;
let DstRC = getVALUDstForVT_fake16<DstVT>.ret;
- let Ins32 = (ins VSrcFake16_f16_Lo128:$src0, ImmOpType:$imm, VGPRSrc_32_Lo128:$src1);
+ let Ins32 = (ins VSrcFake16_f16_Lo128:$src0, ImmOpType:$imm, VGPROp_32_Lo128:$src1);
}
def VOP_MADMK_F32 : VOP_MADMK <f32>;
def VOP_MADMK_F64 : VOP_MADMK <f64>;
@@ -498,14 +496,14 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v
HasClamp, HasModifiers, HasModifiers, HasOMod,
Src0ModVOP3DPP, Src1ModVOP3DPP, Src2Mod, HasOpSel>.ret;
// We need a dummy src2 tied to dst to track the use of that register for s_delay_alu
- let InsVOPDX = (ins Src0RC32:$src0X, Src1RC32:$vsrc1X, VGPRSrc_32:$src2X);
- let InsVOPDY = (ins Src0RC32:$src0Y, Src1RC32:$vsrc1Y, VGPRSrc_32:$src2Y);
+ let InsVOPDX = (ins Src0RC32:$src0X, Src1RC32:$vsrc1X, VGPROp_32:$src2X);
+ let InsVOPDY = (ins Src0RC32:$src0Y, Src1RC32:$vsrc1Y, VGPROp_32:$src2Y);
let InsVOPD3X = (ins Src0ModVOPD3:$src0X_modifiers, Src0VOPD3:$src0X,
Src1ModVOPD3:$vsrc1X_modifiers, Src1RC32:$vsrc1X,
- VGPRSrc_32:$src2X);
+ VGPROp_32:$src2X);
let InsVOPD3Y = (ins Src0ModVOPD3:$src0Y_modifiers, Src0VOPD3:$src0Y,
Src1ModVOPD3:$vsrc1Y_modifiers, Src1RC32:$vsrc1Y,
- VGPRSrc_32:$src2Y);
+ VGPROp_32:$src2Y);
let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
@@ -567,7 +565,7 @@ def VOP_MAC_F16_t16 : VOP_MAC <f16> {
let DstRC64 = getVALUDstForVT<DstVT, 1/*IsTrue*/, 1/*IsVOP3Encoding*/>.ret;
let Src0RC64 = getVOP3SrcForVT<Src0VT, 1/*IsTrue16*/>.ret;
let Src1RC64 = getVOP3SrcForVT<Src1VT, 1/*IsTrue16*/>.ret;
- let Src0VOP3DPP = VGPRSrc_16;
+ let Src0VOP3DPP = VGPROp_16;
let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret;
let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 0/*IsFake16*/>.ret;
let Src0ModVOP3DPP = getSrc0ModVOP3DPP<Src0VT, DstVT, 0/*IsFake16*/>.ret;
@@ -599,7 +597,7 @@ def VOP_MAC_F16_fake16 : VOP_MAC <f16> {
getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret:$src2, // stub argument
dpp8:$dpp8, Dpp8FI:$fi);
let DstRC64 = getVALUDstForVT<DstVT>.ret;
- let Src0VOP3DPP = VGPRSrc_32;
+ let Src0VOP3DPP = VGPROp_32;
let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret;
let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 1/*IsFake16*/>.ret;
let Src0ModVOP3DPP = getSrc0ModVOP3DPP<Src0VT, DstVT, 1/*IsFake16*/>.ret;
@@ -798,7 +796,7 @@ def VOP2e_I16_I16_I16_I1_true16 : VOP2e_SGPR<[i16, i16, i16, i1]> {
Src2RC64, NumSrcArgs,
HasClamp, 1/*HasModifiers*/, 0/*HasSrc2Mods*/, HasOMod,
Src0Mod, Src1Mod, Src2Mod, 1/*HasOpSel*/>.ret;
- let Src0VOP3DPP = VGPRSrc_16;
+ let Src0VOP3DPP = VGPROp_16;
let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret;
let Src0ModVOP3DPP = getSrc0ModVOP3DPP<f16, DstVT, 0/*IsFake16*/>.ret;
let Src1ModVOP3DPP = getSrcModVOP3VC<f16, 0/*IsFake16*/>.ret;
@@ -810,7 +808,7 @@ def VOP2e_I16_I16_I16_I1_fake16 : VOP2e_SGPR<[i16, i16, i16, i1]> {
let Src0Mod = getSrc0Mod<f16, DstVT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src1Mod = getSrcMod<f16, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
- let Src0VOP3DPP = VGPRSrc_32;
+ let Src0VOP3DPP = VGPROp_32;
let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret;
let Src0ModVOP3DPP = getSrc0ModVOP3DPP<f16, DstVT, 1/*IsFake16*/>.ret;
let Src1ModVOP3DPP = getSrcModVOP3VC<f16, 1/*IsFake16*/>.ret;
@@ -889,13 +887,13 @@ defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>;
let mayRaiseFPException = 0 in {
let OtherPredicates = [HasMadMacF32Insts] in {
-let Constraints = "$vdst = $src2", DisableEncoding="$src2",
+let Constraints = "$vdst = $src2",
isConvertibleToThreeAddress = 1 in {
defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>;
let SubtargetPredicate = isGFX6GFX7GFX10 in
defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_MAC_LEGACY_F32>;
-} // End Constraints = "$vdst = $src2", DisableEncoding="$src2",
+} // End Constraints = "$vdst = $src2",
// isConvertibleToThreeAddress = 1
let isReMaterializable = 1 in
@@ -941,9 +939,9 @@ defm V_MUL_U64 : VOP2Inst <"v_mul_u64", VOP_I64_I64_I64, DivergentBinFrag<mul>>;
// These are special and do not read the exec mask.
let isConvergent = 1, Uses = []<Register> in {
def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, []>;
-let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in" in {
def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, []>;
-} // End IsNeverUniform, $vdst = $vdst_in, DisableEncoding $vdst_in
+} // End IsNeverUniform, $vdst = $vdst_in
} // End isConvergent = 1
foreach vt = Reg32Types.types in {
@@ -1175,7 +1173,6 @@ let True16Predicate = UseFakeTrue16Insts in {
} // End FPDPRounding = 1, isReMaterializable = 1, FixedSize = 1
let Constraints = "$vdst = $src2",
- DisableEncoding="$src2",
isConvertibleToThreeAddress = 1,
isCommutable = 1 in {
let SubtargetPredicate = isGFX10Plus in {
@@ -1209,7 +1206,7 @@ let SubtargetPredicate = isGFX8GFX9 in {
} // End isReMaterializable = 1
// FIXME: Missing FPDPRounding
-let Constraints = "$vdst = $src2", DisableEncoding="$src2",
+let Constraints = "$vdst = $src2",
isConvertibleToThreeAddress = 1, isCommutable = 1 in {
defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
}
@@ -1252,7 +1249,6 @@ def : GCNPat<
>;
let Constraints = "$vdst = $src2",
- DisableEncoding = "$src2",
isConvertibleToThreeAddress = 1,
isCommutable = 1 in
defm V_FMAC_F32 : VOP2Inst_VOPD <"v_fmac_f32", VOP_MAC_F32, 0x0, "v_fmac_f32">;
@@ -1261,7 +1257,6 @@ defm V_FMAC_F32 : VOP2Inst_VOPD <"v_fmac_f32", VOP_MAC_F32, 0x0, "v_fmac_f32">;
let SubtargetPredicate = HasFmaLegacy32 in {
let Constraints = "$vdst = $src2",
- DisableEncoding = "$src2",
isConvertibleToThreeAddress = 1,
isCommutable = 1 in
defm V_FMAC_LEGACY_F32 : VOP2Inst <"v_fmac_legacy_f32", VOP_MAC_LEGACY_F32>;
@@ -1270,14 +1265,12 @@ defm V_FMAC_LEGACY_F32 : VOP2Inst <"v_fmac_legacy_f32", VOP_MAC_LEGACY_F32>;
let SubtargetPredicate = HasFmacF64Inst,
Constraints = "$vdst = $src2",
- DisableEncoding="$src2",
isConvertibleToThreeAddress = 1,
isCommutable = 1,
SchedRW = [WriteDoubleAdd] in
defm V_FMAC_F64 : VOP2Inst <"v_fmac_f64", VOP_MAC_F64>;
let Constraints = "$vdst = $src2",
- DisableEncoding="$src2",
isConvertibleToThreeAddress = 1,
isCommutable = 1,
IsDOT = 1 in {
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 329d003cf250..19eabb46752b 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -782,7 +782,7 @@ defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", V_LSHL_ADD_U64_PROF>;
let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0,
SchedRW = [WriteFloatCvt] in {
- let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
+ let Constraints = "$vdst = $vdst_in" in {
let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in
defm V_CVT_PK_FP8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile<>,
VOP3_CVT_PK_F8_F32_Profile_t16<>,
@@ -807,7 +807,7 @@ let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0,
// These instructions have non-standard use of op_sel. In particular they are
// using op_sel bits 2 and 3 while only having two sources. Therefore dummy
// src2 is used to hold the op_sel value.
- let Constraints = "$vdst = $src2", DisableEncoding = "$src2", SubtargetPredicate = isGFX940Plus in {
+ let Constraints = "$vdst = $src2", SubtargetPredicate = isGFX940Plus in {
defm V_CVT_SR_FP8_F32 : VOP3Inst<"v_cvt_sr_fp8_f32", VOP3_CVT_SR_F8_F32_Profile>;
defm V_CVT_SR_BF8_F32 : VOP3Inst<"v_cvt_sr_bf8_f32", VOP3_CVT_SR_F8_F32_Profile>;
}
@@ -1309,7 +1309,7 @@ class VOP3_CVT_SCALEF32_PK_F864_Profile<VOPProfile P> : VOP3_Profile<P> {
}
let SubtargetPredicate = HasFP8ConversionScaleInsts, mayRaiseFPException = 0 in {
- let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+ let Constraints = "$vdst = $vdst_in" in {
defm V_CVT_SCALEF32_SR_FP8_BF16 : VOP3Inst<"v_cvt_scalef32_sr_fp8_bf16", VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOP_I32_BF16_I32_F32>>;
defm V_CVT_SCALEF32_SR_FP8_F16 : VOP3Inst<"v_cvt_scalef32_sr_fp8_f16", VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOP_I32_F16_I32_F32>>;
defm V_CVT_SCALEF32_SR_FP8_F32 : VOP3Inst<"v_cvt_scalef32_sr_fp8_f32", VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOP_I32_F32_I32_F32>>;
@@ -1325,7 +1325,7 @@ let SubtargetPredicate = HasFP8ConversionScaleInsts, mayRaiseFPException = 0 in
}
let SubtargetPredicate = HasBF8ConversionScaleInsts, mayRaiseFPException = 0 in {
- let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+ let Constraints = "$vdst = $vdst_in" in {
defm V_CVT_SCALEF32_SR_BF8_BF16 : VOP3Inst<"v_cvt_scalef32_sr_bf8_bf16", VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOP_I32_BF16_I32_F32>>;
defm V_CVT_SCALEF32_SR_BF8_F16 : VOP3Inst<"v_cvt_scalef32_sr_bf8_f16", VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOP_I32_F16_I32_F32>>;
defm V_CVT_SCALEF32_SR_BF8_F32 : VOP3Inst<"v_cvt_scalef32_sr_bf8_f32", VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOP_I32_F32_I32_F32>>;
@@ -1342,7 +1342,7 @@ let SubtargetPredicate = HasBF8ConversionScaleInsts, mayRaiseFPException = 0 in
let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in {
defm V_CVT_SCALEF32_PK_F32_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f32_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f32>>;
- let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+ let Constraints = "$vdst = $vdst_in" in {
defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>;
let Constraints = "@earlyclobber $vdst" in {
defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
@@ -1358,7 +1358,7 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
// These instructions have non-standard use of op_sel. In particular they are
// using op_sel bits 2 and 3 while only having two sources.
- let Constraints = "$vdst = $src2", DisableEncoding = "$src2" in {
+ let Constraints = "$vdst = $src2" in {
defm V_CVT_SCALEF32_PK_FP4_F16 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f16", VOP3_CVT_SCALE_FP4_F16BF16_TiedInput_Profile<VOP_I32_V2F16_F32_F32>>;
defm V_CVT_SCALEF32_PK_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_pk_fp4_bf16", VOP3_CVT_SCALE_FP4_F16BF16_TiedInput_Profile<VOP_I32_V2BF16_F32_F32>>;
}
@@ -1486,10 +1486,10 @@ let SubtargetPredicate = isGFX10Plus in {
} // End isCommutable = 1, isReMaterializable = 1
def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32_e64>;
- let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in", isConvergent = 1 in {
+ let Constraints = "$vdst = $vdst_in", isConvergent = 1 in {
defm V_PERMLANE16_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>;
defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
- } // End $vdst = $vdst_in, DisableEncoding $vdst_in, isConvergent = 1
+ } // End $vdst = $vdst_in, isConvergent = 1
foreach vt = Reg32Types.types in {
def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64, vt>;
@@ -1532,10 +1532,10 @@ let True16Predicate = UseFakeTrue16Insts in {
} // End True16Predicate = UseFakeTrue16Insts
let SubtargetPredicate = isGFX12Plus in {
- let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+ let Constraints = "$vdst = $vdst_in" in {
defm V_PERMLANE16_VAR_B32 : VOP3Inst<"v_permlane16_var_b32", VOP3_PERMLANE_VAR_Profile>;
defm V_PERMLANEX16_VAR_B32 : VOP3Inst<"v_permlanex16_var_b32", VOP3_PERMLANE_VAR_Profile>;
- } // End $vdst = $vdst_in, DisableEncoding $vdst_in
+ } // End $vdst = $vdst_in
def : PermlaneVarPat<int_amdgcn_permlane16_var, V_PERMLANE16_VAR_B32_e64>;
def : PermlaneVarPat<int_amdgcn_permlanex16_var, V_PERMLANEX16_VAR_B32_e64>;
@@ -1763,7 +1763,7 @@ let SubtargetPredicate = isGFX1250Plus in {
// These instructions have non-standard use of op_sel. They are using bits 2 and 3 of opsel
// to select a byte in the vdst. Bits 0 and 1 are unused.
- let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
+ let Constraints = "$vdst = $vdst_in" in {
defm V_CVT_SR_FP8_F16 : VOP3Inst_t16_with_profiles<"v_cvt_sr_fp8_f16", VOP3_CVT_SR_F8_F16_Profile,
VOP3_CVT_SR_F8_F16_True16_Profile, VOP3_CVT_SR_F8_F16_Fake16_Profile>;
defm V_CVT_SR_BF8_F16 : VOP3Inst_t16_with_profiles<"v_cvt_sr_bf8_f16", VOP3_CVT_SR_F8_F16_Profile,
@@ -1850,7 +1850,7 @@ class Cvt_Scale_Sr_F32ToBF16F16_Pat<SDPatternOperator node, VOP3_Pseudo inst, Va
>;
let SubtargetPredicate = HasF32ToF16BF16ConversionSRInsts in {
- let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
+ let Constraints = "$vdst = $vdst_in" in {
defm V_CVT_SR_F16_F32 : VOP3Inst<"v_cvt_sr_f16_f32", VOP3_CVT_SR_FP16_TiedInput_Profile<VOP_F16_F32_I32>>;
defm V_CVT_SR_BF16_F32 : VOP3Inst<"v_cvt_sr_bf16_f32", VOP3_CVT_SR_FP16_TiedInput_Profile<VOP_BF16_F32_I32>>;
}
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index ce280d484da1..6f778a0d262a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -44,7 +44,7 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
FP16InputMods:$src1_modifiers, Src1RC:$src1,
FP16InputMods:$src2_modifiers, Src2RC:$src2);
dag dpp_srcs =
- (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0,
+ (ins FPVRegInputMods:$src0_modifiers, VGPROp_32:$src0,
FPVRegInputMods:$src1_modifiers, VRegSrc_32:$src1,
FP16InputMods:$src2_modifiers, Src2RC:$src2);
@@ -84,7 +84,6 @@ multiclass VOP3PInst<string OpName, VOPProfile P,
multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> {
def NAME : VOP3P_Pseudo<OpName, P> {
let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", "");
- let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", "");
}
let SubtargetPredicate = isGFX11Plus in {
if P.HasExtVOP3DPP then
@@ -92,7 +91,6 @@ multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> {
let VOP3P = 1;
let PseudoInstr = OpName#"_dpp";
let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", "");
- let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", "");
}
} // end SubtargetPredicate = isGFX11Plus
}
@@ -1166,7 +1164,7 @@ let SubtargetPredicate = HasFP8Insts, is_gfx940_xdl = 1 in {
} // End SubtargetPredicate = HasFP8Insts, is_gfx940_xdl = 1
multiclass SMFMACInst<string OpName, string P, SDPatternOperator node> {
- let Constraints = "$vdst = $src2", DisableEncoding = "$src2",
+ let Constraints = "$vdst = $src2",
isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1, is_gfx940_xdl = 1 in {
def _e64 : MAIInst<OpName, !cast<VOPProfileSMFMAC>("VOPProfileSMFMAC_" # P), node>;
}
@@ -1520,8 +1518,8 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
dag MatrixFMT = !if(HasMatrixFMT, (ins MatrixAFMT:$matrix_a_fmt, MatrixBFMT:$matrix_b_fmt),
(ins));
dag MatrixScaleSrc = !if(HasMatrixScale,
- !if(Scale16, (ins VCSrc_b64:$scale_src0, VCSrc_b64:$scale_src1),
- (ins VCSrc_b32:$scale_src0, VCSrc_b32:$scale_src1)),
+ !if(Scale16, (ins VCSrc_b64_Lo256:$scale_src0, VCSrc_b64_Lo256:$scale_src1),
+ (ins VCSrc_b32_Lo256:$scale_src0, VCSrc_b32_Lo256:$scale_src1)),
(ins));
dag MatrixScale = !if(HasMatrixScale, (ins MatrixAScale:$matrix_a_scale, MatrixBScale:$matrix_b_scale,
MatrixAScaleFmt:$matrix_a_scale_fmt, MatrixBScaleFmt:$matrix_b_scale_fmt),
@@ -1859,8 +1857,8 @@ defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16
defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">;
} // End is_wmma_xdl = 1.
-defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32>>;
-defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64>>;
+defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>;
+defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>;
} // End SubtargetPredicate = isGFX125xOnly
} // End WaveSizePredicate = isWave32
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 2c1193509da9..2730ec52294e 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -100,7 +100,7 @@ multiclass VOPC_Profile_t16<list<SchedReadWrite> sched, ValueType vt0, ValueType
let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0/*IsFake16*/>.ret;
let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0/*IsFake16*/>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0/*IsFake16*/>.ret;
- let Src0VOP3DPP = VGPRSrc_16;
+ let Src0VOP3DPP = VGPROp_16;
let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret;
let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 0/*IsFake16*/>.ret;
@@ -126,7 +126,7 @@ multiclass VOPC_Profile_t16<list<SchedReadWrite> sched, ValueType vt0, ValueType
let Src0ModDPP = getSrcModDPP_t16<Src0VT, 1/*IsFake16*/>.ret;
let Src1ModDPP = getSrcModDPP_t16<Src1VT, 1/*IsFake16*/>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT, 1/*IsFake16*/>.ret;
- let Src0VOP3DPP = VGPRSrc_32;
+ let Src0VOP3DPP = VGPROp_32;
let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret;
let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 1/*IsFake16*/>.ret;
@@ -173,7 +173,7 @@ multiclass VOPC_NoSdst_Profile_t16<list<SchedReadWrite> sched, ValueType vt0, Va
let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0/*IsFake16*/>.ret;
let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0/*IsFake16*/>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0/*IsFake16*/>.ret;
- let Src0VOP3DPP = VGPRSrc_16;
+ let Src0VOP3DPP = VGPROp_16;
let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret;
let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 0/*IsFake16*/>.ret;
@@ -197,7 +197,7 @@ multiclass VOPC_NoSdst_Profile_t16<list<SchedReadWrite> sched, ValueType vt0, Va
let Src0ModDPP = getSrcModDPP_t16<Src0VT, 1/*IsFake16*/>.ret;
let Src1ModDPP = getSrcModDPP_t16<Src1VT, 1/*IsFake16*/>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT, 1/*IsFake16*/>.ret;
- let Src0VOP3DPP = VGPRSrc_32;
+ let Src0VOP3DPP = VGPROp_32;
let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret;
let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 1/*IsFake16*/>.ret;
@@ -251,7 +251,6 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily, string asm_name = ps.Pseudo
let isCodeGenOnly = 0;
let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
@@ -259,7 +258,6 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily, string asm_name = ps.Pseudo
let OtherPredicates = ps.OtherPredicates;
let AsmMatchConverter = ps.AsmMatchConverter;
let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
@@ -894,7 +892,7 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType src0VT, ValueType
// DPP8 forbids modifiers and can inherit from VOPC_Profile
let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
- dag InsPartVOP3DPP = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, VCSrc_b32:$src1);
+ dag InsPartVOP3DPP = (ins FPVRegInputMods:$src0_modifiers, VGPROp_32:$src0, VCSrc_b32:$src1);
let InsVOP3Base = !con(InsPartVOP3DPP, !if(HasOpSel, (ins op_sel0:$op_sel),
(ins)));
let AsmVOP3Base = "$sdst, $src0_modifiers, $src1";
@@ -917,7 +915,7 @@ multiclass VOPC_Class_Profile_t16<list<SchedReadWrite> sched> {
let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0/*IsFake16*/>.ret;
let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0/*IsFake16*/>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0/*IsFake16*/>.ret;
- let Src0VOP3DPP = VGPRSrc_16;
+ let Src0VOP3DPP = VGPROp_16;
let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret;
let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 0/*IsFake16*/>.ret;
@@ -943,7 +941,7 @@ multiclass VOPC_Class_Profile_t16<list<SchedReadWrite> sched> {
let Src0ModDPP = getSrcModDPP_t16<Src0VT, 1/*IsFake16*/>.ret;
let Src1ModDPP = getSrcModDPP_t16<Src1VT, 1/*IsFake16*/>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT, 1/*IsFake16*/>.ret;
- let Src0VOP3DPP = VGPRSrc_32;
+ let Src0VOP3DPP = VGPROp_32;
let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret;
let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 1/*IsFake16*/>.ret;
@@ -987,7 +985,7 @@ multiclass VOPC_Class_NoSdst_Profile_t16<list<SchedReadWrite> sched> {
let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0/*IsFake16*/>.ret;
let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0/*IsFake16*/>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0/*IsFake16*/>.ret;
- let Src0VOP3DPP = VGPRSrc_16;
+ let Src0VOP3DPP = VGPROp_16;
let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret;
let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 0/*IsFake16*/>.ret;
@@ -1011,7 +1009,7 @@ multiclass VOPC_Class_NoSdst_Profile_t16<list<SchedReadWrite> sched> {
let Src0ModDPP = getSrcModDPP_t16<Src0VT, 1/*IsFake16*/>.ret;
let Src1ModDPP = getSrcModDPP_t16<Src1VT, 1/*IsFake16*/>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT, 1/*IsFake16*/>.ret;
- let Src0VOP3DPP = VGPRSrc_32;
+ let Src0VOP3DPP = VGPROp_32;
let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret;
let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 1/*IsFake16*/>.ret;
diff --git a/llvm/lib/Target/AMDGPU/VOPDInstructions.td b/llvm/lib/Target/AMDGPU/VOPDInstructions.td
index 3e7af12f6b60..f416c0654048 100644
--- a/llvm/lib/Target/AMDGPU/VOPDInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPDInstructions.td
@@ -138,10 +138,6 @@ class VOPD_Base<dag outs, dag ins, string asm, VOP_Pseudo VDX, VOP_Pseudo VDY,
string ConstraintsY = !if(hasSrc2AccY, "$src2Y = $vdstY", "");
let Constraints =
ConstraintsX # !if(!and(hasSrc2AccX, hasSrc2AccY), ", ", "") # ConstraintsY;
- string DisableEncodingX = !if(hasSrc2AccX, "$src2X", "");
- string DisableEncodingY = !if(hasSrc2AccY, "$src2Y", "");
- let DisableEncoding =
- DisableEncodingX # !if(!and(hasSrc2AccX, hasSrc2AccY), ", ", "") # DisableEncodingY;
let Uses = RegListUnion<VDX.Uses, VDY.Uses>.ret;
let Defs = RegListUnion<VDX.Defs, VDY.Defs>.ret;
@@ -228,7 +224,7 @@ foreach Gen = [GFX11GenD, GFX12GenD, GFX1250GenD] in {
defvar isOpXMADK = !or(!eq(x, "V_FMAAK_F32"), !eq(x, "V_FMAMK_F32"));
defvar isOpYMADK = !or(!eq(y, "V_FMAAK_F32"), !eq(y, "V_FMAMK_F32"));
defvar OpName = "V_DUAL_" # !substr(x,2) # "_X_" # !substr(y,2) # Gen.Suffix;
- defvar outs = (outs VGPRSrc_32:$vdstX, VOPDDstYOperand:$vdstY);
+ defvar outs = (outs VGPROp_32:$vdstX, VOPDDstYOperand:$vdstY);
if !or(isOpXMADK, isOpYMADK) then {
// If Both X and Y are MADK, the mandatory literal of X additionally must
// use an alternate operand format which defers to the 'real' Y literal.
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 3cad5a1c2c37..5550a0c08b91 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -187,7 +187,6 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemoni
let AsmMatchConverter = ps.AsmMatchConverter;
let AsmVariantName = ps.AsmVariantName;
let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
@@ -807,7 +806,6 @@ class VOP_SDWA8_Real <VOP_SDWA_Pseudo ps> :
let hasSideEffects = ps.hasSideEffects;
let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
// Copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
@@ -817,7 +815,6 @@ class VOP_SDWA8_Real <VOP_SDWA_Pseudo ps> :
let UseNamedOperandTable = ps.UseNamedOperandTable;
let DecoderNamespace = ps.DecoderNamespace;
let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
let Uses = ps.Uses;
let Defs = ps.Defs;
@@ -841,7 +838,6 @@ class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
let hasSideEffects = ps.hasSideEffects;
let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
let SubtargetPredicate = ps.SubtargetPredicate;
let AssemblerPredicate = HasSDWA9;
@@ -854,7 +850,6 @@ class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
let AsmMatchConverter = ps.AsmMatchConverter;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
let Uses = ps.Uses;
let Defs = ps.Defs;
@@ -1037,7 +1032,6 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
AMDGPUAsmVariants.Disable);
let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
- let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
let DecoderNamespace = "GFX8";
}
@@ -1066,7 +1060,6 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
let hasSideEffects = ps.hasSideEffects;
let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
// Copy relevant pseudo op flags
let isConvergent = ps.isConvergent;
@@ -1079,7 +1072,6 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
let UseNamedOperandTable = ps.UseNamedOperandTable;
let DecoderNamespace = ps.DecoderNamespace;
let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
let Uses = ps.Uses;
let Defs = ps.Defs;
@@ -1109,7 +1101,6 @@ class VOP_DPP_Base <string OpName, VOPProfile P,
let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
AMDGPUAsmVariants.Disable);
let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
- let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
let DecoderNamespace = "GFX8";
}
@@ -1228,7 +1219,6 @@ class VOP_DPP8_Base<string OpName, VOPProfile P, dag InsDPP8 = P.InsDPP8, string
let AssemblerPredicate = HasDPP8;
let AsmVariantName = AMDGPUAsmVariants.DPP;
let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
- let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
}
class VOP_DPP8<string OpName, VOPProfile P> :
diff --git a/llvm/lib/Target/ARC/ARCInstrFormats.td b/llvm/lib/Target/ARC/ARCInstrFormats.td
index bd2ed0057617..0560bb1dc966 100644
--- a/llvm/lib/Target/ARC/ARCInstrFormats.td
+++ b/llvm/lib/Target/ARC/ARCInstrFormats.td
@@ -964,12 +964,10 @@ class F16_OP_U7<bit i, string asmstr> :
// Special types for different instruction operands.
def ccond : Operand<i32> {
- let MIOperandInfo = (ops i32imm);
let PrintMethod = "printPredicateOperand";
}
def brccond : Operand<i32> {
- let MIOperandInfo = (ops i32imm);
let PrintMethod = "printBRCCPredicateOperand";
}
diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.cpp b/llvm/lib/Target/ARC/ARCInstrInfo.cpp
index 8a89bdb546f3..05bcb3596ac4 100644
--- a/llvm/lib/Target/ARC/ARCInstrInfo.cpp
+++ b/llvm/lib/Target/ARC/ARCInstrInfo.cpp
@@ -44,7 +44,7 @@ enum TSFlagsConstants {
void ARCInstrInfo::anchor() {}
ARCInstrInfo::ARCInstrInfo(const ARCSubtarget &ST)
- : ARCGenInstrInfo(ARC::ADJCALLSTACKDOWN, ARC::ADJCALLSTACKUP), RI(ST) {}
+ : ARCGenInstrInfo(ST, ARC::ADJCALLSTACKDOWN, ARC::ADJCALLSTACKUP), RI(ST) {}
static bool isZeroImm(const MachineOperand &Op) {
return Op.isImm() && Op.getImm() == 0;
diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.td b/llvm/lib/Target/ARC/ARCInstrInfo.td
index f26b49119cab..8ff5f4a39ca7 100644
--- a/llvm/lib/Target/ARC/ARCInstrInfo.td
+++ b/llvm/lib/Target/ARC/ARCInstrInfo.td
@@ -18,7 +18,7 @@ include "ARCInstrFormats.td"
// Operand for printing out a condition code.
let PrintMethod = "printCCOperand" in
- def CCOp : PredicateOperand<i32, (ops i32imm), (ops)>;
+ def CCOp : PredicateOperand<i32, (ops), (ops)>;
// The "u6" operand of a RRU6-type instruction
let PrintMethod = "printU6" in {
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 9e4dbecc16a8..5c35b3327c16 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -107,9 +107,9 @@ static const ARM_MLxEntry ARM_MLxTable[] = {
{ ARM::VMLSslfq, ARM::VMULslfq, ARM::VSUBfq, false, true },
};
-ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI)
- : ARMGenInstrInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP),
- Subtarget(STI) {
+ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget &STI)
+ : ARMGenInstrInfo(STI, ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP),
+ Subtarget(STI) {
for (unsigned i = 0, e = std::size(ARM_MLxTable); i != e; ++i) {
if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second)
llvm_unreachable("Duplicated entries?");
@@ -6730,7 +6730,7 @@ bool ARMPipelinerLoopInfo::tooMuchRegisterPressure(SwingSchedulerDAG &SSD,
Register Reg = S.getReg();
auto CIter = CrossIterationNeeds.find(Reg.id());
if (CIter != CrossIterationNeeds.end()) {
- auto Stg2 = SMS.stageScheduled(const_cast<SUnit *>(S.getSUnit()));
+ auto Stg2 = SMS.stageScheduled(S.getSUnit());
assert(Stg2 <= Stg && "Data dependence upon earlier stage");
if (Stg - Stg2 < MAX_STAGES)
CIter->second.set(Stg - Stg2);
diff --git a/llvm/lib/Target/ARM/ARMBlockPlacement.cpp b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
index ec907995e3ab..3d8ebfeae81d 100644
--- a/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
+++ b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
@@ -218,7 +218,7 @@ bool ARMBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
return false;
LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Running on " << MF.getName() << "\n");
MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
- TII = static_cast<const ARMBaseInstrInfo *>(ST.getInstrInfo());
+ TII = ST.getInstrInfo();
BBUtils = std::make_unique<ARMBasicBlockUtils>(MF);
MF.RenumberBlocks();
BBUtils->computeAllBlockSizes();
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 12d2d678ff63..d4d3c7009527 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -529,56 +529,56 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
const RTLIB::LibcallImpl Impl;
} LibraryCalls[] = {
// Single-precision floating-point arithmetic.
- { RTLIB::ADD_F32, RTLIB::__addsf3vfp },
- { RTLIB::SUB_F32, RTLIB::__subsf3vfp },
- { RTLIB::MUL_F32, RTLIB::__mulsf3vfp },
- { RTLIB::DIV_F32, RTLIB::__divsf3vfp },
+ { RTLIB::ADD_F32, RTLIB::impl___addsf3vfp },
+ { RTLIB::SUB_F32, RTLIB::impl___subsf3vfp },
+ { RTLIB::MUL_F32, RTLIB::impl___mulsf3vfp },
+ { RTLIB::DIV_F32, RTLIB::impl___divsf3vfp },
// Double-precision floating-point arithmetic.
- { RTLIB::ADD_F64, RTLIB::__adddf3vfp },
- { RTLIB::SUB_F64, RTLIB::__subdf3vfp },
- { RTLIB::MUL_F64, RTLIB::__muldf3vfp },
- { RTLIB::DIV_F64, RTLIB::__divdf3vfp },
+ { RTLIB::ADD_F64, RTLIB::impl___adddf3vfp },
+ { RTLIB::SUB_F64, RTLIB::impl___subdf3vfp },
+ { RTLIB::MUL_F64, RTLIB::impl___muldf3vfp },
+ { RTLIB::DIV_F64, RTLIB::impl___divdf3vfp },
// Single-precision comparisons.
- { RTLIB::OEQ_F32, RTLIB::__eqsf2vfp },
- { RTLIB::UNE_F32, RTLIB::__nesf2vfp },
- { RTLIB::OLT_F32, RTLIB::__ltsf2vfp },
- { RTLIB::OLE_F32, RTLIB::__lesf2vfp },
- { RTLIB::OGE_F32, RTLIB::__gesf2vfp },
- { RTLIB::OGT_F32, RTLIB::__gtsf2vfp },
- { RTLIB::UO_F32, RTLIB::__unordsf2vfp },
+ { RTLIB::OEQ_F32, RTLIB::impl___eqsf2vfp },
+ { RTLIB::UNE_F32, RTLIB::impl___nesf2vfp },
+ { RTLIB::OLT_F32, RTLIB::impl___ltsf2vfp },
+ { RTLIB::OLE_F32, RTLIB::impl___lesf2vfp },
+ { RTLIB::OGE_F32, RTLIB::impl___gesf2vfp },
+ { RTLIB::OGT_F32, RTLIB::impl___gtsf2vfp },
+ { RTLIB::UO_F32, RTLIB::impl___unordsf2vfp },
// Double-precision comparisons.
- { RTLIB::OEQ_F64, RTLIB::__eqdf2vfp },
- { RTLIB::UNE_F64, RTLIB::__nedf2vfp },
- { RTLIB::OLT_F64, RTLIB::__ltdf2vfp },
- { RTLIB::OLE_F64, RTLIB::__ledf2vfp },
- { RTLIB::OGE_F64, RTLIB::__gedf2vfp },
- { RTLIB::OGT_F64, RTLIB::__gtdf2vfp },
- { RTLIB::UO_F64, RTLIB::__unorddf2vfp },
+ { RTLIB::OEQ_F64, RTLIB::impl___eqdf2vfp },
+ { RTLIB::UNE_F64, RTLIB::impl___nedf2vfp },
+ { RTLIB::OLT_F64, RTLIB::impl___ltdf2vfp },
+ { RTLIB::OLE_F64, RTLIB::impl___ledf2vfp },
+ { RTLIB::OGE_F64, RTLIB::impl___gedf2vfp },
+ { RTLIB::OGT_F64, RTLIB::impl___gtdf2vfp },
+ { RTLIB::UO_F64, RTLIB::impl___unorddf2vfp },
// Floating-point to integer conversions.
// i64 conversions are done via library routines even when generating VFP
// instructions, so use the same ones.
- { RTLIB::FPTOSINT_F64_I32, RTLIB::__fixdfsivfp },
- { RTLIB::FPTOUINT_F64_I32, RTLIB::__fixunsdfsivfp },
- { RTLIB::FPTOSINT_F32_I32, RTLIB::__fixsfsivfp },
- { RTLIB::FPTOUINT_F32_I32, RTLIB::__fixunssfsivfp },
+ { RTLIB::FPTOSINT_F64_I32, RTLIB::impl___fixdfsivfp },
+ { RTLIB::FPTOUINT_F64_I32, RTLIB::impl___fixunsdfsivfp },
+ { RTLIB::FPTOSINT_F32_I32, RTLIB::impl___fixsfsivfp },
+ { RTLIB::FPTOUINT_F32_I32, RTLIB::impl___fixunssfsivfp },
// Conversions between floating types.
- { RTLIB::FPROUND_F64_F32, RTLIB::__truncdfsf2vfp },
- { RTLIB::FPEXT_F32_F64, RTLIB::__extendsfdf2vfp },
+ { RTLIB::FPROUND_F64_F32, RTLIB::impl___truncdfsf2vfp },
+ { RTLIB::FPEXT_F32_F64, RTLIB::impl___extendsfdf2vfp },
// Integer to floating-point conversions.
// i64 conversions are done via library routines even when generating VFP
// instructions, so use the same ones.
// FIXME: There appears to be some naming inconsistency in ARM libgcc:
// e.g., __floatunsidf vs. __floatunssidfvfp.
- { RTLIB::SINTTOFP_I32_F64, RTLIB::__floatsidfvfp },
- { RTLIB::UINTTOFP_I32_F64, RTLIB::__floatunssidfvfp },
- { RTLIB::SINTTOFP_I32_F32, RTLIB::__floatsisfvfp },
- { RTLIB::UINTTOFP_I32_F32, RTLIB::__floatunssisfvfp },
+ { RTLIB::SINTTOFP_I32_F64, RTLIB::impl___floatsidfvfp },
+ { RTLIB::UINTTOFP_I32_F64, RTLIB::impl___floatunssidfvfp },
+ { RTLIB::SINTTOFP_I32_F32, RTLIB::impl___floatsisfvfp },
+ { RTLIB::UINTTOFP_I32_F32, RTLIB::impl___floatunssisfvfp },
};
// clang-format on
@@ -3403,7 +3403,7 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
// position-independent addressing modes.
if (Subtarget->genExecuteOnly()) {
auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
- auto T = const_cast<Type*>(CP->getType());
+ auto *T = CP->getType();
auto C = const_cast<Constant*>(CP->getConstVal());
auto M = DAG.getMachineFunction().getFunction().getParent();
auto GV = new GlobalVariable(
@@ -5570,7 +5570,7 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
llvm_unreachable("Unknown VFP cmp argument!");
}
-/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
+/// OptimizeVFPBrcond - With nnan, it's legal to optimize some
/// f32 and even f64 comparisons to integer ones.
SDValue
ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
@@ -5712,9 +5712,12 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, Cmp);
}
- if (getTargetMachine().Options.UnsafeFPMath &&
- (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
- CC == ISD::SETNE || CC == ISD::SETUNE)) {
+ SDNodeFlags Flags = Op->getFlags();
+ if ((getTargetMachine().Options.UnsafeFPMath || Flags.hasNoNaNs()) &&
+ (DAG.getDenormalMode(MVT::f32) == DenormalMode::getIEEE() &&
+ DAG.getDenormalMode(MVT::f64) == DenormalMode::getIEEE()) &&
+ (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE ||
+ CC == ISD::SETUNE)) {
if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
return Result;
}
@@ -10539,19 +10542,11 @@ SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
}
// Generate the operation with flags
- SDValue OpWithFlags;
- if (Opcode == ARMISD::ADDC) {
- // Use ADDC: LHS + RHS (where RHS was 0 - X, now X)
- OpWithFlags = DAG.getNode(ARMISD::ADDC, dl,
- DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
- } else {
- // Use ARMISD::SUBC to generate SUBS instruction (subtract with flags)
- OpWithFlags = DAG.getNode(ARMISD::SUBC, dl,
- DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
- }
+ SDValue OpWithFlags =
+ DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
- SDValue OpResult = OpWithFlags.getValue(0); // The operation result
- SDValue Flags = OpWithFlags.getValue(1); // The flags
+ SDValue OpResult = OpWithFlags.getValue(0);
+ SDValue Flags = OpWithFlags.getValue(1);
// Constants for conditional moves
SDValue One = DAG.getConstant(1, dl, MVT::i32);
@@ -20073,6 +20068,29 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known = KnownOp0.intersectWith(KnownOp1);
break;
}
+ case ARMISD::VORRIMM:
+ case ARMISD::VBICIMM: {
+ unsigned Encoded = Op.getConstantOperandVal(1);
+ unsigned DecEltBits = 0;
+ uint64_t DecodedVal = ARM_AM::decodeVMOVModImm(Encoded, DecEltBits);
+
+ unsigned EltBits = Op.getScalarValueSizeInBits();
+ if (EltBits != DecEltBits) {
+ // Be conservative: only update Known when EltBits == DecEltBits.
+ // This is believed to always be true for VORRIMM/VBICIMM today, but if
+ // that changes in the future, doing nothing here is safer than risking
+ // subtle bugs.
+ break;
+ }
+
+ KnownBits KnownLHS = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
+ bool IsVORR = Op.getOpcode() == ARMISD::VORRIMM;
+ APInt Imm(DecEltBits, DecodedVal);
+
+ Known.One = IsVORR ? (KnownLHS.One | Imm) : (KnownLHS.One & ~Imm);
+ Known.Zero = IsVORR ? (KnownLHS.Zero & ~Imm) : (KnownLHS.Zero | Imm);
+ break;
+ }
}
}
@@ -20200,37 +20218,6 @@ bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
// ARM Inline Assembly Support
//===----------------------------------------------------------------------===//
-bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
- // Looking for "rev" which is V6+.
- if (!Subtarget->hasV6Ops())
- return false;
-
- InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
- StringRef AsmStr = IA->getAsmString();
- SmallVector<StringRef, 4> AsmPieces;
- SplitString(AsmStr, AsmPieces, ";\n");
-
- switch (AsmPieces.size()) {
- default: return false;
- case 1:
- AsmStr = AsmPieces[0];
- AsmPieces.clear();
- SplitString(AsmStr, AsmPieces, " \t,");
-
- // rev $0, $1
- if (AsmPieces.size() == 3 && AsmPieces[0] == "rev" &&
- AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
- IA->getConstraintString().starts_with("=l,l")) {
- IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
- if (Ty && Ty->getBitWidth() == 32)
- return IntrinsicLowering::LowerToByteSwap(CI);
- }
- break;
- }
-
- return false;
-}
-
const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
// At this point, we have to lower this constraint to something else, so we
// lower it to an "r" or "w". However, by doing this we will force the result
@@ -21379,12 +21366,25 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
return false;
}
+bool ARMTargetLowering::canCreateUndefOrPoisonForTargetNode(
+ SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+ bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
+ unsigned Opcode = Op.getOpcode();
+ switch (Opcode) {
+ case ARMISD::VORRIMM:
+ case ARMISD::VBICIMM:
+ return false;
+ }
+ return TargetLowering::canCreateUndefOrPoisonForTargetNode(
+ Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
+}
+
bool ARMTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
- return Subtarget->hasV6T2Ops();
+ return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
}
bool ARMTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
- return Subtarget->hasV6T2Ops();
+ return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
}
bool ARMTargetLowering::isMaskAndCmp0FoldingBeneficial(
@@ -21706,13 +21706,15 @@ bool ARMTargetLowering::lowerInterleavedLoad(
bool ARMTargetLowering::lowerInterleavedStore(Instruction *Store,
Value *LaneMask,
ShuffleVectorInst *SVI,
- unsigned Factor) const {
+ unsigned Factor,
+ const APInt &GapMask) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
auto *SI = dyn_cast<StoreInst>(Store);
if (!SI)
return false;
- assert(!LaneMask && "Unexpected mask on store");
+ assert(!LaneMask && GapMask.popcount() == Factor &&
+ "Unexpected mask on store");
auto *VecTy = cast<FixedVectorType>(SVI->getType());
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 0185c8ddd492..ccf6d509313b 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -534,8 +534,6 @@ class VectorType;
const APInt &DemandedElts,
TargetLoweringOpt &TLO) const override;
- bool ExpandInlineAsm(CallInst *CI) const override;
-
ConstraintType getConstraintType(StringRef Constraint) const override;
/// Examine constraint string and operand type and determine a weight value.
@@ -688,8 +686,8 @@ class VectorType;
ArrayRef<unsigned> Indices, unsigned Factor,
const APInt &GapMask) const override;
bool lowerInterleavedStore(Instruction *Store, Value *Mask,
- ShuffleVectorInst *SVI,
- unsigned Factor) const override;
+ ShuffleVectorInst *SVI, unsigned Factor,
+ const APInt &GapMask) const override;
bool shouldInsertFencesForAtomic(const Instruction *I) const override;
TargetLoweringBase::AtomicExpansionKind
@@ -709,6 +707,10 @@ class VectorType;
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
unsigned &Cost) const override;
+ bool canCreateUndefOrPoisonForTargetNode(
+ SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+ bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override;
+
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
const MachineFunction &MF) const override {
// Do not merge to larger than i32.
diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td
index 9eb911406914..e50740f7d57c 100644
--- a/llvm/lib/Target/ARM/ARMInstrFormats.td
+++ b/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -160,7 +160,7 @@ def CondCodeOperand : AsmOperandClass {
let DefaultMethod = "defaultCondCodeOp";
let IsOptional = true;
}
-def pred : PredicateOperand<OtherVT, (ops i32imm, i32imm),
+def pred : PredicateOperand<OtherVT, (ops i32imm, CCR),
(ops (i32 14), (i32 zero_reg))> {
let PrintMethod = "printPredicateOperand";
let ParserMatchClass = CondCodeOperand;
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 934ec52c6f1e..bdb16d7d3926 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -164,10 +164,9 @@ def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>;
def ARMWrapperJT : SDNode<"ARMISD::WrapperJT", SDTIntUnaryOp>;
def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart,
- [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
+ [SDNPHasChain, SDNPOutGlue]>;
def ARMcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_ARMCallSeqEnd,
- [SDNPHasChain, SDNPSideEffect,
- SDNPOptInGlue, SDNPOutGlue]>;
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def ARMcopystructbyval : SDNode<"ARMISD::COPY_STRUCT_BYVAL" ,
SDT_ARMStructByVal,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue,
@@ -766,7 +765,6 @@ class MVEVectorIndexOperand<int NumLanes> : AsmOperandClass {
class MVEVectorIndex<int NumLanes> : Operand<i32> {
let PrintMethod = "printVectorIndex";
let ParserMatchClass = MVEVectorIndexOperand<NumLanes>;
- let MIOperandInfo = (ops i32imm);
}
// shift_imm: An integer that encodes a shift amount and the type of shift
@@ -1182,7 +1180,6 @@ def PostIdxImm8AsmOperand : AsmOperandClass { let Name = "PostIdxImm8"; }
def postidx_imm8 : MemOperand {
let PrintMethod = "printPostIdxImm8Operand";
let ParserMatchClass = PostIdxImm8AsmOperand;
- let MIOperandInfo = (ops i32imm);
}
// postidx_imm8s4 := +/- [0,1020]
@@ -6448,7 +6445,7 @@ def : ARMInstAlias<"neg${s}${p} $Rd, $Rm",
(RSBri GPR:$Rd, GPR:$Rm, 0, pred:$p, cc_out:$s)>;
// Pre-v6, 'mov r0, r0' was used as a NOP encoding.
-def : InstAlias<"nop${p}", (MOVr R0, R0, pred:$p, zero_reg), 0>,
+def : InstAlias<"nop${p}", (MOVr R0, R0, pred:$p, (cc_out zero_reg)), 0>,
Requires<[IsARM, NoV6]>;
// MUL/UMLAL/SMLAL/UMULL/SMULL are available on all arches, but
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index 7485ef569445..37f0103363b9 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -95,28 +95,24 @@ def VectorIndex8 : Operand<i32>, ImmLeaf<i32, [{
}]> {
let ParserMatchClass = VectorIndex8Operand;
let PrintMethod = "printVectorIndex";
- let MIOperandInfo = (ops i32imm);
}
def VectorIndex16 : Operand<i32>, ImmLeaf<i32, [{
return ((uint64_t)Imm) < 4;
}]> {
let ParserMatchClass = VectorIndex16Operand;
let PrintMethod = "printVectorIndex";
- let MIOperandInfo = (ops i32imm);
}
def VectorIndex32 : Operand<i32>, ImmLeaf<i32, [{
return ((uint64_t)Imm) < 2;
}]> {
let ParserMatchClass = VectorIndex32Operand;
let PrintMethod = "printVectorIndex";
- let MIOperandInfo = (ops i32imm);
}
def VectorIndex64 : Operand<i32>, ImmLeaf<i32, [{
return ((uint64_t)Imm) < 1;
}]> {
let ParserMatchClass = VectorIndex64Operand;
let PrintMethod = "printVectorIndex";
- let MIOperandInfo = (ops i32imm);
}
// Register list of one D register.
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb.td b/llvm/lib/Target/ARM/ARMInstrThumb.td
index e38cafdf55c4..0c5ea3e0fa8d 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -1209,8 +1209,9 @@ def tMOVi8 : T1sI<(outs tGPR:$Rd), (ins imm0_255_expr:$imm8), IIC_iMOVi,
}
// Because we have an explicit tMOVSr below, we need an alias to handle
// the immediate "movs" form here. Blech.
-def : tInstAlias <"movs $Rdn, $imm8",
- (tMOVi8 tGPR:$Rdn, CPSR, imm0_255_expr:$imm8, 14, zero_reg)>;
+def : tInstAlias<"movs $Rdn, $imm8",
+ (tMOVi8 tGPR:$Rdn, (s_cc_out CPSR),
+ imm0_255_expr:$imm8, (pred 14, zero_reg))>;
// A7-73: MOV(2) - mov setting flag.
@@ -1764,7 +1765,8 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
// In Thumb1, "nop" is encoded as a "mov r8, r8". Technically, the bf00
// encoding is available on ARMv6K, but we don't differentiate that finely.
-def : InstAlias<"nop", (tMOVr R8, R8, 14, zero_reg), 0>, Requires<[IsThumb, IsThumb1Only]>;
+def : InstAlias<"nop", (tMOVr R8, R8, (pred 14, zero_reg)), 0>,
+ Requires<[IsThumb, IsThumb1Only]>;
// "neg" is an alias for "rsb rd, rn, #0"
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td
index 8f56fb0938dd..c00d616670b5 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -2222,11 +2222,11 @@ def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPRnopc:$Rm), IIC_iMOVr,
let Inst{7-4} = 0b0000;
}
def : t2InstAlias<"mov${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm,
- pred:$p, zero_reg)>;
+ pred:$p, (cc_out zero_reg))>;
def : t2InstAlias<"movs${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm,
- pred:$p, CPSR)>;
+ pred:$p, (cc_out CPSR))>;
def : t2InstAlias<"movs${p} $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm,
- pred:$p, CPSR)>;
+ pred:$p, (cc_out CPSR))>;
// AddedComplexity to ensure isel tries t2MOVi before t2MOVi16.
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
@@ -2244,14 +2244,14 @@ def t2MOVi : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), IIC_iMOVi,
// cc_out is handled as part of the explicit mnemonic in the parser for 'mov'.
// Use aliases to get that to play nice here.
def : t2InstAlias<"movs${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
- pred:$p, CPSR)>;
+ pred:$p, (cc_out CPSR))>;
def : t2InstAlias<"movs${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
- pred:$p, CPSR)>;
+ pred:$p, (cc_out CPSR))>;
def : t2InstAlias<"mov${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
- pred:$p, zero_reg)>;
+ pred:$p, (cc_out zero_reg))>;
def : t2InstAlias<"mov${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
- pred:$p, zero_reg)>;
+ pred:$p, (cc_out zero_reg))>;
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi,
@@ -5122,8 +5122,10 @@ def : InstAlias<"isb${p}.w", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>;
// Non-predicable aliases of a predicable DSB: the predicate is (14, zero_reg) where
// 14 = AL (always execute) and zero_reg = "instruction doesn't read the CPSR".
-def : InstAlias<"ssbb", (t2DSB 0x0, 14, zero_reg), 1>, Requires<[HasDB, IsThumb2]>;
-def : InstAlias<"pssbb", (t2DSB 0x4, 14, zero_reg), 1>, Requires<[HasDB, IsThumb2]>;
+def : InstAlias<"ssbb", (t2DSB 0x0, (pred 14, zero_reg)), 1>,
+ Requires<[HasDB, IsThumb2]>;
+def : InstAlias<"pssbb", (t2DSB 0x4, (pred 14, zero_reg)), 1>,
+ Requires<[HasDB, IsThumb2]>;
// Armv8-R 'Data Full Barrier'
def : InstAlias<"dfb${p}", (t2DSB 0xc, pred:$p), 1>, Requires<[HasDFB]>;
@@ -5340,7 +5342,8 @@ def : t2InstAlias<"sxth${p} $Rd, $Rm$rot",
// "mov Rd, t2_so_imm_not" can be handled via "mvn" in assembly, just like
// for isel.
def : t2InstSubst<"mov${p} $Rd, $imm",
- (t2MVNi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>;
+ (t2MVNi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p,
+ (cc_out zero_reg))>;
def : t2InstSubst<"mvn${s}${p} $Rd, $imm",
(t2MOVi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, s_cc_out:$s)>;
// Same for AND <--> BIC
diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
index fc12f050fa5a..cdff649ecfa5 100644
--- a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -206,7 +206,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) : ST(ST) {
getActionDefinitionsBuilder({G_FREM, G_FPOW}).libcallFor({s32, s64});
- if (ST.hasV5TOps()) {
+ if (ST.hasV5TOps() && !ST.isThumb1Only()) {
getActionDefinitionsBuilder(G_CTLZ)
.legalFor({s32, s32})
.clampScalar(1, s32, s32)
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 6b2854171c81..9b250e6cac3a 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1916,9 +1916,11 @@ InstructionCost ARMTTIImpl::getExtendedReductionCost(
}
InstructionCost
-ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
- VectorType *ValTy,
+ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
+ Type *ResTy, VectorType *ValTy,
TTI::TargetCostKind CostKind) const {
+ if (RedOpcode != Instruction::Add)
+ return InstructionCost::getInvalid(CostKind);
EVT ValVT = TLI->getValueType(DL, ValTy);
EVT ResVT = TLI->getValueType(DL, ResTy);
@@ -1939,7 +1941,8 @@ ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
return ST->getMVEVectorCostFactor(CostKind) * LT.first;
}
- return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind);
+ return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, ValTy,
+ CostKind);
}
InstructionCost
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index cdd8bcb9f741..0810c5532ed9 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -299,7 +299,8 @@ public:
VectorType *ValTy, std::optional<FastMathFlags> FMF,
TTI::TargetCostKind CostKind) const override;
InstructionCost
- getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *ValTy,
+ getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy,
+ VectorType *ValTy,
TTI::TargetCostKind CostKind) const override;
InstructionCost
diff --git a/llvm/lib/Target/ARM/CMakeLists.txt b/llvm/lib/Target/ARM/CMakeLists.txt
index a39629bd8aeb..fa778cad4af8 100644
--- a/llvm/lib/Target/ARM/CMakeLists.txt
+++ b/llvm/lib/Target/ARM/CMakeLists.txt
@@ -6,7 +6,8 @@ tablegen(LLVM ARMGenAsmMatcher.inc -gen-asm-matcher)
tablegen(LLVM ARMGenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM ARMGenCallingConv.inc -gen-callingconv)
tablegen(LLVM ARMGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM ARMGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM ARMGenDisassemblerTables.inc -gen-disassembler
+ -ignore-non-decodable-operands)
tablegen(LLVM ARMGenFastISel.inc -gen-fast-isel)
tablegen(LLVM ARMGenGlobalISel.inc -gen-global-isel)
tablegen(LLVM ARMGenInstrInfo.inc -gen-instr-info)
diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 19fa03cdc668..1d19bc89ccf9 100644
--- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -161,676 +161,13 @@ private:
// Forward declare these because the autogenerated code will reference them.
// Definitions are further down.
-static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeCLRMGPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodetGPROddRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodetGPREvenRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeGPRwithAPSR_NZCVnospRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeGPRnospRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeGPRwithAPSRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeGPRwithZRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeGPRwithZRnospRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeGPRPairnospRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeGPRspRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeDPR_8RegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeSPR_8RegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeDPR_VFP2RegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeMQPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeMQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeMQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeDPairSpacedRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeBitfieldMaskOperand(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeAddrMode2IdxInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeSORegMemOperand(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeTSBInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeSORegRegOperand(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus
-DecodeMemMultipleWritebackInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Adddress,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2MOVTWInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeArmMOVTWInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeHINTInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeTSTInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2HintSpaceInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeAddrMode5FP16Operand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2BInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeBranchImmInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLD2DupInstruction(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVMOVModImmInstruction(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeMVEModImmInstruction(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeMVEVADCInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeShiftRight8Imm(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeShiftRight16Imm(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeShiftRight32Imm(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeShiftRight64Imm(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeMveAddrModeRQ(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-template <int shift>
-static DecodeStatus DecodeMveAddrModeQ(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeInstSyncBarrierOption(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeBankedReg(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeLDRPreImm(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeLDRPreReg(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeSTRPreImm(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeSTRPreReg(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVCVTImmOperand(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeNEONComplexLane64Instruction(MCInst &Inst, unsigned Val, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbCmpBROperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbAddrModeRR(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbAddrModeIS(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbAddrModePC(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbAddrModeSP(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2Imm7S4(MCInst &Inst, unsigned Val, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2AddrModeImm7s4(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val, uint64_t Address,
- const MCDisassembler *Decoder);
-template <int shift>
-static DecodeStatus DecodeT2Imm7(MCInst &Inst, unsigned Val, uint64_t Address,
- const MCDisassembler *Decoder);
static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val,
uint64_t Address,
const MCDisassembler *Decoder);
-template <int shift>
-static DecodeStatus DecodeTAddrModeImm7(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-template <int shift, int WriteBack>
-static DecodeStatus DecodeT2AddrModeImm7(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbCPS(MCInst &Inst, uint16_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeQADDInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbTableBranch(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbBCCTargetOperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeIT(MCInst &Inst, unsigned Val, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2LDRDPreInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2STRDPreInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2Adr(MCInst &Inst, unsigned Val, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecoderForMRRC2AndMCRR2(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeForVMRSandVMSR(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-template <bool isSigned, bool isNeg, bool zeroPermitted, int size>
-static DecodeStatus DecodeBFLabelOperand(MCInst &Inst, unsigned val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeBFAfterTargetOperand(MCInst &Inst, unsigned val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodePredNoALOperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeLOLoop(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeLongShiftOperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVSCCLRM(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVPTMaskOperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVpredROperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeVpredNOperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeRestrictedIPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeRestrictedSPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeRestrictedUPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeRestrictedFPPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address,
- const MCDisassembler *Decoder);
-template <bool Writeback>
-static DecodeStatus DecodeVSTRVLDR_SYSREG(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-template <int shift>
-static DecodeStatus DecodeMVE_MEM_1_pre(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-template <int shift>
-static DecodeStatus DecodeMVE_MEM_2_pre(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-template <int shift>
-static DecodeStatus DecodeMVE_MEM_3_pre(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-template <unsigned MinLog, unsigned MaxLog>
-static DecodeStatus DecodePowerTwoOperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-template <unsigned start>
-static DecodeStatus
-DecodeMVEPairVectorIndexOperand(MCInst &Inst, unsigned Val, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeMVEVMOVQtoDReg(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeMVEVMOVDRegtoQ(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeMVEVCVTt1fp(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
typedef DecodeStatus OperandDecoder(MCInst &Inst, unsigned Val,
uint64_t Address,
const MCDisassembler *Decoder);
-template <bool scalar, OperandDecoder predicate_decoder>
-static DecodeStatus DecodeMVEVCMP(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2AddSubSPImm(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeLazyLoadStoreMul(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-#include "ARMGenDisassemblerTables.inc"
-
-static MCDisassembler *createARMDisassembler(const Target &T,
- const MCSubtargetInfo &STI,
- MCContext &Ctx) {
- return new ARMDisassembler(STI, Ctx, T.createMCInstrInfo());
-}
-
-// Post-decoding checks
-static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size,
- uint64_t Address, raw_ostream &CS,
- uint32_t Insn,
- DecodeStatus Result) {
- switch (MI.getOpcode()) {
- case ARM::HVC: {
- // HVC is undefined if condition = 0xf otherwise upredictable
- // if condition != 0xe
- uint32_t Cond = (Insn >> 28) & 0xF;
- if (Cond == 0xF)
- return MCDisassembler::Fail;
- if (Cond != 0xE)
- return MCDisassembler::SoftFail;
- return Result;
- }
- case ARM::t2ADDri:
- case ARM::t2ADDri12:
- case ARM::t2ADDrr:
- case ARM::t2ADDrs:
- case ARM::t2SUBri:
- case ARM::t2SUBri12:
- case ARM::t2SUBrr:
- case ARM::t2SUBrs:
- if (MI.getOperand(0).getReg() == ARM::SP &&
- MI.getOperand(1).getReg() != ARM::SP)
- return MCDisassembler::SoftFail;
- return Result;
- default: return Result;
- }
-}
-
-uint64_t ARMDisassembler::suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
- uint64_t Address) const {
- // In Arm state, instructions are always 4 bytes wide, so there's no
- // point in skipping any smaller number of bytes if an instruction
- // can't be decoded.
- if (!STI.hasFeature(ARM::ModeThumb))
- return 4;
-
- // In a Thumb instruction stream, a halfword is a standalone 2-byte
- // instruction if and only if its value is less than 0xE800.
- // Otherwise, it's the first halfword of a 4-byte instruction.
- //
- // So, if we can see the upcoming halfword, we can judge on that
- // basis, and maybe skip a whole 4-byte instruction that we don't
- // know how to decode, without accidentally trying to interpret its
- // second half as something else.
- //
- // If we don't have the instruction data available, we just have to
- // recommend skipping the minimum sensible distance, which is 2
- // bytes.
- if (Bytes.size() < 2)
- return 2;
-
- uint16_t Insn16 = llvm::support::endian::read<uint16_t>(
- Bytes.data(), InstructionEndianness);
- return Insn16 < 0xE800 ? 2 : 4;
-}
-
-DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
- ArrayRef<uint8_t> Bytes,
- uint64_t Address,
- raw_ostream &CS) const {
- if (STI.hasFeature(ARM::ModeThumb))
- return getThumbInstruction(MI, Size, Bytes, Address, CS);
- return getARMInstruction(MI, Size, Bytes, Address, CS);
-}
-
-DecodeStatus ARMDisassembler::getARMInstruction(MCInst &MI, uint64_t &Size,
- ArrayRef<uint8_t> Bytes,
- uint64_t Address,
- raw_ostream &CS) const {
- CommentStream = &CS;
-
- assert(!STI.hasFeature(ARM::ModeThumb) &&
- "Asked to disassemble an ARM instruction but Subtarget is in Thumb "
- "mode!");
-
- // We want to read exactly 4 bytes of data.
- if (Bytes.size() < 4) {
- Size = 0;
- return MCDisassembler::Fail;
- }
-
- // Encoded as a 32-bit word in the stream.
- uint32_t Insn = llvm::support::endian::read<uint32_t>(Bytes.data(),
- InstructionEndianness);
-
- // Calling the auto-generated decoder function.
- DecodeStatus Result =
- decodeInstruction(DecoderTableARM32, MI, Insn, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- return checkDecodedInstruction(MI, Size, Address, CS, Insn, Result);
- }
-
- struct DecodeTable {
- const uint8_t *P;
- bool DecodePred;
- };
-
- const DecodeTable Tables[] = {
- {DecoderTableVFP32, false}, {DecoderTableVFPV832, false},
- {DecoderTableNEONData32, true}, {DecoderTableNEONLoadStore32, true},
- {DecoderTableNEONDup32, true}, {DecoderTablev8NEON32, false},
- {DecoderTablev8Crypto32, false},
- };
-
- for (auto Table : Tables) {
- Result = decodeInstruction(Table.P, MI, Insn, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- // Add a fake predicate operand, because we share these instruction
- // definitions with Thumb2 where these instructions are predicable.
- if (Table.DecodePred && !DecodePredicateOperand(MI, 0xE, Address, this))
- return MCDisassembler::Fail;
- return Result;
- }
- }
-
- Result =
- decodeInstruction(DecoderTableCoProc32, MI, Insn, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- return checkDecodedInstruction(MI, Size, Address, CS, Insn, Result);
- }
-
- Size = 4;
- return MCDisassembler::Fail;
-}
/// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the
/// immediate Value in the MCInst. The immediate Value has had any PC
@@ -868,409 +205,7 @@ static void tryAddingPcLoadReferenceComment(uint64_t Address, int Value,
Decoder->tryAddingPcLoadReferenceComment(Value, Address);
}
-// Thumb1 instructions don't have explicit S bits. Rather, they
-// implicitly set CPSR. Since it's not represented in the encoding, the
-// auto-generated decoder won't inject the CPSR operand. We need to fix
-// that as a post-pass.
-void ARMDisassembler::AddThumb1SBit(MCInst &MI, bool InITBlock) const {
- const MCInstrDesc &MCID = MCII->get(MI.getOpcode());
- MCInst::iterator I = MI.begin();
- for (unsigned i = 0; i < MCID.NumOperands; ++i, ++I) {
- if (I == MI.end()) break;
- if (MCID.operands()[i].isOptionalDef() &&
- MCID.operands()[i].RegClass == ARM::CCRRegClassID) {
- if (i > 0 && MCID.operands()[i - 1].isPredicate())
- continue;
- MI.insert(I,
- MCOperand::createReg(InITBlock ? ARM::NoRegister : ARM::CPSR));
- return;
- }
- }
-
- MI.insert(I, MCOperand::createReg(InITBlock ? ARM::NoRegister : ARM::CPSR));
-}
-
-bool ARMDisassembler::isVectorPredicable(const MCInst &MI) const {
- const MCInstrDesc &MCID = MCII->get(MI.getOpcode());
- for (unsigned i = 0; i < MCID.NumOperands; ++i) {
- if (ARM::isVpred(MCID.operands()[i].OperandType))
- return true;
- }
- return false;
-}
-
-// Most Thumb instructions don't have explicit predicates in the
-// encoding, but rather get their predicates from IT context. We need
-// to fix up the predicate operands using this context information as a
-// post-pass.
-MCDisassembler::DecodeStatus
-ARMDisassembler::AddThumbPredicate(MCInst &MI) const {
- MCDisassembler::DecodeStatus S = Success;
-
- const FeatureBitset &FeatureBits = getSubtargetInfo().getFeatureBits();
-
- // A few instructions actually have predicates encoded in them. Don't
- // try to overwrite it if we're seeing one of those.
- switch (MI.getOpcode()) {
- case ARM::tBcc:
- case ARM::t2Bcc:
- case ARM::tCBZ:
- case ARM::tCBNZ:
- case ARM::tCPS:
- case ARM::t2CPS3p:
- case ARM::t2CPS2p:
- case ARM::t2CPS1p:
- case ARM::t2CSEL:
- case ARM::t2CSINC:
- case ARM::t2CSINV:
- case ARM::t2CSNEG:
- case ARM::tMOVSr:
- case ARM::tSETEND:
- // Some instructions (mostly conditional branches) are not
- // allowed in IT blocks.
- if (ITBlock.instrInITBlock())
- S = SoftFail;
- else
- return Success;
- break;
- case ARM::t2HINT:
- if (MI.getOperand(0).getImm() == 0x10 && (FeatureBits[ARM::FeatureRAS]) != 0)
- S = SoftFail;
- break;
- case ARM::tB:
- case ARM::t2B:
- case ARM::t2TBB:
- case ARM::t2TBH:
- // Some instructions (mostly unconditional branches) can
- // only appears at the end of, or outside of, an IT.
- if (ITBlock.instrInITBlock() && !ITBlock.instrLastInITBlock())
- S = SoftFail;
- break;
- default:
- break;
- }
-
- // Warn on non-VPT predicable instruction in a VPT block and a VPT
- // predicable instruction in an IT block
- if ((!isVectorPredicable(MI) && VPTBlock.instrInVPTBlock()) ||
- (isVectorPredicable(MI) && ITBlock.instrInITBlock()))
- S = SoftFail;
-
- // If we're in an IT/VPT block, base the predicate on that. Otherwise,
- // assume a predicate of AL.
- unsigned CC = ARMCC::AL;
- unsigned VCC = ARMVCC::None;
- if (ITBlock.instrInITBlock()) {
- CC = ITBlock.getITCC();
- ITBlock.advanceITState();
- } else if (VPTBlock.instrInVPTBlock()) {
- VCC = VPTBlock.getVPTPred();
- VPTBlock.advanceVPTState();
- }
-
- const MCInstrDesc &MCID = MCII->get(MI.getOpcode());
-
- MCInst::iterator CCI = MI.begin();
- for (unsigned i = 0; i < MCID.NumOperands; ++i, ++CCI) {
- if (MCID.operands()[i].isPredicate() || CCI == MI.end())
- break;
- }
-
- if (MCID.isPredicable()) {
- CCI = MI.insert(CCI, MCOperand::createImm(CC));
- ++CCI;
- if (CC == ARMCC::AL)
- MI.insert(CCI, MCOperand::createReg(ARM::NoRegister));
- else
- MI.insert(CCI, MCOperand::createReg(ARM::CPSR));
- } else if (CC != ARMCC::AL) {
- Check(S, SoftFail);
- }
-
- MCInst::iterator VCCI = MI.begin();
- unsigned VCCPos;
- for (VCCPos = 0; VCCPos < MCID.NumOperands; ++VCCPos, ++VCCI) {
- if (ARM::isVpred(MCID.operands()[VCCPos].OperandType) || VCCI == MI.end())
- break;
- }
-
- if (isVectorPredicable(MI)) {
- VCCI = MI.insert(VCCI, MCOperand::createImm(VCC));
- ++VCCI;
- if (VCC == ARMVCC::None)
- VCCI = MI.insert(VCCI, MCOperand::createReg(0));
- else
- VCCI = MI.insert(VCCI, MCOperand::createReg(ARM::P0));
- ++VCCI;
- VCCI = MI.insert(VCCI, MCOperand::createReg(0));
- ++VCCI;
- if (MCID.operands()[VCCPos].OperandType == ARM::OPERAND_VPRED_R) {
- int TiedOp = MCID.getOperandConstraint(VCCPos + 3, MCOI::TIED_TO);
- assert(TiedOp >= 0 &&
- "Inactive register in vpred_r is not tied to an output!");
- // Copy the operand to ensure it's not invalidated when MI grows.
- MI.insert(VCCI, MCOperand(MI.getOperand(TiedOp)));
- }
- } else if (VCC != ARMVCC::None) {
- Check(S, SoftFail);
- }
-
- return S;
-}
-
-// Thumb VFP instructions are a special case. Because we share their
-// encodings between ARM and Thumb modes, and they are predicable in ARM
-// mode, the auto-generated decoder will give them an (incorrect)
-// predicate operand. We need to rewrite these operands based on the IT
-// context as a post-pass.
-void ARMDisassembler::UpdateThumbVFPPredicate(
- DecodeStatus &S, MCInst &MI) const {
- unsigned CC;
- CC = ITBlock.getITCC();
- if (CC == 0xF)
- CC = ARMCC::AL;
- if (ITBlock.instrInITBlock())
- ITBlock.advanceITState();
- else if (VPTBlock.instrInVPTBlock()) {
- CC = VPTBlock.getVPTPred();
- VPTBlock.advanceVPTState();
- }
-
- const MCInstrDesc &MCID = MCII->get(MI.getOpcode());
- ArrayRef<MCOperandInfo> OpInfo = MCID.operands();
- MCInst::iterator I = MI.begin();
- unsigned short NumOps = MCID.NumOperands;
- for (unsigned i = 0; i < NumOps; ++i, ++I) {
- if (OpInfo[i].isPredicate() ) {
- if (CC != ARMCC::AL && !MCID.isPredicable())
- Check(S, SoftFail);
- I->setImm(CC);
- ++I;
- if (CC == ARMCC::AL)
- I->setReg(ARM::NoRegister);
- else
- I->setReg(ARM::CPSR);
- return;
- }
- }
-}
-
-DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size,
- ArrayRef<uint8_t> Bytes,
- uint64_t Address,
- raw_ostream &CS) const {
- CommentStream = &CS;
-
- assert(STI.hasFeature(ARM::ModeThumb) &&
- "Asked to disassemble in Thumb mode but Subtarget is in ARM mode!");
-
- // We want to read exactly 2 bytes of data.
- if (Bytes.size() < 2) {
- Size = 0;
- return MCDisassembler::Fail;
- }
-
- uint16_t Insn16 = llvm::support::endian::read<uint16_t>(
- Bytes.data(), InstructionEndianness);
- DecodeStatus Result =
- decodeInstruction(DecoderTableThumb16, MI, Insn16, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 2;
- Check(Result, AddThumbPredicate(MI));
- return Result;
- }
-
- Result = decodeInstruction(DecoderTableThumbSBit16, MI, Insn16, Address, this,
- STI);
- if (Result) {
- Size = 2;
- bool InITBlock = ITBlock.instrInITBlock();
- Check(Result, AddThumbPredicate(MI));
- AddThumb1SBit(MI, InITBlock);
- return Result;
- }
-
- Result =
- decodeInstruction(DecoderTableThumb216, MI, Insn16, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 2;
-
- // Nested IT blocks are UNPREDICTABLE. Must be checked before we add
- // the Thumb predicate.
- if (MI.getOpcode() == ARM::t2IT && ITBlock.instrInITBlock())
- Result = MCDisassembler::SoftFail;
-
- Check(Result, AddThumbPredicate(MI));
-
- // If we find an IT instruction, we need to parse its condition
- // code and mask operands so that we can apply them correctly
- // to the subsequent instructions.
- if (MI.getOpcode() == ARM::t2IT) {
- unsigned Firstcond = MI.getOperand(0).getImm();
- unsigned Mask = MI.getOperand(1).getImm();
- ITBlock.setITState(Firstcond, Mask);
-
- // An IT instruction that would give a 'NV' predicate is unpredictable.
- if (Firstcond == ARMCC::AL && !isPowerOf2_32(Mask))
- CS << "unpredictable IT predicate sequence";
- }
-
- return Result;
- }
-
- // We want to read exactly 4 bytes of data.
- if (Bytes.size() < 4) {
- Size = 0;
- return MCDisassembler::Fail;
- }
-
- uint32_t Insn32 =
- (uint32_t(Insn16) << 16) | llvm::support::endian::read<uint16_t>(
- Bytes.data() + 2, InstructionEndianness);
-
- Result =
- decodeInstruction(DecoderTableMVE32, MI, Insn32, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
-
- // Nested VPT blocks are UNPREDICTABLE. Must be checked before we add
- // the VPT predicate.
- if (isVPTOpcode(MI.getOpcode()) && VPTBlock.instrInVPTBlock())
- Result = MCDisassembler::SoftFail;
-
- Check(Result, AddThumbPredicate(MI));
-
- if (isVPTOpcode(MI.getOpcode())) {
- unsigned Mask = MI.getOperand(0).getImm();
- VPTBlock.setVPTState(Mask);
- }
-
- return Result;
- }
-
- Result =
- decodeInstruction(DecoderTableThumb32, MI, Insn32, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- bool InITBlock = ITBlock.instrInITBlock();
- Check(Result, AddThumbPredicate(MI));
- AddThumb1SBit(MI, InITBlock);
- return Result;
- }
-
- Result =
- decodeInstruction(DecoderTableThumb232, MI, Insn32, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- Check(Result, AddThumbPredicate(MI));
- return checkDecodedInstruction(MI, Size, Address, CS, Insn32, Result);
- }
-
- if (fieldFromInstruction(Insn32, 28, 4) == 0xE) {
- Result =
- decodeInstruction(DecoderTableVFP32, MI, Insn32, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- UpdateThumbVFPPredicate(Result, MI);
- return Result;
- }
- }
-
- Result =
- decodeInstruction(DecoderTableVFPV832, MI, Insn32, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- return Result;
- }
-
- if (fieldFromInstruction(Insn32, 28, 4) == 0xE) {
- Result = decodeInstruction(DecoderTableNEONDup32, MI, Insn32, Address, this,
- STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- Check(Result, AddThumbPredicate(MI));
- return Result;
- }
- }
-
- if (fieldFromInstruction(Insn32, 24, 8) == 0xF9) {
- uint32_t NEONLdStInsn = Insn32;
- NEONLdStInsn &= 0xF0FFFFFF;
- NEONLdStInsn |= 0x04000000;
- Result = decodeInstruction(DecoderTableNEONLoadStore32, MI, NEONLdStInsn,
- Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- Check(Result, AddThumbPredicate(MI));
- return Result;
- }
- }
-
- if (fieldFromInstruction(Insn32, 24, 4) == 0xF) {
- uint32_t NEONDataInsn = Insn32;
- NEONDataInsn &= 0xF0FFFFFF; // Clear bits 27-24
- NEONDataInsn |= (NEONDataInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
- NEONDataInsn |= 0x12000000; // Set bits 28 and 25
- Result = decodeInstruction(DecoderTableNEONData32, MI, NEONDataInsn,
- Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- Check(Result, AddThumbPredicate(MI));
- return Result;
- }
-
- uint32_t NEONCryptoInsn = Insn32;
- NEONCryptoInsn &= 0xF0FFFFFF; // Clear bits 27-24
- NEONCryptoInsn |= (NEONCryptoInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
- NEONCryptoInsn |= 0x12000000; // Set bits 28 and 25
- Result = decodeInstruction(DecoderTablev8Crypto32, MI, NEONCryptoInsn,
- Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- return Result;
- }
-
- uint32_t NEONv8Insn = Insn32;
- NEONv8Insn &= 0xF3FFFFFF; // Clear bits 27-26
- Result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address,
- this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- return Result;
- }
- }
-
- uint32_t Coproc = fieldFromInstruction(Insn32, 8, 4);
- const uint8_t *DecoderTable = ARM::isCDECoproc(Coproc, STI)
- ? DecoderTableThumb2CDE32
- : DecoderTableThumb2CoProc32;
- Result =
- decodeInstruction(DecoderTable, MI, Insn32, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- Check(Result, AddThumbPredicate(MI));
- return Result;
- }
-
- // Advance IT state to prevent next instruction inheriting
- // the wrong IT state.
- if (ITBlock.instrInITBlock())
- ITBlock.advanceITState();
- Size = 0;
- return MCDisassembler::Fail;
-}
-
-extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
-LLVMInitializeARMDisassembler() {
- TargetRegistry::RegisterMCDisassembler(getTheARMLETarget(),
- createARMDisassembler);
- TargetRegistry::RegisterMCDisassembler(getTheARMBETarget(),
- createARMDisassembler);
- TargetRegistry::RegisterMCDisassembler(getTheThumbLETarget(),
- createARMDisassembler);
- TargetRegistry::RegisterMCDisassembler(getTheThumbBETarget(),
- createARMDisassembler);
-}
+// Register class decoding functions.
static const uint16_t GPRDecoderTable[] = {
ARM::R0, ARM::R1, ARM::R2, ARM::R3,
@@ -1626,6 +561,51 @@ DecodeDPairSpacedRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeMQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ if (RegNo > 7)
+ return MCDisassembler::Fail;
+
+ unsigned Register = QPRDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return MCDisassembler::Success;
+}
+
+static const MCPhysReg QQPRDecoderTable[] = {
+ ARM::Q0_Q1, ARM::Q1_Q2, ARM::Q2_Q3, ARM::Q3_Q4,
+ ARM::Q4_Q5, ARM::Q5_Q6, ARM::Q6_Q7
+};
+
+static DecodeStatus DecodeMQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ if (RegNo > 6)
+ return MCDisassembler::Fail;
+
+ unsigned Register = QQPRDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return MCDisassembler::Success;
+}
+
+static const MCPhysReg QQQQPRDecoderTable[] = {
+ ARM::Q0_Q1_Q2_Q3, ARM::Q1_Q2_Q3_Q4, ARM::Q2_Q3_Q4_Q5,
+ ARM::Q3_Q4_Q5_Q6, ARM::Q4_Q5_Q6_Q7
+};
+
+static DecodeStatus DecodeMQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ if (RegNo > 4)
+ return MCDisassembler::Fail;
+
+ unsigned Register = QQQQPRDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return MCDisassembler::Success;
+}
+
+// Operand decoding functions.
+
static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -2422,6 +1402,54 @@ static DecodeStatus DecodeRFEInstruction(MCInst &Inst, unsigned Insn,
return S;
}
+static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ unsigned imod = fieldFromInstruction(Insn, 18, 2);
+ unsigned M = fieldFromInstruction(Insn, 17, 1);
+ unsigned iflags = fieldFromInstruction(Insn, 6, 3);
+ unsigned mode = fieldFromInstruction(Insn, 0, 5);
+
+ DecodeStatus S = MCDisassembler::Success;
+
+ // This decoder is called from multiple location that do not check
+ // the full encoding is valid before they do.
+ if (fieldFromInstruction(Insn, 5, 1) != 0 ||
+ fieldFromInstruction(Insn, 16, 1) != 0 ||
+ fieldFromInstruction(Insn, 20, 8) != 0x10)
+ return MCDisassembler::Fail;
+
+ // imod == '01' --> UNPREDICTABLE
+ // NOTE: Even though this is technically UNPREDICTABLE, we choose to
+ // return failure here. The '01' imod value is unprintable, so there's
+ // nothing useful we could do even if we returned UNPREDICTABLE.
+
+ if (imod == 1) return MCDisassembler::Fail;
+
+ if (imod && M) {
+ Inst.setOpcode(ARM::CPS3p);
+ Inst.addOperand(MCOperand::createImm(imod));
+ Inst.addOperand(MCOperand::createImm(iflags));
+ Inst.addOperand(MCOperand::createImm(mode));
+ } else if (imod && !M) {
+ Inst.setOpcode(ARM::CPS2p);
+ Inst.addOperand(MCOperand::createImm(imod));
+ Inst.addOperand(MCOperand::createImm(iflags));
+ if (mode) S = MCDisassembler::SoftFail;
+ } else if (!imod && M) {
+ Inst.setOpcode(ARM::CPS1p);
+ Inst.addOperand(MCOperand::createImm(mode));
+ if (iflags) S = MCDisassembler::SoftFail;
+ } else {
+ // imod == '00' && M == '0' --> UNPREDICTABLE
+ Inst.setOpcode(ARM::CPS1p);
+ Inst.addOperand(MCOperand::createImm(mode));
+ S = MCDisassembler::SoftFail;
+ }
+
+ return S;
+}
+
static DecodeStatus DecodeQADDInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -2562,54 +1590,6 @@ static DecodeStatus DecodeHINTInstruction(MCInst &Inst, unsigned Insn,
return S;
}
-static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned imod = fieldFromInstruction(Insn, 18, 2);
- unsigned M = fieldFromInstruction(Insn, 17, 1);
- unsigned iflags = fieldFromInstruction(Insn, 6, 3);
- unsigned mode = fieldFromInstruction(Insn, 0, 5);
-
- DecodeStatus S = MCDisassembler::Success;
-
- // This decoder is called from multiple location that do not check
- // the full encoding is valid before they do.
- if (fieldFromInstruction(Insn, 5, 1) != 0 ||
- fieldFromInstruction(Insn, 16, 1) != 0 ||
- fieldFromInstruction(Insn, 20, 8) != 0x10)
- return MCDisassembler::Fail;
-
- // imod == '01' --> UNPREDICTABLE
- // NOTE: Even though this is technically UNPREDICTABLE, we choose to
- // return failure here. The '01' imod value is unprintable, so there's
- // nothing useful we could do even if we returned UNPREDICTABLE.
-
- if (imod == 1) return MCDisassembler::Fail;
-
- if (imod && M) {
- Inst.setOpcode(ARM::CPS3p);
- Inst.addOperand(MCOperand::createImm(imod));
- Inst.addOperand(MCOperand::createImm(iflags));
- Inst.addOperand(MCOperand::createImm(mode));
- } else if (imod && !M) {
- Inst.setOpcode(ARM::CPS2p);
- Inst.addOperand(MCOperand::createImm(imod));
- Inst.addOperand(MCOperand::createImm(iflags));
- if (mode) S = MCDisassembler::SoftFail;
- } else if (!imod && M) {
- Inst.setOpcode(ARM::CPS1p);
- Inst.addOperand(MCOperand::createImm(mode));
- if (iflags) S = MCDisassembler::SoftFail;
- } else {
- // imod == '00' && M == '0' --> UNPREDICTABLE
- Inst.setOpcode(ARM::CPS1p);
- Inst.addOperand(MCOperand::createImm(mode));
- S = MCDisassembler::SoftFail;
- }
-
- return S;
-}
-
static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -2760,28 +1740,6 @@ static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn,
return S;
}
-static DecodeStatus DecodeTSTInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- DecodeStatus S = MCDisassembler::Success;
-
- unsigned Pred = fieldFromInstruction(Insn, 28, 4);
- unsigned Rn = fieldFromInstruction(Insn, 16, 4);
- unsigned Rm = fieldFromInstruction(Insn, 0, 4);
-
- if (Pred == 0xF)
- return DecodeSETPANInstruction(Inst, Insn, Address, Decoder);
-
- if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
- return MCDisassembler::Fail;
- if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
- return MCDisassembler::Fail;
- if (!Check(S, DecodePredicateOperand(Inst, Pred, Address, Decoder)))
- return MCDisassembler::Fail;
-
- return S;
-}
-
static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -2811,6 +1769,28 @@ static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn,
return S;
}
+static DecodeStatus DecodeTSTInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Pred = fieldFromInstruction(Insn, 28, 4);
+ unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+ unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+
+ if (Pred == 0xF)
+ return DecodeSETPANInstruction(Inst, Insn, Address, Decoder);
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodePredicateOperand(Inst, Pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -3232,61 +2212,6 @@ static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Insn,
return S;
}
-static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned type = fieldFromInstruction(Insn, 8, 4);
- unsigned align = fieldFromInstruction(Insn, 4, 2);
- if (type == 6 && (align & 2)) return MCDisassembler::Fail;
- if (type == 7 && (align & 2)) return MCDisassembler::Fail;
- if (type == 10 && align == 3) return MCDisassembler::Fail;
-
- unsigned load = fieldFromInstruction(Insn, 21, 1);
- return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
- : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
-}
-
-static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned size = fieldFromInstruction(Insn, 6, 2);
- if (size == 3) return MCDisassembler::Fail;
-
- unsigned type = fieldFromInstruction(Insn, 8, 4);
- unsigned align = fieldFromInstruction(Insn, 4, 2);
- if (type == 8 && align == 3) return MCDisassembler::Fail;
- if (type == 9 && align == 3) return MCDisassembler::Fail;
-
- unsigned load = fieldFromInstruction(Insn, 21, 1);
- return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
- : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
-}
-
-static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned size = fieldFromInstruction(Insn, 6, 2);
- if (size == 3) return MCDisassembler::Fail;
-
- unsigned align = fieldFromInstruction(Insn, 4, 2);
- if (align & 2) return MCDisassembler::Fail;
-
- unsigned load = fieldFromInstruction(Insn, 21, 1);
- return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
- : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
-}
-
-static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned size = fieldFromInstruction(Insn, 6, 2);
- if (size == 3) return MCDisassembler::Fail;
-
- unsigned load = fieldFromInstruction(Insn, 21, 1);
- return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
- : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
-}
-
static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -3558,6 +2483,61 @@ static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Insn,
return S;
}
+static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ unsigned type = fieldFromInstruction(Insn, 8, 4);
+ unsigned align = fieldFromInstruction(Insn, 4, 2);
+ if (type == 6 && (align & 2)) return MCDisassembler::Fail;
+ if (type == 7 && (align & 2)) return MCDisassembler::Fail;
+ if (type == 10 && align == 3) return MCDisassembler::Fail;
+
+ unsigned load = fieldFromInstruction(Insn, 21, 1);
+ return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+ : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ unsigned size = fieldFromInstruction(Insn, 6, 2);
+ if (size == 3) return MCDisassembler::Fail;
+
+ unsigned type = fieldFromInstruction(Insn, 8, 4);
+ unsigned align = fieldFromInstruction(Insn, 4, 2);
+ if (type == 8 && align == 3) return MCDisassembler::Fail;
+ if (type == 9 && align == 3) return MCDisassembler::Fail;
+
+ unsigned load = fieldFromInstruction(Insn, 21, 1);
+ return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+ : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ unsigned size = fieldFromInstruction(Insn, 6, 2);
+ if (size == 3) return MCDisassembler::Fail;
+
+ unsigned align = fieldFromInstruction(Insn, 4, 2);
+ if (align & 2) return MCDisassembler::Fail;
+
+ unsigned load = fieldFromInstruction(Insn, 21, 1);
+ return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+ : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ unsigned size = fieldFromInstruction(Insn, 6, 2);
+ if (size == 3) return MCDisassembler::Fail;
+
+ unsigned load = fieldFromInstruction(Insn, 21, 1);
+ return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+ : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -4063,6 +3043,60 @@ static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val,
return S;
}
+static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+ unsigned U = fieldFromInstruction(Insn, 23, 1);
+ int imm = fieldFromInstruction(Insn, 0, 12);
+
+ const FeatureBitset &featureBits =
+ Decoder->getSubtargetInfo().getFeatureBits();
+
+ bool hasV7Ops = featureBits[ARM::HasV7Ops];
+
+ if (Rt == 15) {
+ switch (Inst.getOpcode()) {
+ case ARM::t2LDRBpci:
+ case ARM::t2LDRHpci:
+ Inst.setOpcode(ARM::t2PLDpci);
+ break;
+ case ARM::t2LDRSBpci:
+ Inst.setOpcode(ARM::t2PLIpci);
+ break;
+ case ARM::t2LDRSHpci:
+ return MCDisassembler::Fail;
+ default:
+ break;
+ }
+ }
+
+ switch(Inst.getOpcode()) {
+ case ARM::t2PLDpci:
+ break;
+ case ARM::t2PLIpci:
+ if (!hasV7Ops)
+ return MCDisassembler::Fail;
+ break;
+ default:
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ if (!U) {
+ // Special case for #-0.
+ if (imm == 0)
+ imm = INT32_MIN;
+ else
+ imm = -imm;
+ }
+ Inst.addOperand(MCOperand::createImm(imm));
+
+ return S;
+}
+
static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -4232,6 +3266,33 @@ static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn,
return S;
}
+static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction(Val, 13, 4);
+ unsigned imm = fieldFromInstruction(Val, 0, 12);
+
+ // Thumb stores cannot use PC as the base register (Rn).
+ switch (Inst.getOpcode()) {
+ case ARM::t2STRi12:
+ case ARM::t2STRBi12:
+ case ARM::t2STRHi12:
+ if (Rn == 15)
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::createImm(imm));
+
+ return S;
+}
+
static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -4352,60 +3413,6 @@ static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn, uint64_t Address,
return S;
}
-static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- DecodeStatus S = MCDisassembler::Success;
-
- unsigned Rt = fieldFromInstruction(Insn, 12, 4);
- unsigned U = fieldFromInstruction(Insn, 23, 1);
- int imm = fieldFromInstruction(Insn, 0, 12);
-
- const FeatureBitset &featureBits =
- Decoder->getSubtargetInfo().getFeatureBits();
-
- bool hasV7Ops = featureBits[ARM::HasV7Ops];
-
- if (Rt == 15) {
- switch (Inst.getOpcode()) {
- case ARM::t2LDRBpci:
- case ARM::t2LDRHpci:
- Inst.setOpcode(ARM::t2PLDpci);
- break;
- case ARM::t2LDRSBpci:
- Inst.setOpcode(ARM::t2PLIpci);
- break;
- case ARM::t2LDRSHpci:
- return MCDisassembler::Fail;
- default:
- break;
- }
- }
-
- switch(Inst.getOpcode()) {
- case ARM::t2PLDpci:
- break;
- case ARM::t2PLIpci:
- if (!hasV7Ops)
- return MCDisassembler::Fail;
- break;
- default:
- if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
- return MCDisassembler::Fail;
- }
-
- if (!U) {
- // Special case for #-0.
- if (imm == 0)
- imm = INT32_MIN;
- else
- imm = -imm;
- }
- Inst.addOperand(MCOperand::createImm(imm));
-
- return S;
-}
-
static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val, uint64_t Address,
const MCDisassembler *Decoder) {
if (Val == 0)
@@ -4655,33 +3662,6 @@ static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Insn,
return S;
}
-static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- DecodeStatus S = MCDisassembler::Success;
-
- unsigned Rn = fieldFromInstruction(Val, 13, 4);
- unsigned imm = fieldFromInstruction(Val, 0, 12);
-
- // Thumb stores cannot use PC as dest register.
- switch (Inst.getOpcode()) {
- case ARM::t2STRi12:
- case ARM::t2STRBi12:
- case ARM::t2STRHi12:
- if (Rn == 15)
- return MCDisassembler::Fail;
- break;
- default:
- break;
- }
-
- if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
- return MCDisassembler::Fail;
- Inst.addOperand(MCOperand::createImm(imm));
-
- return S;
-}
-
static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -4844,6 +3824,16 @@ static DecodeStatus DecodeThumbTableBranch(MCInst &Inst, unsigned Insn,
return S;
}
+static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ if (Val & ~0xf)
+ return MCDisassembler::Fail;
+
+ Inst.addOperand(MCOperand::createImm(Val));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -4951,16 +3941,6 @@ static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val,
return MCDisassembler::Success;
}
-static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- if (Val & ~0xf)
- return MCDisassembler::Fail;
-
- Inst.addOperand(MCOperand::createImm(Val));
- return MCDisassembler::Success;
-}
-
static DecodeStatus DecodeInstSyncBarrierOption(MCInst &Inst, unsigned Val,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -6475,49 +5455,6 @@ static DecodeStatus DecodeVSCCLRM(MCInst &Inst, unsigned Insn, uint64_t Address,
return S;
}
-static DecodeStatus DecodeMQPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- if (RegNo > 7)
- return MCDisassembler::Fail;
-
- unsigned Register = QPRDecoderTable[RegNo];
- Inst.addOperand(MCOperand::createReg(Register));
- return MCDisassembler::Success;
-}
-
-static const MCPhysReg QQPRDecoderTable[] = {
- ARM::Q0_Q1, ARM::Q1_Q2, ARM::Q2_Q3, ARM::Q3_Q4,
- ARM::Q4_Q5, ARM::Q5_Q6, ARM::Q6_Q7
-};
-
-static DecodeStatus DecodeMQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- if (RegNo > 6)
- return MCDisassembler::Fail;
-
- unsigned Register = QQPRDecoderTable[RegNo];
- Inst.addOperand(MCOperand::createReg(Register));
- return MCDisassembler::Success;
-}
-
-static const MCPhysReg QQQQPRDecoderTable[] = {
- ARM::Q0_Q1_Q2_Q3, ARM::Q1_Q2_Q3_Q4, ARM::Q2_Q3_Q4_Q5,
- ARM::Q3_Q4_Q5_Q6, ARM::Q4_Q5_Q6_Q7
-};
-
-static DecodeStatus DecodeMQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- if (RegNo > 4)
- return MCDisassembler::Fail;
-
- unsigned Register = QQQQPRDecoderTable[RegNo];
- Inst.addOperand(MCOperand::createReg(Register));
- return MCDisassembler::Success;
-}
-
static DecodeStatus DecodeVPTMaskOperand(MCInst &Inst, unsigned Val,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -7069,3 +6006,547 @@ static DecodeStatus DecodeLazyLoadStoreMul(MCInst &Inst, unsigned Insn,
return S;
}
+
+#include "ARMGenDisassemblerTables.inc"
+
+// Post-decoding checks
+static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size,
+ uint64_t Address, raw_ostream &CS,
+ uint32_t Insn,
+ DecodeStatus Result) {
+ switch (MI.getOpcode()) {
+ case ARM::HVC: {
+ // HVC is undefined if condition = 0xf, otherwise unpredictable
+ // if condition != 0xe
+ uint32_t Cond = (Insn >> 28) & 0xF;
+ if (Cond == 0xF)
+ return MCDisassembler::Fail;
+ if (Cond != 0xE)
+ return MCDisassembler::SoftFail;
+ return Result;
+ }
+ case ARM::t2ADDri:
+ case ARM::t2ADDri12:
+ case ARM::t2ADDrr:
+ case ARM::t2ADDrs:
+ case ARM::t2SUBri:
+ case ARM::t2SUBri12:
+ case ARM::t2SUBrr:
+ case ARM::t2SUBrs:
+ if (MI.getOperand(0).getReg() == ARM::SP &&
+ MI.getOperand(1).getReg() != ARM::SP)
+ return MCDisassembler::SoftFail;
+ return Result;
+ default: return Result;
+ }
+}
+
+uint64_t ARMDisassembler::suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
+ uint64_t Address) const {
+ // In Arm state, instructions are always 4 bytes wide, so there's no
+ // point in skipping any smaller number of bytes if an instruction
+ // can't be decoded.
+ if (!STI.hasFeature(ARM::ModeThumb))
+ return 4;
+
+ // In a Thumb instruction stream, a halfword is a standalone 2-byte
+ // instruction if and only if its value is less than 0xE800.
+ // Otherwise, it's the first halfword of a 4-byte instruction.
+ //
+ // So, if we can see the upcoming halfword, we can judge on that
+ // basis, and maybe skip a whole 4-byte instruction that we don't
+ // know how to decode, without accidentally trying to interpret its
+ // second half as something else.
+ //
+ // If we don't have the instruction data available, we just have to
+ // recommend skipping the minimum sensible distance, which is 2
+ // bytes.
+ if (Bytes.size() < 2)
+ return 2;
+
+ uint16_t Insn16 = llvm::support::endian::read<uint16_t>(
+ Bytes.data(), InstructionEndianness);
+ return Insn16 < 0xE800 ? 2 : 4;
+}
+
+DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &CS) const {
+ if (STI.hasFeature(ARM::ModeThumb))
+ return getThumbInstruction(MI, Size, Bytes, Address, CS);
+ return getARMInstruction(MI, Size, Bytes, Address, CS);
+}
+
+DecodeStatus ARMDisassembler::getARMInstruction(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &CS) const {
+ CommentStream = &CS;
+
+ assert(!STI.hasFeature(ARM::ModeThumb) &&
+ "Asked to disassemble an ARM instruction but Subtarget is in Thumb "
+ "mode!");
+
+ // We want to read exactly 4 bytes of data.
+ if (Bytes.size() < 4) {
+ Size = 0;
+ return MCDisassembler::Fail;
+ }
+
+ // Encoded as a 32-bit word in the stream.
+ uint32_t Insn = llvm::support::endian::read<uint32_t>(Bytes.data(),
+ InstructionEndianness);
+
+ // Calling the auto-generated decoder function.
+ DecodeStatus Result =
+ decodeInstruction(DecoderTableARM32, MI, Insn, Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ return checkDecodedInstruction(MI, Size, Address, CS, Insn, Result);
+ }
+
+ struct DecodeTable {
+ const uint8_t *P;
+ bool DecodePred;
+ };
+
+ const DecodeTable Tables[] = {
+ {DecoderTableVFP32, false}, {DecoderTableVFPV832, false},
+ {DecoderTableNEONData32, true}, {DecoderTableNEONLoadStore32, true},
+ {DecoderTableNEONDup32, true}, {DecoderTablev8NEON32, false},
+ {DecoderTablev8Crypto32, false},
+ };
+
+ for (auto Table : Tables) {
+ Result = decodeInstruction(Table.P, MI, Insn, Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ // Add a fake predicate operand, because we share these instruction
+ // definitions with Thumb2 where these instructions are predicable.
+ if (Table.DecodePred && !DecodePredicateOperand(MI, 0xE, Address, this))
+ return MCDisassembler::Fail;
+ return Result;
+ }
+ }
+
+ Result =
+ decodeInstruction(DecoderTableCoProc32, MI, Insn, Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ return checkDecodedInstruction(MI, Size, Address, CS, Insn, Result);
+ }
+
+ Size = 4;
+ return MCDisassembler::Fail;
+}
+
+// Thumb1 instructions don't have explicit S bits. Rather, they
+// implicitly set CPSR. Since it's not represented in the encoding, the
+// auto-generated decoder won't inject the CPSR operand. We need to fix
+// that as a post-pass.
+void ARMDisassembler::AddThumb1SBit(MCInst &MI, bool InITBlock) const {
+ const MCInstrDesc &MCID = MCII->get(MI.getOpcode());
+ MCInst::iterator I = MI.begin();
+ for (unsigned i = 0; i < MCID.NumOperands; ++i, ++I) {
+ if (I == MI.end()) break;
+ if (MCID.operands()[i].isOptionalDef() &&
+ MCID.operands()[i].RegClass == ARM::CCRRegClassID) {
+ if (i > 0 && MCID.operands()[i - 1].isPredicate())
+ continue;
+ MI.insert(I,
+ MCOperand::createReg(InITBlock ? ARM::NoRegister : ARM::CPSR));
+ return;
+ }
+ }
+
+ MI.insert(I, MCOperand::createReg(InITBlock ? ARM::NoRegister : ARM::CPSR));
+}
+
+bool ARMDisassembler::isVectorPredicable(const MCInst &MI) const {
+ const MCInstrDesc &MCID = MCII->get(MI.getOpcode());
+ for (unsigned i = 0; i < MCID.NumOperands; ++i) {
+ if (ARM::isVpred(MCID.operands()[i].OperandType))
+ return true;
+ }
+ return false;
+}
+
+// Most Thumb instructions don't have explicit predicates in the
+// encoding, but rather get their predicates from IT context. We need
+// to fix up the predicate operands using this context information as a
+// post-pass.
+MCDisassembler::DecodeStatus
+ARMDisassembler::AddThumbPredicate(MCInst &MI) const {
+ MCDisassembler::DecodeStatus S = Success;
+
+ const FeatureBitset &FeatureBits = getSubtargetInfo().getFeatureBits();
+
+ // A few instructions actually have predicates encoded in them. Don't
+ // try to overwrite it if we're seeing one of those.
+ switch (MI.getOpcode()) {
+ case ARM::tBcc:
+ case ARM::t2Bcc:
+ case ARM::tCBZ:
+ case ARM::tCBNZ:
+ case ARM::tCPS:
+ case ARM::t2CPS3p:
+ case ARM::t2CPS2p:
+ case ARM::t2CPS1p:
+ case ARM::t2CSEL:
+ case ARM::t2CSINC:
+ case ARM::t2CSINV:
+ case ARM::t2CSNEG:
+ case ARM::tMOVSr:
+ case ARM::tSETEND:
+ // Some instructions (mostly conditional branches) are not
+ // allowed in IT blocks.
+ if (ITBlock.instrInITBlock())
+ S = SoftFail;
+ else
+ return Success;
+ break;
+ case ARM::t2HINT:
+ if (MI.getOperand(0).getImm() == 0x10 && (FeatureBits[ARM::FeatureRAS]) != 0)
+ S = SoftFail;
+ break;
+ case ARM::tB:
+ case ARM::t2B:
+ case ARM::t2TBB:
+ case ARM::t2TBH:
+ // Some instructions (mostly unconditional branches) can
+ // only appear at the end of, or outside of, an IT block.
+ if (ITBlock.instrInITBlock() && !ITBlock.instrLastInITBlock())
+ S = SoftFail;
+ break;
+ default:
+ break;
+ }
+
+ // Warn on non-VPT predicable instruction in a VPT block and a VPT
+ // predicable instruction in an IT block
+ if ((!isVectorPredicable(MI) && VPTBlock.instrInVPTBlock()) ||
+ (isVectorPredicable(MI) && ITBlock.instrInITBlock()))
+ S = SoftFail;
+
+ // If we're in an IT/VPT block, base the predicate on that. Otherwise,
+ // assume a predicate of AL.
+ unsigned CC = ARMCC::AL;
+ unsigned VCC = ARMVCC::None;
+ if (ITBlock.instrInITBlock()) {
+ CC = ITBlock.getITCC();
+ ITBlock.advanceITState();
+ } else if (VPTBlock.instrInVPTBlock()) {
+ VCC = VPTBlock.getVPTPred();
+ VPTBlock.advanceVPTState();
+ }
+
+ const MCInstrDesc &MCID = MCII->get(MI.getOpcode());
+
+ MCInst::iterator CCI = MI.begin();
+ for (unsigned i = 0; i < MCID.NumOperands; ++i, ++CCI) {
+ if (MCID.operands()[i].isPredicate() || CCI == MI.end())
+ break;
+ }
+
+ if (MCID.isPredicable()) {
+ CCI = MI.insert(CCI, MCOperand::createImm(CC));
+ ++CCI;
+ if (CC == ARMCC::AL)
+ MI.insert(CCI, MCOperand::createReg(ARM::NoRegister));
+ else
+ MI.insert(CCI, MCOperand::createReg(ARM::CPSR));
+ } else if (CC != ARMCC::AL) {
+ Check(S, SoftFail);
+ }
+
+ MCInst::iterator VCCI = MI.begin();
+ unsigned VCCPos;
+ for (VCCPos = 0; VCCPos < MCID.NumOperands; ++VCCPos, ++VCCI) {
+ if (ARM::isVpred(MCID.operands()[VCCPos].OperandType) || VCCI == MI.end())
+ break;
+ }
+
+ if (isVectorPredicable(MI)) {
+ VCCI = MI.insert(VCCI, MCOperand::createImm(VCC));
+ ++VCCI;
+ if (VCC == ARMVCC::None)
+ VCCI = MI.insert(VCCI, MCOperand::createReg(0));
+ else
+ VCCI = MI.insert(VCCI, MCOperand::createReg(ARM::P0));
+ ++VCCI;
+ VCCI = MI.insert(VCCI, MCOperand::createReg(0));
+ ++VCCI;
+ if (MCID.operands()[VCCPos].OperandType == ARM::OPERAND_VPRED_R) {
+ int TiedOp = MCID.getOperandConstraint(VCCPos + 3, MCOI::TIED_TO);
+ assert(TiedOp >= 0 &&
+ "Inactive register in vpred_r is not tied to an output!");
+ // Copy the operand to ensure it's not invalidated when MI grows.
+ MI.insert(VCCI, MCOperand(MI.getOperand(TiedOp)));
+ }
+ } else if (VCC != ARMVCC::None) {
+ Check(S, SoftFail);
+ }
+
+ return S;
+}
+
+// Thumb VFP instructions are a special case. Because we share their
+// encodings between ARM and Thumb modes, and they are predicable in ARM
+// mode, the auto-generated decoder will give them an (incorrect)
+// predicate operand. We need to rewrite these operands based on the IT
+// context as a post-pass.
+void ARMDisassembler::UpdateThumbVFPPredicate(
+ DecodeStatus &S, MCInst &MI) const {
+ unsigned CC;
+ CC = ITBlock.getITCC();
+ if (CC == 0xF)
+ CC = ARMCC::AL;
+ if (ITBlock.instrInITBlock())
+ ITBlock.advanceITState();
+ else if (VPTBlock.instrInVPTBlock()) {
+ CC = VPTBlock.getVPTPred();
+ VPTBlock.advanceVPTState();
+ }
+
+ const MCInstrDesc &MCID = MCII->get(MI.getOpcode());
+ ArrayRef<MCOperandInfo> OpInfo = MCID.operands();
+ MCInst::iterator I = MI.begin();
+ unsigned short NumOps = MCID.NumOperands;
+ for (unsigned i = 0; i < NumOps; ++i, ++I) {
+ if (OpInfo[i].isPredicate() ) {
+ if (CC != ARMCC::AL && !MCID.isPredicable())
+ Check(S, SoftFail);
+ I->setImm(CC);
+ ++I;
+ if (CC == ARMCC::AL)
+ I->setReg(ARM::NoRegister);
+ else
+ I->setReg(ARM::CPSR);
+ return;
+ }
+ }
+}
+
+DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &CS) const {
+ CommentStream = &CS;
+
+ assert(STI.hasFeature(ARM::ModeThumb) &&
+ "Asked to disassemble in Thumb mode but Subtarget is in ARM mode!");
+
+ // We want to read exactly 2 bytes of data.
+ if (Bytes.size() < 2) {
+ Size = 0;
+ return MCDisassembler::Fail;
+ }
+
+ uint16_t Insn16 = llvm::support::endian::read<uint16_t>(
+ Bytes.data(), InstructionEndianness);
+ DecodeStatus Result =
+ decodeInstruction(DecoderTableThumb16, MI, Insn16, Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 2;
+ Check(Result, AddThumbPredicate(MI));
+ return Result;
+ }
+
+ Result = decodeInstruction(DecoderTableThumbSBit16, MI, Insn16, Address, this,
+ STI);
+ if (Result) {
+ Size = 2;
+ bool InITBlock = ITBlock.instrInITBlock();
+ Check(Result, AddThumbPredicate(MI));
+ AddThumb1SBit(MI, InITBlock);
+ return Result;
+ }
+
+ Result =
+ decodeInstruction(DecoderTableThumb216, MI, Insn16, Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 2;
+
+ // Nested IT blocks are UNPREDICTABLE. Must be checked before we add
+ // the Thumb predicate.
+ if (MI.getOpcode() == ARM::t2IT && ITBlock.instrInITBlock())
+ Result = MCDisassembler::SoftFail;
+
+ Check(Result, AddThumbPredicate(MI));
+
+ // If we find an IT instruction, we need to parse its condition
+ // code and mask operands so that we can apply them correctly
+ // to the subsequent instructions.
+ if (MI.getOpcode() == ARM::t2IT) {
+ unsigned Firstcond = MI.getOperand(0).getImm();
+ unsigned Mask = MI.getOperand(1).getImm();
+ ITBlock.setITState(Firstcond, Mask);
+
+ // An IT instruction that would give a 'NV' predicate is unpredictable.
+ if (Firstcond == ARMCC::AL && !isPowerOf2_32(Mask))
+ CS << "unpredictable IT predicate sequence";
+ }
+
+ return Result;
+ }
+
+ // We want to read exactly 4 bytes of data.
+ if (Bytes.size() < 4) {
+ Size = 0;
+ return MCDisassembler::Fail;
+ }
+
+ uint32_t Insn32 =
+ (uint32_t(Insn16) << 16) | llvm::support::endian::read<uint16_t>(
+ Bytes.data() + 2, InstructionEndianness);
+
+ Result =
+ decodeInstruction(DecoderTableMVE32, MI, Insn32, Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+
+ // Nested VPT blocks are UNPREDICTABLE. Must be checked before we add
+ // the VPT predicate.
+ if (isVPTOpcode(MI.getOpcode()) && VPTBlock.instrInVPTBlock())
+ Result = MCDisassembler::SoftFail;
+
+ Check(Result, AddThumbPredicate(MI));
+
+ if (isVPTOpcode(MI.getOpcode())) {
+ unsigned Mask = MI.getOperand(0).getImm();
+ VPTBlock.setVPTState(Mask);
+ }
+
+ return Result;
+ }
+
+ Result =
+ decodeInstruction(DecoderTableThumb32, MI, Insn32, Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ bool InITBlock = ITBlock.instrInITBlock();
+ Check(Result, AddThumbPredicate(MI));
+ AddThumb1SBit(MI, InITBlock);
+ return Result;
+ }
+
+ Result =
+ decodeInstruction(DecoderTableThumb232, MI, Insn32, Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ Check(Result, AddThumbPredicate(MI));
+ return checkDecodedInstruction(MI, Size, Address, CS, Insn32, Result);
+ }
+
+ if (fieldFromInstruction(Insn32, 28, 4) == 0xE) {
+ Result =
+ decodeInstruction(DecoderTableVFP32, MI, Insn32, Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ UpdateThumbVFPPredicate(Result, MI);
+ return Result;
+ }
+ }
+
+ Result =
+ decodeInstruction(DecoderTableVFPV832, MI, Insn32, Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ return Result;
+ }
+
+ if (fieldFromInstruction(Insn32, 28, 4) == 0xE) {
+ Result = decodeInstruction(DecoderTableNEONDup32, MI, Insn32, Address, this,
+ STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ Check(Result, AddThumbPredicate(MI));
+ return Result;
+ }
+ }
+
+ if (fieldFromInstruction(Insn32, 24, 8) == 0xF9) {
+ uint32_t NEONLdStInsn = Insn32;
+ NEONLdStInsn &= 0xF0FFFFFF;
+ NEONLdStInsn |= 0x04000000;
+ Result = decodeInstruction(DecoderTableNEONLoadStore32, MI, NEONLdStInsn,
+ Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ Check(Result, AddThumbPredicate(MI));
+ return Result;
+ }
+ }
+
+ if (fieldFromInstruction(Insn32, 24, 4) == 0xF) {
+ uint32_t NEONDataInsn = Insn32;
+ NEONDataInsn &= 0xF0FFFFFF; // Clear bits 27-24
+ NEONDataInsn |= (NEONDataInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
+ NEONDataInsn |= 0x12000000; // Set bits 28 and 25
+ Result = decodeInstruction(DecoderTableNEONData32, MI, NEONDataInsn,
+ Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ Check(Result, AddThumbPredicate(MI));
+ return Result;
+ }
+
+ uint32_t NEONCryptoInsn = Insn32;
+ NEONCryptoInsn &= 0xF0FFFFFF; // Clear bits 27-24
+ NEONCryptoInsn |= (NEONCryptoInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
+ NEONCryptoInsn |= 0x12000000; // Set bits 28 and 25
+ Result = decodeInstruction(DecoderTablev8Crypto32, MI, NEONCryptoInsn,
+ Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ return Result;
+ }
+
+ uint32_t NEONv8Insn = Insn32;
+ NEONv8Insn &= 0xF3FFFFFF; // Clear bits 27-26
+ Result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address,
+ this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ return Result;
+ }
+ }
+
+ uint32_t Coproc = fieldFromInstruction(Insn32, 8, 4);
+ const uint8_t *DecoderTable = ARM::isCDECoproc(Coproc, STI)
+ ? DecoderTableThumb2CDE32
+ : DecoderTableThumb2CoProc32;
+ Result =
+ decodeInstruction(DecoderTable, MI, Insn32, Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ Check(Result, AddThumbPredicate(MI));
+ return Result;
+ }
+
+ // Advance IT state to prevent next instruction inheriting
+ // the wrong IT state.
+ if (ITBlock.instrInITBlock())
+ ITBlock.advanceITState();
+ Size = 0;
+ return MCDisassembler::Fail;
+}
+
+static MCDisassembler *createARMDisassembler(const Target &T,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new ARMDisassembler(STI, Ctx, T.createMCInstrInfo());
+}
+
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeARMDisassembler() {
+ TargetRegistry::RegisterMCDisassembler(getTheARMLETarget(),
+ createARMDisassembler);
+ TargetRegistry::RegisterMCDisassembler(getTheARMBETarget(),
+ createARMDisassembler);
+ TargetRegistry::RegisterMCDisassembler(getTheThumbLETarget(),
+ createARMDisassembler);
+ TargetRegistry::RegisterMCDisassembler(getTheThumbBETarget(),
+ createARMDisassembler);
+}
diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index bb07d79c9374..50f4042102bf 100644
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -457,5 +457,4 @@ Pass *llvm::createMVETailPredicationPass() {
char MVETailPredication::ID = 0;
-INITIALIZE_PASS_BEGIN(MVETailPredication, DEBUG_TYPE, DESC, false, false)
-INITIALIZE_PASS_END(MVETailPredication, DEBUG_TYPE, DESC, false, false)
+INITIALIZE_PASS(MVETailPredication, DEBUG_TYPE, DESC, false, false)
diff --git a/llvm/lib/Target/ARM/README.txt b/llvm/lib/Target/ARM/README.txt
index def67cfae727..ff84e07fa084 100644
--- a/llvm/lib/Target/ARM/README.txt
+++ b/llvm/lib/Target/ARM/README.txt
@@ -697,22 +697,6 @@ target-neutral one.
//===---------------------------------------------------------------------===//
-Optimize unnecessary checks for zero with __builtin_clz/ctz. Those builtins
-are specified to be undefined at zero, so portable code must check for zero
-and handle it as a special case. That is unnecessary on ARM where those
-operations are implemented in a way that is well-defined for zero. For
-example:
-
-int f(int x) { return x ? __builtin_clz(x) : sizeof(int)*8; }
-
-should just be implemented with a CLZ instruction. Since there are other
-targets, e.g., PPC, that share this behavior, it would be best to implement
-this in a target-independent way: we should probably fold that (when using
-"undefined at zero" semantics) to set the "defined at zero" bit and have
-the code generator expand out the right code.
-
-//===---------------------------------------------------------------------===//
-
Clean up the test/MC/ARM files to have more robust register choices.
R0 should not be used as a register operand in the assembler tests as it's then
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index 8b254fafc438..e91441b12fe6 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -263,11 +263,14 @@ void Thumb2InstrInfo::expandLoadStackGuard(
const auto *GV = cast<GlobalValue>((*MI->memoperands_begin())->getValue());
const ARMSubtarget &Subtarget = MF.getSubtarget<ARMSubtarget>();
+ bool IsPIC = MF.getTarget().isPositionIndependent();
if (Subtarget.isTargetELF() && !GV->isDSOLocal())
expandLoadStackGuardBase(MI, ARM::t2LDRLIT_ga_pcrel, ARM::t2LDRi12);
else if (!Subtarget.useMovt())
- expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_abs, ARM::t2LDRi12);
- else if (MF.getTarget().isPositionIndependent())
+ expandLoadStackGuardBase(
+ MI, IsPIC ? ARM::t2LDRLIT_ga_pcrel : ARM::tLDRLIT_ga_abs,
+ ARM::t2LDRi12);
+ else if (IsPIC)
expandLoadStackGuardBase(MI, ARM::t2MOV_ga_pcrel, ARM::t2LDRi12);
else
expandLoadStackGuardBase(MI, ARM::t2MOVi32imm, ARM::t2LDRi12);
diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
index 0fb33cdcb62d..ad8f7d801843 100644
--- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
+++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -245,7 +245,7 @@ void AVRAsmPrinter::emitXXStructor(const DataLayout &DL, const Constant *CV) {
bool AVRAsmPrinter::doFinalization(Module &M) {
const TargetLoweringObjectFile &TLOF = getObjFileLowering();
const AVRTargetMachine &TM = (const AVRTargetMachine &)MMI->getTarget();
- const AVRSubtarget *SubTM = (const AVRSubtarget *)TM.getSubtargetImpl();
+ const AVRSubtarget *SubTM = TM.getSubtargetImpl();
bool NeedsCopyData = false;
bool NeedsClearBSS = false;
@@ -294,7 +294,7 @@ bool AVRAsmPrinter::doFinalization(Module &M) {
void AVRAsmPrinter::emitStartOfAsmFile(Module &M) {
const AVRTargetMachine &TM = (const AVRTargetMachine &)MMI->getTarget();
- const AVRSubtarget *SubTM = (const AVRSubtarget *)TM.getSubtargetImpl();
+ const AVRSubtarget *SubTM = TM.getSubtargetImpl();
if (!SubTM)
return;
diff --git a/llvm/lib/Target/AVR/AVRInstrFormats.td b/llvm/lib/Target/AVR/AVRInstrFormats.td
index e1e65b56370c..72ea3bc1f460 100644
--- a/llvm/lib/Target/AVR/AVRInstrFormats.td
+++ b/llvm/lib/Target/AVR/AVRInstrFormats.td
@@ -79,6 +79,7 @@ class FRdRr<bits<4> opcode, bits<2> f, dag outs, dag ins, string asmstr,
//===----------------------------------------------------------------------===//
class FZRd<bits<3> t, dag outs, dag ins, string asmstr, list<dag> pattern>
: AVRInst16<outs, ins, asmstr, pattern> {
+ bits<0> z;
bits<5> rd;
let Inst{15 - 12} = 0b1001;
@@ -127,8 +128,6 @@ class FRd<bits<4> opcode, bits<7> f, dag outs, dag ins, string asmstr,
let Inst{11 - 9} = f{6 - 4};
let Inst{8 - 4} = rd;
let Inst{3 - 0} = f{3 - 0};
-
- let DecoderMethod = "decodeFRd";
}
//===----------------------------------------------------------------------===//
@@ -200,57 +199,64 @@ class FSTLD<bit type, bits<2> mode, dag outs, dag ins, string asmstr,
//===---------------------------------------------------------------------===//
class FLPMX<bit e, bit p, dag outs, dag ins, string asmstr, list<dag> pattern>
: AVRInst16<outs, ins, asmstr, pattern> {
+ bits<0> z;
bits<5> rd;
- let Inst{15 - 12} = 0b1001;
-
- let Inst{11 - 9} = 0b000;
- let Inst{8} = rd{4};
-
- let Inst{7 - 4} = rd{3 - 0};
-
+ let Inst{15 - 9} = 0b1001000;
+ let Inst{8 - 4} = rd;
let Inst{3 - 2} = 0b01;
let Inst{1} = e;
let Inst{0} = p;
-
- let DecoderMethod = "decodeFLPMX";
}
//===----------------------------------------------------------------------===//
// MOVWRdRr special encoding: <|0000|0001|dddd|rrrr|>
// d = destination = 4 bits
// r = source = 4 bits
-// (Only accepts even registers)
+// (Only accepts register pairs)
//===----------------------------------------------------------------------===//
class FMOVWRdRr<dag outs, dag ins, string asmstr, list<dag> pattern>
: AVRInst16<outs, ins, asmstr, pattern> {
- bits<5> rd;
- bits<5> rr;
+ bits<4> rd;
+ bits<4> rr;
let Inst{15 - 8} = 0b00000001;
- let Inst{7 - 4} = rd{4 - 1};
- let Inst{3 - 0} = rr{4 - 1};
-
- let DecoderMethod = "decodeFMOVWRdRr";
+ let Inst{7 - 4} = rd;
+ let Inst{3 - 0} = rr;
}
//===----------------------------------------------------------------------===//
-// MULSrr special encoding: <|0000|0010|dddd|rrrr|>
+// MULS special encoding: <|0000|0010|dddd|rrrr|>
// d = multiplicand = 4 bits
// r = multiplier = 4 bits
// (Only accepts r16-r31)
//===----------------------------------------------------------------------===//
-class FMUL2RdRr<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
+class FMULSRdRr<dag outs, dag ins, string asmstr, list<dag> pattern>
: AVRInst16<outs, ins, asmstr, pattern> {
- bits<5> rd; // accept 5 bits but only encode the lower 4
- bits<5> rr; // accept 5 bits but only encode the lower 4
+ bits<4> rd;
+ bits<4> rr;
- let Inst{15 - 9} = 0b0000001;
- let Inst{8} = f;
- let Inst{7 - 4} = rd{3 - 0};
- let Inst{3 - 0} = rr{3 - 0};
+ let Inst{15 - 8} = 0b00000010;
+ let Inst{7 - 4} = rd;
+ let Inst{3 - 0} = rr;
+}
- let DecoderMethod = "decodeFMUL2RdRr";
+//===----------------------------------------------------------------------===//
+// MULSU special encoding: <|0000|0011|0ddd|0rrr|>
+// d = multiplicand = 3 bits
+// r = multiplier = 3 bits
+// (Only accepts r16-r23)
+//===----------------------------------------------------------------------===//
+class FMULSURdRr<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : AVRInst16<outs, ins, asmstr, pattern> {
+ bits<3> rd;
+ bits<3> rr;
+
+ let Inst{15 - 8} = 0b00000011;
+ let Inst{7} = 0;
+ let Inst{6 - 4} = rd;
+ let Inst{3} = 0;
+ let Inst{2 - 0} = rr;
}
// Special encoding for the FMUL family of instructions.
@@ -273,8 +279,6 @@ class FFMULRdRr<bits<2> f, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{6 - 4} = rd;
let Inst{3} = f{0};
let Inst{2 - 0} = rr;
-
- let DecoderMethod = "decodeFFMULRdRr";
}
//===----------------------------------------------------------------------===//
@@ -286,16 +290,14 @@ class FFMULRdRr<bits<2> f, dag outs, dag ins, string asmstr, list<dag> pattern>
//===----------------------------------------------------------------------===//
class FWRdK<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
: AVRInst16<outs, ins, asmstr, pattern> {
- bits<5> rd; // accept 5 bits but only encode bits 1 and 2
+ bits<2> rd;
bits<6> k;
let Inst{15 - 9} = 0b1001011;
let Inst{8} = f;
let Inst{7 - 6} = k{5 - 4};
- let Inst{5 - 4} = rd{2 - 1};
+ let Inst{5 - 4} = rd;
let Inst{3 - 0} = k{3 - 0};
-
- let DecoderMethod = "decodeFWRdK";
}
//===----------------------------------------------------------------------===//
@@ -313,8 +315,6 @@ class FIORdA<dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{10 - 9} = A{5 - 4};
let Inst{8 - 4} = rd;
let Inst{3 - 0} = A{3 - 0};
-
- let DecoderMethod = "decodeFIORdA";
}
//===----------------------------------------------------------------------===//
@@ -332,8 +332,6 @@ class FIOARr<dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{10 - 9} = A{5 - 4};
let Inst{8 - 4} = rr;
let Inst{3 - 0} = A{3 - 0};
-
- let DecoderMethod = "decodeFIOARr";
}
//===----------------------------------------------------------------------===//
@@ -348,17 +346,10 @@ class FIOBIT<bits<2> t, dag outs, dag ins, string asmstr, list<dag> pattern>
bits<5> addr;
bits<3> b;
- let Inst{15 - 12} = 0b1001;
-
- let Inst{11 - 10} = 0b10;
+ let Inst{15 - 10} = 0b100110;
let Inst{9 - 8} = t;
-
- let Inst{7 - 4} = addr{4 - 1};
-
- let Inst{3} = addr{0};
+ let Inst{7 - 3} = addr;
let Inst{2 - 0} = b{2 - 0};
-
- let DecoderMethod = "decodeFIOBIT";
}
//===----------------------------------------------------------------------===//
@@ -417,8 +408,6 @@ class FBRsk<bit f, bits<3> s, dag outs, dag ins, string asmstr,
let Inst{10} = f;
let Inst{9 - 3} = k;
let Inst{2 - 0} = s;
-
- let DecoderMethod = "decodeCondBranch";
}
//===----------------------------------------------------------------------===//
@@ -442,8 +431,6 @@ class FBRk<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{15 - 13} = 0b110;
let Inst{12} = f;
let Inst{11 - 0} = k;
-
- let DecoderMethod = "decodeFBRk";
}
//===----------------------------------------------------------------------===//
@@ -537,14 +524,8 @@ class FSK<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{11} = 0;
let Inst{10} = f;
- let Inst{9 - 8} = k{6 - 5};
-
- let Inst{7 - 4} = k{4 - 1};
-
- let Inst{3} = k{0};
+ let Inst{9 - 3} = k;
let Inst{2 - 0} = s;
-
- let DecoderMethod = "decodeCondBranch";
}
class ExtensionPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp
index 601068bf1793..ce9908597dca 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.cpp
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp
@@ -29,8 +29,8 @@
namespace llvm {
-AVRInstrInfo::AVRInstrInfo(AVRSubtarget &STI)
- : AVRGenInstrInfo(AVR::ADJCALLSTACKDOWN, AVR::ADJCALLSTACKUP), RI(),
+AVRInstrInfo::AVRInstrInfo(const AVRSubtarget &STI)
+ : AVRGenInstrInfo(STI, AVR::ADJCALLSTACKDOWN, AVR::ADJCALLSTACKUP), RI(),
STI(STI) {}
void AVRInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.h b/llvm/lib/Target/AVR/AVRInstrInfo.h
index 1c92f173d254..759aea201096 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.h
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.h
@@ -65,7 +65,7 @@ enum TOF {
/// Utilities related to the AVR instruction set.
class AVRInstrInfo : public AVRGenInstrInfo {
public:
- explicit AVRInstrInfo(AVRSubtarget &STI);
+ explicit AVRInstrInfo(const AVRSubtarget &STI);
const AVRRegisterInfo &getRegisterInfo() const { return RI; }
const MCInstrDesc &getBrCond(AVRCC::CondCodes CC) const;
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td
index 958e1383acef..02fb905f5fb6 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.td
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.td
@@ -204,16 +204,19 @@ def memspi : Operand<iPTR> {
def relbrtarget_7 : Operand<OtherVT> {
let PrintMethod = "printPCRelImm";
let EncoderMethod = "encodeRelCondBrTarget<AVR::fixup_7_pcrel>";
+ let DecoderMethod = "decodeRelCondBrTarget7";
}
def brtarget_13 : Operand<OtherVT> {
let PrintMethod = "printPCRelImm";
let EncoderMethod = "encodeRelCondBrTarget<AVR::fixup_13_pcrel>";
+ let DecoderMethod = "decodeRelCondBrTarget13";
}
def rcalltarget_13 : Operand<i16> {
let PrintMethod = "printPCRelImm";
let EncoderMethod = "encodeRelCondBrTarget<AVR::fixup_13_pcrel>";
+ let DecoderMethod = "decodeRelCondBrTarget13";
}
// The target of a 22 or 16-bit call/jmp instruction.
@@ -492,13 +495,13 @@ let isCommutable = 1, Defs = [R1, R0, SREG] in {
"mul\t$rd, $rr", []>,
Requires<[SupportsMultiplication]>;
- def MULSRdRr : FMUL2RdRr<0, (outs), (ins LD8:$rd, LD8:$rr),
+ def MULSRdRr : FMULSRdRr<(outs), (ins LD8:$rd, LD8:$rr),
"muls\t$rd, $rr", []>,
Requires<[SupportsMultiplication]>;
}
- def MULSURdRr : FMUL2RdRr<1, (outs), (ins LD8lo:$rd, LD8lo:$rr),
- "mulsu\t$rd, $rr", []>,
+ def MULSURdRr : FMULSURdRr<(outs), (ins LD8lo:$rd, LD8lo:$rr),
+ "mulsu\t$rd, $rr", []>,
Requires<[SupportsMultiplication]>;
def FMUL : FFMULRdRr<0b01, (outs), (ins LD8lo:$rd, LD8lo:$rr),
@@ -1230,7 +1233,9 @@ let Uses = [R1, R0] in {
let Defs = [R31R30] in
def SPMZPi : F16<0b1001010111111000, (outs), (ins ZREG:$z), "spm $z+", []>,
- Requires<[HasSPMX]>;
+ Requires<[HasSPMX]> {
+ bits<0> z;
+ }
}
// Read data from IO location operations.
diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.td b/llvm/lib/Target/AVR/AVRRegisterInfo.td
index 21b4aedea44c..182f92c684dc 100644
--- a/llvm/lib/Target/AVR/AVRRegisterInfo.td
+++ b/llvm/lib/Target/AVR/AVRRegisterInfo.td
@@ -68,33 +68,37 @@ def R31 : AVRReg<31, "r31", [], ["zh"]>, DwarfRegNum<[31]>;
def SPL : AVRReg<32, "SPL">, DwarfRegNum<[32]>;
def SPH : AVRReg<33, "SPH">, DwarfRegNum<[33]>;
+// 16 bit GPR pairs.
let SubRegIndices = [sub_lo, sub_hi], CoveredBySubRegs = 1 in {
- // 16 bit GPR pairs.
- def SP : AVRReg<32, "SP", [SPL, SPH]>, DwarfRegNum<[32]>;
+ // The value 16 for the encoding is arbitrary. SP register is not encoded
+ // into instructions, they use it implicitly depending on the opcode.
+ def SP : AVRReg<16, "SP", [SPL, SPH]>, DwarfRegNum<[32]>;
// The pointer registers (X,Y,Z) are a special case because they
// are printed as a `high:low` pair when a DREG is expected,
// but printed using `X`, `Y`, `Z` when a pointer register is expected.
+ // DREG registers are only used in ADIW, SBIW and MOVW instructions.
let RegAltNameIndices = [ptr] in {
- def R31R30 : AVRReg<30, "r31:r30", [R30, R31], ["Z"]>, DwarfRegNum<[30]>;
- def R29R28 : AVRReg<28, "r29:r28", [R28, R29], ["Y"]>, DwarfRegNum<[28]>;
- def R27R26 : AVRReg<26, "r27:r26", [R26, R27], ["X"]>, DwarfRegNum<[26]>;
+ def R31R30 : AVRReg<15, "r31:r30", [R30, R31], ["Z"]>, DwarfRegNum<[30]>;
+ def R29R28 : AVRReg<14, "r29:r28", [R28, R29], ["Y"]>, DwarfRegNum<[28]>;
+ def R27R26 : AVRReg<13, "r27:r26", [R26, R27], ["X"]>, DwarfRegNum<[26]>;
}
- def R25R24 : AVRReg<24, "r25:r24", [R24, R25]>, DwarfRegNum<[24]>;
- def R23R22 : AVRReg<22, "r23:r22", [R22, R23]>, DwarfRegNum<[22]>;
- def R21R20 : AVRReg<20, "r21:r20", [R20, R21]>, DwarfRegNum<[20]>;
- def R19R18 : AVRReg<18, "r19:r18", [R18, R19]>, DwarfRegNum<[18]>;
- def R17R16 : AVRReg<16, "r17:r16", [R16, R17]>, DwarfRegNum<[16]>;
- def R15R14 : AVRReg<14, "r15:r14", [R14, R15]>, DwarfRegNum<[14]>;
- def R13R12 : AVRReg<12, "r13:r12", [R12, R13]>, DwarfRegNum<[12]>;
- def R11R10 : AVRReg<10, "r11:r10", [R10, R11]>, DwarfRegNum<[10]>;
- def R9R8 : AVRReg<8, "r9:r8", [R8, R9]>, DwarfRegNum<[8]>;
- def R7R6 : AVRReg<6, "r7:r6", [R6, R7]>, DwarfRegNum<[6]>;
- def R5R4 : AVRReg<4, "r5:r4", [R4, R5]>, DwarfRegNum<[4]>;
- def R3R2 : AVRReg<2, "r3:r2", [R2, R3]>, DwarfRegNum<[2]>;
+ def R25R24 : AVRReg<12, "r25:r24", [R24, R25]>, DwarfRegNum<[24]>;
+ def R23R22 : AVRReg<11, "r23:r22", [R22, R23]>, DwarfRegNum<[22]>;
+ def R21R20 : AVRReg<10, "r21:r20", [R20, R21]>, DwarfRegNum<[20]>;
+ def R19R18 : AVRReg<9, "r19:r18", [R18, R19]>, DwarfRegNum<[18]>;
+ def R17R16 : AVRReg<8, "r17:r16", [R16, R17]>, DwarfRegNum<[16]>;
+ def R15R14 : AVRReg<7, "r15:r14", [R14, R15]>, DwarfRegNum<[14]>;
+ def R13R12 : AVRReg<6, "r13:r12", [R12, R13]>, DwarfRegNum<[12]>;
+ def R11R10 : AVRReg<5, "r11:r10", [R10, R11]>, DwarfRegNum<[10]>;
+ def R9R8 : AVRReg<4, "r9:r8", [R8, R9]>, DwarfRegNum<[8]>;
+ def R7R6 : AVRReg<3, "r7:r6", [R6, R7]>, DwarfRegNum<[6]>;
+ def R5R4 : AVRReg<2, "r5:r4", [R4, R5]>, DwarfRegNum<[4]>;
+ def R3R2 : AVRReg<1, "r3:r2", [R2, R3]>, DwarfRegNum<[2]>;
def R1R0 : AVRReg<0, "r1:r0", [R0, R1]>, DwarfRegNum<[0]>;
- // Pseudo registers for unaligned i16
+ // Pseudo registers for unaligned i16. These are only used in pseudo
+ // instructions, so encoding values are arbitrary.
def R26R25 : AVRReg<25, "r26:r25", [R25, R26]>, DwarfRegNum<[25]>;
def R24R23 : AVRReg<23, "r24:r23", [R23, R24]>, DwarfRegNum<[23]>;
def R22R21 : AVRReg<21, "r22:r21", [R21, R22]>, DwarfRegNum<[21]>;
diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
index 4e00b192b875..a8650146e988 100644
--- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
+++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
@@ -38,7 +38,6 @@ using namespace llvm;
namespace {
/// Parses AVR assembly from a stream.
class AVRAsmParser : public MCTargetAsmParser {
- const MCSubtargetInfo &STI;
MCAsmParser &Parser;
const MCRegisterInfo *MRI;
const std::string GENERATE_STUBS = "gs";
@@ -93,7 +92,7 @@ class AVRAsmParser : public MCTargetAsmParser {
public:
AVRAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
const MCInstrInfo &MII, const MCTargetOptions &Options)
- : MCTargetAsmParser(Options, STI, MII), STI(STI), Parser(Parser) {
+ : MCTargetAsmParser(Options, STI, MII), Parser(Parser) {
MCAsmParserExtension::Initialize(Parser);
MRI = getContext().getRegisterInfo();
@@ -318,7 +317,7 @@ bool AVRAsmParser::missingFeature(llvm::SMLoc const &Loc,
bool AVRAsmParser::emit(MCInst &Inst, SMLoc const &Loc, MCStreamer &Out) const {
Inst.setLoc(Loc);
- Out.emitInstruction(Inst, STI);
+ Out.emitInstruction(Inst, *STI);
return false;
}
@@ -411,7 +410,7 @@ bool AVRAsmParser::tryParseRegisterOperand(OperandVector &Operands) {
// Reject R0~R15 on avrtiny.
if (AVR::R0 <= Reg && Reg <= AVR::R15 &&
- STI.hasFeature(AVR::FeatureTinyEncoding))
+ STI->hasFeature(AVR::FeatureTinyEncoding))
return Error(Parser.getTok().getLoc(), "invalid register on avrtiny");
AsmToken const &T = Parser.getTok();
@@ -758,7 +757,7 @@ unsigned AVRAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
// Reject R0~R15 on avrtiny.
if (0 <= RegNum && RegNum <= 15 &&
- STI.hasFeature(AVR::FeatureTinyEncoding))
+ STI->hasFeature(AVR::FeatureTinyEncoding))
return Match_InvalidRegisterOnTiny;
std::ostringstream RegName;
diff --git a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
index 948588cb9a75..3a840a371497 100644
--- a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
+++ b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
@@ -61,7 +61,7 @@ LLVMInitializeAVRDisassembler() {
createAVRDisassembler);
}
-static const uint16_t GPRDecoderTable[] = {
+static constexpr MCRegister GPRDecoderTable[] = {
AVR::R0, AVR::R1, AVR::R2, AVR::R3, AVR::R4, AVR::R5, AVR::R6,
AVR::R7, AVR::R8, AVR::R9, AVR::R10, AVR::R11, AVR::R12, AVR::R13,
AVR::R14, AVR::R15, AVR::R16, AVR::R17, AVR::R18, AVR::R19, AVR::R20,
@@ -69,6 +69,13 @@ static const uint16_t GPRDecoderTable[] = {
AVR::R28, AVR::R29, AVR::R30, AVR::R31,
};
+static constexpr MCRegister GPRPairDecoderTable[] = {
+ AVR::R1R0, AVR::R3R2, AVR::R5R4, AVR::R7R6,
+ AVR::R9R8, AVR::R11R10, AVR::R13R12, AVR::R15R14,
+ AVR::R17R16, AVR::R19R18, AVR::R21R20, AVR::R23R22,
+ AVR::R25R24, AVR::R27R26, AVR::R29R28, AVR::R31R30,
+};
+
static DecodeStatus DecodeGPR8RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -83,96 +90,41 @@ static DecodeStatus DecodeGPR8RegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeLD8RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const MCDisassembler *Decoder) {
- if (RegNo > 15)
- return MCDisassembler::Fail;
-
- unsigned Register = GPRDecoderTable[RegNo + 16];
- Inst.addOperand(MCOperand::createReg(Register));
+ assert(isUInt<4>(RegNo));
+ // Only r16...r31 are legal.
+ Inst.addOperand(MCOperand::createReg(GPRDecoderTable[16 + RegNo]));
return MCDisassembler::Success;
}
-static DecodeStatus decodeFIOARr(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeFIORdA(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeFIOBIT(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeCallTarget(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeFRd(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeFLPMX(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeFFMULRdRr(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeFMOVWRdRr(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeFWRdK(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeFMUL2RdRr(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeMemri(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeFBRk(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeCondBranch(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeLoadStore(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-#include "AVRGenDisassemblerTables.inc"
+static DecodeStatus DecodeLD8loRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ assert(isUInt<3>(RegNo));
+ // Only r16...r23 are legal.
+ Inst.addOperand(MCOperand::createReg(GPRDecoderTable[16 + RegNo]));
+ return MCDisassembler::Success;
+}
-static DecodeStatus decodeFIOARr(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned addr = 0;
- addr |= fieldFromInstruction(Insn, 0, 4);
- addr |= fieldFromInstruction(Insn, 9, 2) << 4;
- unsigned reg = fieldFromInstruction(Insn, 4, 5);
- Inst.addOperand(MCOperand::createImm(addr));
- if (DecodeGPR8RegisterClass(Inst, reg, Address, Decoder) ==
- MCDisassembler::Fail)
- return MCDisassembler::Fail;
+static DecodeStatus DecodeDREGSRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ assert(isUInt<4>(RegNo));
+ Inst.addOperand(MCOperand::createReg(GPRPairDecoderTable[RegNo]));
return MCDisassembler::Success;
}
-static DecodeStatus decodeFIORdA(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned addr = 0;
- addr |= fieldFromInstruction(Insn, 0, 4);
- addr |= fieldFromInstruction(Insn, 9, 2) << 4;
- unsigned reg = fieldFromInstruction(Insn, 4, 5);
- if (DecodeGPR8RegisterClass(Inst, reg, Address, Decoder) ==
- MCDisassembler::Fail)
- return MCDisassembler::Fail;
- Inst.addOperand(MCOperand::createImm(addr));
+static DecodeStatus DecodeIWREGSRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ assert(isUInt<2>(RegNo));
+ // Only AVR::R25R24, AVR::R27R26, AVR::R29R28, AVR::R31R30 are legal.
+ Inst.addOperand(MCOperand::createReg(GPRPairDecoderTable[12 + RegNo]));
return MCDisassembler::Success;
}
-static DecodeStatus decodeFIOBIT(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned addr = fieldFromInstruction(Insn, 3, 5);
- unsigned b = fieldFromInstruction(Insn, 0, 3);
- Inst.addOperand(MCOperand::createImm(addr));
- Inst.addOperand(MCOperand::createImm(b));
+static DecodeStatus DecodeZREGRegisterClass(MCInst &Inst,
+ const MCDisassembler *Decoder) {
+ Inst.addOperand(MCOperand::createReg(AVR::R31R30));
return MCDisassembler::Success;
}
@@ -185,78 +137,19 @@ static DecodeStatus decodeCallTarget(MCInst &Inst, unsigned Field,
return MCDisassembler::Success;
}
-static DecodeStatus decodeFRd(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned d = fieldFromInstruction(Insn, 4, 5);
- if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) ==
- MCDisassembler::Fail)
- return MCDisassembler::Fail;
- return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeFLPMX(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder) {
- if (decodeFRd(Inst, Insn, Address, Decoder) == MCDisassembler::Fail)
- return MCDisassembler::Fail;
- Inst.addOperand(MCOperand::createReg(AVR::R31R30));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeFFMULRdRr(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned d = fieldFromInstruction(Insn, 4, 3) + 16;
- unsigned r = fieldFromInstruction(Insn, 0, 3) + 16;
- if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) ==
- MCDisassembler::Fail)
- return MCDisassembler::Fail;
- if (DecodeGPR8RegisterClass(Inst, r, Address, Decoder) ==
- MCDisassembler::Fail)
- return MCDisassembler::Fail;
- return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeFMOVWRdRr(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned r = fieldFromInstruction(Insn, 4, 4) * 2;
- unsigned d = fieldFromInstruction(Insn, 0, 4) * 2;
- if (DecodeGPR8RegisterClass(Inst, r, Address, Decoder) ==
- MCDisassembler::Fail)
- return MCDisassembler::Fail;
- if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) ==
- MCDisassembler::Fail)
- return MCDisassembler::Fail;
- return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeFWRdK(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned d = fieldFromInstruction(Insn, 4, 2) * 2 + 24; // starts at r24:r25
- unsigned k = 0;
- k |= fieldFromInstruction(Insn, 0, 4);
- k |= fieldFromInstruction(Insn, 6, 2) << 4;
- if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) ==
- MCDisassembler::Fail)
- return MCDisassembler::Fail;
- if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) ==
- MCDisassembler::Fail)
- return MCDisassembler::Fail;
- Inst.addOperand(MCOperand::createImm(k));
+static DecodeStatus decodeRelCondBrTarget7(MCInst &Inst, unsigned Field,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ // The legal range is [-128, 126] (in bytes).
+ Inst.addOperand(MCOperand::createImm(SignExtend32(Field, 7) * 2));
return MCDisassembler::Success;
}
-static DecodeStatus decodeFMUL2RdRr(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned rd = fieldFromInstruction(Insn, 4, 4) + 16;
- unsigned rr = fieldFromInstruction(Insn, 0, 4) + 16;
- if (DecodeGPR8RegisterClass(Inst, rd, Address, Decoder) ==
- MCDisassembler::Fail)
- return MCDisassembler::Fail;
- if (DecodeGPR8RegisterClass(Inst, rr, Address, Decoder) ==
- MCDisassembler::Fail)
- return MCDisassembler::Fail;
+static DecodeStatus decodeRelCondBrTarget13(MCInst &Inst, unsigned Field,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ // The legal range is [-4096, 4094] (in bytes).
+ Inst.addOperand(MCOperand::createImm(SignExtend32(Field, 12) * 2));
return MCDisassembler::Success;
}
@@ -277,59 +170,6 @@ static DecodeStatus decodeMemri(MCInst &Inst, unsigned Insn, uint64_t Address,
return MCDisassembler::Success;
}
-static DecodeStatus decodeFBRk(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder) {
- // Decode the opcode.
- switch (Insn & 0xf000) {
- case 0xc000:
- Inst.setOpcode(AVR::RJMPk);
- break;
- case 0xd000:
- Inst.setOpcode(AVR::RCALLk);
- break;
- default: // Unknown relative branch instruction.
- return MCDisassembler::Fail;
- }
- // Decode the relative offset.
- int16_t Offset = ((int16_t)((Insn & 0xfff) << 4)) >> 3;
- Inst.addOperand(MCOperand::createImm(Offset));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeCondBranch(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- // These 8 instructions are not defined as aliases of BRBS/BRBC.
- DenseMap<unsigned, unsigned> brInsts = {
- {0x000, AVR::BRLOk}, {0x400, AVR::BRSHk}, {0x001, AVR::BREQk},
- {0x401, AVR::BRNEk}, {0x002, AVR::BRMIk}, {0x402, AVR::BRPLk},
- {0x004, AVR::BRLTk}, {0x404, AVR::BRGEk}};
-
- // Get the relative offset.
- int16_t Offset = ((int16_t)((Insn & 0x3f8) << 6)) >> 8;
-
- // Search the instruction pattern.
- auto NotAlias = [&Insn](const std::pair<unsigned, unsigned> &I) {
- return (Insn & 0x407) != I.first;
- };
- llvm::partition(brInsts, NotAlias);
- auto It = llvm::partition_point(brInsts, NotAlias);
-
- // Decode the instruction.
- if (It != brInsts.end()) {
- // This instruction is not an alias of BRBC/BRBS.
- Inst.setOpcode(It->second);
- Inst.addOperand(MCOperand::createImm(Offset));
- } else {
- // Fall back to an ordinary BRBS/BRBC.
- Inst.setOpcode(Insn & 0x400 ? AVR::BRBCsk : AVR::BRBSsk);
- Inst.addOperand(MCOperand::createImm(Insn & 7));
- Inst.addOperand(MCOperand::createImm(Offset));
- }
-
- return MCDisassembler::Success;
-}
-
static DecodeStatus decodeLoadStore(MCInst &Inst, unsigned Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -435,6 +275,8 @@ static DecodeStatus decodeLoadStore(MCInst &Inst, unsigned Insn,
return MCDisassembler::Success;
}
+#include "AVRGenDisassemblerTables.inc"
+
static DecodeStatus readInstruction16(ArrayRef<uint8_t> Bytes, uint64_t Address,
uint64_t &Size, uint32_t &Insn) {
if (Bytes.size() < 2) {
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp
index 481219164a0f..5adffeed04bd 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp
@@ -101,23 +101,6 @@ const char *AVRInstPrinter::getPrettyRegisterName(MCRegister Reg,
void AVRInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
const MCOperandInfo &MOI = this->MII.get(MI->getOpcode()).operands()[OpNo];
- if (MOI.RegClass == AVR::ZREGRegClassID) {
- // Special case for the Z register, which sometimes doesn't have an operand
- // in the MCInst.
- O << "Z";
- return;
- }
-
- if (OpNo >= MI->size()) {
- // Not all operands are correctly disassembled at the moment. This means
- // that some machine instructions won't have all the necessary operands
- // set.
- // To avoid asserting, print <unknown> instead until the necessary support
- // has been implemented.
- O << "<unknown>";
- return;
- }
-
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isReg()) {
diff --git a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
index 352017e9b929..dadba52de462 100644
--- a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -193,27 +193,6 @@ void BPFDAGToDAGISel::Select(SDNode *Node) {
switch (Opcode) {
default:
break;
- case ISD::INTRINSIC_W_CHAIN: {
- unsigned IntNo = Node->getConstantOperandVal(1);
- switch (IntNo) {
- case Intrinsic::bpf_load_byte:
- case Intrinsic::bpf_load_half:
- case Intrinsic::bpf_load_word: {
- SDLoc DL(Node);
- SDValue Chain = Node->getOperand(0);
- SDValue N1 = Node->getOperand(1);
- SDValue Skb = Node->getOperand(2);
- SDValue N3 = Node->getOperand(3);
-
- SDValue R6Reg = CurDAG->getRegister(BPF::R6, MVT::i64);
- Chain = CurDAG->getCopyToReg(Chain, DL, R6Reg, Skb, SDValue());
- Node = CurDAG->UpdateNodeOperands(Node, Chain, N1, R6Reg, N3);
- break;
- }
- }
- break;
- }
-
case ISD::FrameIndex: {
int FI = cast<FrameIndexSDNode>(Node)->getIndex();
EVT VT = Node->getValueType(0);
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/llvm/lib/Target/BPF/BPFInstrInfo.cpp
index 70bc163615f6..fb4efcfe8614 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.cpp
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.cpp
@@ -12,6 +12,7 @@
#include "BPFInstrInfo.h"
#include "BPF.h"
+#include "BPFSubtarget.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -25,8 +26,8 @@
using namespace llvm;
-BPFInstrInfo::BPFInstrInfo()
- : BPFGenInstrInfo(BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {}
+BPFInstrInfo::BPFInstrInfo(const BPFSubtarget &STI)
+ : BPFGenInstrInfo(STI, BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {}
void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.h b/llvm/lib/Target/BPF/BPFInstrInfo.h
index d8bbad44e314..2359e43e483f 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.h
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.h
@@ -20,12 +20,13 @@
#include "BPFGenInstrInfo.inc"
namespace llvm {
+class BPFSubtarget;
class BPFInstrInfo : public BPFGenInstrInfo {
const BPFRegisterInfo RI;
public:
- BPFInstrInfo();
+ explicit BPFInstrInfo(const BPFSubtarget &STI);
const BPFRegisterInfo &getRegisterInfo() const { return RI; }
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td
index b21f1a0eee3b..de7dae2c8ca6 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -1189,10 +1189,9 @@ let Defs = [R0, R1, R2, R3, R4, R5], Uses = [R6], hasSideEffects = 1,
hasExtraDefRegAllocReq = 1, hasExtraSrcRegAllocReq = 1, mayLoad = 1 in {
class LOAD_ABS<BPFWidthModifer SizeOp, string OpcodeStr, Intrinsic OpNode>
: TYPE_LD_ST<BPF_ABS.Value, SizeOp.Value,
- (outs),
- (ins GPR:$skb, i64imm:$imm),
+ (outs), (ins i64imm:$imm),
"r0 = *("#OpcodeStr#" *)skb[$imm]",
- [(set R0, (OpNode GPR:$skb, i64immSExt32:$imm))]> {
+ [(set R0, (OpNode R6, i64immSExt32:$imm))]> {
bits<32> imm;
let Inst{31-0} = imm;
@@ -1201,10 +1200,9 @@ class LOAD_ABS<BPFWidthModifer SizeOp, string OpcodeStr, Intrinsic OpNode>
class LOAD_IND<BPFWidthModifer SizeOp, string OpcodeStr, Intrinsic OpNode>
: TYPE_LD_ST<BPF_IND.Value, SizeOp.Value,
- (outs),
- (ins GPR:$skb, GPR:$val),
+ (outs), (ins GPR:$val),
"r0 = *("#OpcodeStr#" *)skb[$val]",
- [(set R0, (OpNode GPR:$skb, GPR:$val))]> {
+ [(set R0, (OpNode R6, GPR:$val))]> {
bits<4> val;
let Inst{55-52} = val;
diff --git a/llvm/lib/Target/BPF/BPFSubtarget.cpp b/llvm/lib/Target/BPF/BPFSubtarget.cpp
index 4167547680b1..a7ecc39fad7b 100644
--- a/llvm/lib/Target/BPF/BPFSubtarget.cpp
+++ b/llvm/lib/Target/BPF/BPFSubtarget.cpp
@@ -103,7 +103,7 @@ void BPFSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
BPFSubtarget::BPFSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const TargetMachine &TM)
: BPFGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
- FrameLowering(initializeSubtargetDependencies(CPU, FS)),
+ InstrInfo(initializeSubtargetDependencies(CPU, FS)), FrameLowering(*this),
TLInfo(TM, *this) {
IsLittleEndian = TT.isLittleEndian();
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index bed6bc98b167..ba4b48990c64 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -235,7 +235,7 @@ void BTFTypeEnum64::completeType(BTFDebug &BDebug) {
BTFEnum.NameOff = BDebug.addString(Enum->getName());
uint64_t Value;
if (Enum->isUnsigned())
- Value = static_cast<uint64_t>(Enum->getValue().getZExtValue());
+ Value = Enum->getValue().getZExtValue();
else
Value = static_cast<uint64_t>(Enum->getValue().getSExtValue());
BTFEnum.Val_Lo32 = Value;
diff --git a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
index b5bb1c08c564..230cf3b0ddbe 100644
--- a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
+++ b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
@@ -205,18 +205,6 @@ DecodeStatus BPFDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
Op.setImm(Make_64(Hi, Op.getImm()));
break;
}
- case BPF::LD_ABS_B:
- case BPF::LD_ABS_H:
- case BPF::LD_ABS_W:
- case BPF::LD_IND_B:
- case BPF::LD_IND_H:
- case BPF::LD_IND_W: {
- auto Op = Instr.getOperand(0);
- Instr.clear();
- Instr.addOperand(MCOperand::createReg(BPF::R6));
- Instr.addOperand(Op);
- break;
- }
}
return Result;
diff --git a/llvm/lib/Target/CSKY/CSKYInstrFormats.td b/llvm/lib/Target/CSKY/CSKYInstrFormats.td
index 5296d282c689..abf5cac0013d 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrFormats.td
+++ b/llvm/lib/Target/CSKY/CSKYInstrFormats.td
@@ -168,7 +168,9 @@ class I_16_RET<bits<5> sop, bits<5> pcode, string op, list<dag> pattern>
// Instructions(3): cmpnei32, cmphsi32, cmplti32
class I_16_X<bits<5> sop, string op, Operand operand>
: CSKY32Inst<AddrModeNone, 0x3a, (outs CARRY:$ca),
- (ins GPR:$rx, operand:$imm16), !strconcat(op, "\t$rx, $imm16"), []> {
+ (ins GPR:$rx, operand:$imm16),
+ !strconcat(op, "\t$rx, $imm16"), []> {
+ bits<0> ca;
bits<16> imm16;
bits<5> rx;
let Inst{25 - 21} = sop;
@@ -263,8 +265,9 @@ class I_12_PP<bits<5> sop, bits<5> pcode, dag outs, dag ins, string op>
class I_5_ZX<bits<6> sop, bits<5> pcode, string op, ImmLeaf ImmType,
list<dag> pattern>
: CSKY32Inst<AddrModeNone, 0x31, (outs GPR:$rz),
- (ins CARRY:$cond, GPR:$false, GPR:$rx, ImmType:$imm5),
- !strconcat(op, "\t$rz, $rx, $imm5"), pattern> {
+ (ins CARRY:$cond, GPR:$false, GPR:$rx, ImmType:$imm5),
+ !strconcat(op, "\t$rz, $rx, $imm5"), pattern> {
+ bits<0> cond;
bits<5> rz;
bits<5> rx;
bits<5> imm5;
@@ -469,9 +472,10 @@ class I_5_XZ_UZ<bits<6> sop, bits<5> lsb, bits<5> msb, string op, int v>
// Instructions(1): btsti32
class I_5_X<bits<6> sop, bits<5> pcode, string op, ImmLeaf ImmType,
list<dag> pattern>
- : CSKY32Inst<AddrModeNone, 0x31,
- (outs CARRY:$ca), (ins GPR:$rx, ImmType:$imm5),
- !strconcat(op, "\t$rx, $imm5"), pattern> {
+ : CSKY32Inst<AddrModeNone, 0x31, (outs CARRY:$ca),
+ (ins GPR:$rx, ImmType:$imm5),
+ !strconcat(op, "\t$rx, $imm5"), pattern> {
+ bits<0> ca;
bits<5> imm5;
bits<5> rx;
let Inst{25 - 21} = imm5;
@@ -581,9 +585,9 @@ class R_XXZ<bits<6> sop, bits<5> pcode, dag outs, dag ins, string op,
// Format< OP[6] | RY[5] | RX[5] | SOP[6] | PCODE[5] | 00000[5] >
// Instructions:(4) cmpne32, cmphs32, cmplt32, tst32
class R_YX<bits<6> sop, bits<5> pcode, string op>
- : CSKY32Inst<AddrModeNone, 0x31, (outs CARRY:$ca),
- (ins GPR:$rx, GPR:$ry),
+ : CSKY32Inst<AddrModeNone, 0x31, (outs CARRY:$ca), (ins GPR:$rx, GPR:$ry),
!strconcat(op, "\t$rx, $ry"), []> {
+ bits<0> ca;
bits<5> ry;
bits<5> rx;
let Inst{25 - 21} = ry;
@@ -642,8 +646,9 @@ class R_X<bits<6> sop, bits<5> pcode, dag outs, dag ins, string op, list<dag> pa
// Format< OP[6] | 00000[5] | 00000[5] | SOP[6] | PCODE[5] | RZ[5] >
// Instructions:(2) mvc32, mvcv32
class R_Z_1<bits<6> sop, bits<5> pcode, string op>
- : CSKY32Inst<AddrModeNone, 0x31, (outs GPR:$rz),
- (ins CARRY:$ca), !strconcat(op, "\t$rz"), []> {
+ : CSKY32Inst<AddrModeNone, 0x31, (outs GPR:$rz), (ins CARRY:$ca),
+ !strconcat(op, "\t$rz"), []> {
+ bits<0> ca;
bits<5> rz;
let Inst{25 - 21} = 0;
let Inst{20 - 16} = 0;
@@ -656,7 +661,8 @@ class R_Z_1<bits<6> sop, bits<5> pcode, string op>
// Instructions:(2) clrf32, clrt32
class R_Z_2<bits<6> sop, bits<5> pcode, string op>
: CSKY32Inst<AddrModeNone, 0x31, (outs GPR:$rz),
- (ins CARRY:$ca, GPR:$false), !strconcat(op, "\t$rz"), []> {
+ (ins CARRY:$ca, GPR:$false), !strconcat(op, "\t$rz"), []> {
+ bits<0> ca;
bits<5> rz;
let Inst{25 - 21} = rz;
let Inst{20 - 16} = 0;
diff --git a/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td b/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td
index ea0761d97545..5cd970d27d47 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td
+++ b/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td
@@ -16,8 +16,9 @@ class J16<bits<5> sop, string opstr, dag ins>
}
class J16_B<bits<5> sop, string opstr>
- : CSKY16Inst<AddrModeNone, (outs), (ins CARRY:$ca, br_symbol_16bit:$offset),
- !strconcat(opstr, "\t$offset"), []> {
+ : CSKY16Inst<AddrModeNone, (outs), (ins CARRY:$ca, br_symbol_16bit:$offset),
+ !strconcat(opstr, "\t$offset"), []> {
+ bits<0> ca;
bits<10> offset;
let Inst{15} = 0;
let Inst{14 - 10} = sop;
@@ -66,6 +67,8 @@ class R16_XZ_BINOP_NOPat<bits<4> op, bits<2> sop, string opstr> : CSKY16Inst<
class R16_XZ_BINOP_C<bits<4> op, bits<2> sop, string opstr> : CSKY16Inst<
AddrModeNone, (outs sGPR:$rz, CARRY:$cout),
(ins sGPR:$rZ, sGPR:$rx, CARRY:$cin), !strconcat(opstr, "\t$rz, $rx"), []> {
+ bits<0> cout;
+ bits<0> cin;
bits<4> rz;
bits<4> rx;
let Inst{15, 14} = 0b01;
@@ -101,9 +104,10 @@ class R16_Z_UNOP<bits<4> op, bits<2> sop, string opstr> : CSKY16Inst<
let Constraints = "$rz = $rx";
}
-class R16_XY_CMP<bits<2> sop, string opstr> : CSKY16Inst<
- AddrModeNone, (outs CARRY:$ca), (ins sGPR:$rx, sGPR:$ry), !strconcat(opstr, "\t$rx, $ry"),
- []> {
+class R16_XY_CMP<bits<2> sop, string opstr>
+ : CSKY16Inst<AddrModeNone, (outs CARRY:$ca), (ins sGPR:$rx, sGPR:$ry),
+ !strconcat(opstr, "\t$rx, $ry"), []> {
+ bits<0> ca;
bits<4> ry;
bits<4> rx;
let Inst{15, 14} = 0b01;
@@ -145,9 +149,11 @@ class I16_Z_5<bits<3> sop, dag outs, dag ins,string opstr>
let Inst{4 - 0} = imm5;
}
-class I16_X_CMP<bits<3> sop, string opstr, Operand Immoperand> : CSKY16Inst<
- AddrModeNone, (outs CARRY:$ca), (ins mGPR:$rx, Immoperand:$imm5),
- !strconcat(opstr, "\t$rx, $imm5"), []> {
+class I16_X_CMP<bits<3> sop, string opstr, Operand Immoperand>
+ : CSKY16Inst<AddrModeNone, (outs CARRY:$ca),
+ (ins mGPR:$rx, Immoperand:$imm5),
+ !strconcat(opstr, "\t$rx, $imm5"), []> {
+ bits<0> ca;
bits<3> rx;
bits<5> imm5;
let Inst{15, 14} = 0b00;
@@ -158,9 +164,12 @@ class I16_X_CMP<bits<3> sop, string opstr, Operand Immoperand> : CSKY16Inst<
let isCompare = 1;
}
-class I16_SP_IMM7<bits<3> sop, string opstr> : CSKY16Inst<
- AddrModeNone, (outs GPRSP:$sp2), (ins GPRSP:$sp1, uimm7_2:$imm7),
- !strconcat(opstr, "\t$sp2, $sp1, $imm7"), []> {
+class I16_SP_IMM7<bits<3> sop, string opstr>
+ : CSKY16Inst<AddrModeNone, (outs GPRSP:$sp2),
+ (ins GPRSP:$sp1, uimm7_2:$imm7),
+ !strconcat(opstr, "\t$sp2, $sp1, $imm7"), []> {
+ bits<0> sp2;
+ bits<0> sp1;
bits<7> imm7;
let Inst{15, 14} = 0b00;
let Inst{13 - 10} = 0b0101;
diff --git a/llvm/lib/Target/CSKY/CSKYInstrFormatsF1.td b/llvm/lib/Target/CSKY/CSKYInstrFormatsF1.td
index 446670a4d0a9..a40874b054d8 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrFormatsF1.td
+++ b/llvm/lib/Target/CSKY/CSKYInstrFormatsF1.td
@@ -91,15 +91,21 @@ multiclass FT_XZ<bits<6> sop, string op, PatFrag opnode> {
}
let vrz = 0, isCompare = 1 in {
-class F_CMPXY<bits<5> datatype, bits<6> sop, string op, string op_su, RegisterOperand regtype>
- : F_XYZ_BASE<datatype, sop, (outs CARRY:$ca), (ins regtype:$vrx, regtype:$vry), !strconcat(op#op_su, "\t$vrx, $vry"),
- []>;
-
-let vry = 0 in{
-class F_CMPZX<bits<5> datatype, bits<6> sop, string op, string op_su, RegisterOperand regtype>
- : F_XYZ_BASE<datatype, sop, (outs CARRY:$ca), (ins regtype:$vrx), !strconcat(op#op_su, "\t$vrx"),
- []>;
-}
+ class F_CMPXY<bits<5> datatype, bits<6> sop, string op, string op_su,
+ RegisterOperand regtype>
+ : F_XYZ_BASE<datatype, sop, (outs CARRY:$ca),
+ (ins regtype:$vrx, regtype:$vry),
+ !strconcat(op#op_su, "\t$vrx, $vry"), []> {
+ bits<0> ca;
+ }
+
+ let vry = 0 in
+ class F_CMPZX<bits<5> datatype, bits<6> sop, string op, string op_su,
+ RegisterOperand regtype>
+ : F_XYZ_BASE<datatype, sop, (outs CARRY:$ca), (ins regtype:$vrx),
+ !strconcat(op#op_su, "\t$vrx"), []> {
+ bits<0> ca;
+ }
}
class F_XYZ<bits<5> datatype, bits<6> sop, string op, string op_su, PatFrag opnode, RegisterOperand regtype>
diff --git a/llvm/lib/Target/CSKY/CSKYInstrFormatsF2.td b/llvm/lib/Target/CSKY/CSKYInstrFormatsF2.td
index 641ad623f140..bd7c554565cd 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrFormatsF2.td
+++ b/llvm/lib/Target/CSKY/CSKYInstrFormatsF2.td
@@ -91,8 +91,9 @@ multiclass F2_XZ_SET_T<bits<6> sop, string op, string suffix = ""> {
let vrz = 0, isCompare = 1 in
class F2_CXY<bits<5> datatype, RegisterOperand regtype, bits<6> sop, string op>
: F2_XYZ<datatype, sop, !strconcat(op, "\t$vrx, $vry"),
- (outs CARRY:$ca), (ins regtype:$vrx, regtype:$vry),
- []>;
+ (outs CARRY:$ca), (ins regtype:$vrx, regtype:$vry), []> {
+ bits<0> ca;
+}
multiclass F2_CXY_T<bits<6> sop, string op> {
def _S : F2_CXY<0b00000, FPR32Op, sop, op#".32">;
@@ -103,9 +104,10 @@ multiclass F2_CXY_T<bits<6> sop, string op> {
let vrz = 0, vry = 0, isCompare = 1 in
class F2_CX<bits<5> datatype, RegisterOperand regtype, bits<6> sop, string op>
- : F2_XYZ<datatype, sop, !strconcat(op, "\t$vrx"),
- (outs CARRY:$ca), (ins regtype:$vrx),
- []>;
+ : F2_XYZ<datatype, sop, !strconcat(op, "\t$vrx"), (outs CARRY:$ca),
+ (ins regtype:$vrx), []> {
+ bits<0> ca;
+}
multiclass F2_CX_T<bits<6> sop, string op> {
def _S : F2_CX<0b00000, FPR32Op, sop, op#".32">;
@@ -183,7 +185,10 @@ class F2_LDSTR_D<bits<1> sop, string op, dag outs, dag ins>
class F2_CXYZ<bits<5> datatype, RegisterOperand regtype, bits<6> sop, string op>
: F2_XYZ<datatype, sop, !strconcat(op, "\t$vrz, $vrx, $vry"),
(outs regtype:$vrz), (ins CARRY:$ca, regtype:$vrx, regtype:$vry),
- []>;
+ []> {
+ bits<0> ca;
+}
+
multiclass F2_CXYZ_T<bits<6> sop, string op> {
def _S : F2_CXYZ<0b00000, FPR32Op, sop, op#".32">;
let Predicates = [HasFPUv3_DF] in
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
index ccb3f16394d4..619a797be6dc 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
@@ -24,8 +24,9 @@ using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
#include "CSKYGenInstrInfo.inc"
-CSKYInstrInfo::CSKYInstrInfo(CSKYSubtarget &STI)
- : CSKYGenInstrInfo(CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP), STI(STI) {
+CSKYInstrInfo::CSKYInstrInfo(const CSKYSubtarget &STI)
+ : CSKYGenInstrInfo(STI, CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP),
+ STI(STI) {
v2sf = STI.hasFPUv2SingleFloat();
v2df = STI.hasFPUv2DoubleFloat();
v3sf = STI.hasFPUv3SingleFloat();
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.h b/llvm/lib/Target/CSKY/CSKYInstrInfo.h
index 98f583e8b405..6451c0af14fc 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrInfo.h
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.h
@@ -33,7 +33,7 @@ protected:
const CSKYSubtarget &STI;
public:
- explicit CSKYInstrInfo(CSKYSubtarget &STI);
+ explicit CSKYInstrInfo(const CSKYSubtarget &STI);
Register isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.td b/llvm/lib/Target/CSKY/CSKYInstrInfo.td
index c6bfc2495ae2..82e271e5b556 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrInfo.td
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.td
@@ -586,14 +586,23 @@ let Predicates = [iHasE2] in {
BinOpFrag<(rotl node:$LHS, (and node:$RHS, 0x1f))>, "rotl32">;
def BMASKI32 : I_5_Z<0b010100, 0x1, "bmaski32", oimm5, []>;
- def LSLC32 : I_5_XZ<0x13, 0x1, "lslc32",
- (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, oimm5:$imm5), []>;
- def LSRC32 : I_5_XZ<0x13, 0x2, "lsrc32",
- (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, oimm5:$imm5), []>;
- def ASRC32 : I_5_XZ<0x13, 0x4, "asrc32",
- (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, oimm5:$imm5), []>;
- def XSR32 : I_5_XZ<0x13, 0x8, "xsr32",
- (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, oimm5:$imm5, CARRY:$cin), []>;
+ def LSLC32 : I_5_XZ<0x13, 0x1, "lslc32", (outs GPR:$rz, CARRY:$cout),
+ (ins GPR:$rx, oimm5:$imm5), []> {
+ bits<0> cout;
+ }
+ def LSRC32 : I_5_XZ<0x13, 0x2, "lsrc32", (outs GPR:$rz, CARRY:$cout),
+ (ins GPR:$rx, oimm5:$imm5), []> {
+ bits<0> cout;
+ }
+ def ASRC32 : I_5_XZ<0x13, 0x4, "asrc32", (outs GPR:$rz, CARRY:$cout),
+ (ins GPR:$rx, oimm5:$imm5), []> {
+ bits<0> cout;
+ }
+ def XSR32 : I_5_XZ<0x13, 0x8, "xsr32", (outs GPR:$rz, CARRY:$cout),
+ (ins GPR:$rx, oimm5:$imm5, CARRY:$cin), []> {
+ bits<0> cout;
+ bits<0> cin;
+ }
def IXH32 : R_YXZ_SP_F1<0x2, 0x1,
BinOpFrag<(add node:$LHS, (shl node:$RHS, (i32 1)))>, "ixh32">;
@@ -605,9 +614,15 @@ let Predicates = [iHasE2] in {
let isCommutable = 1, isAdd = 1 in
def ADDC32 : R_YXZ<0x31, 0x0, 0x2, (outs GPR:$rz, CARRY:$cout),
- (ins GPR:$rx, GPR:$ry, CARRY:$cin), "addc32", []>;
+ (ins GPR:$rx, GPR:$ry, CARRY:$cin), "addc32", []> {
+ bits<0> cout;
+ bits<0> cin;
+ }
def SUBC32 : R_YXZ<0x31, 0x0, 0x8, (outs GPR:$rz, CARRY:$cout),
- (ins GPR:$rx, GPR:$ry, CARRY:$cin), "subc32", []>;
+ (ins GPR:$rx, GPR:$ry, CARRY:$cin), "subc32", []> {
+ bits<0> cout;
+ bits<0> cin;
+ }
def INCF32 : I_5_ZX<0x3, 0x1, "incf32", uimm5, []>;
def INCT32 : I_5_ZX<0x3, 0x2, "inct32", uimm5, []>;
@@ -621,12 +636,18 @@ let Predicates = [iHas2E3] in {
def DIVU32 : R_YXZ_SP_F1<0x20, 0x1,
BinOpFrag<(udiv node:$LHS, node:$RHS)>, "divu32">;
- def DECGT32 : I_5_XZ<0x4, 0x1, "decgt32",
- (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, uimm5:$imm5), []>;
- def DECLT32 : I_5_XZ<0x4, 0x2, "declt32",
- (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, uimm5:$imm5), []>;
- def DECNE32 : I_5_XZ<0x4, 0x4, "decne32",
- (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, uimm5:$imm5), []>;
+ def DECGT32 : I_5_XZ<0x4, 0x1, "decgt32", (outs GPR:$rz, CARRY:$cout),
+ (ins GPR:$rx, uimm5:$imm5), []> {
+ bits<0> cout;
+ }
+ def DECLT32 : I_5_XZ<0x4, 0x2, "declt32", (outs GPR:$rz, CARRY:$cout),
+ (ins GPR:$rx, uimm5:$imm5), []> {
+ bits<0> cout;
+ }
+ def DECNE32 : I_5_XZ<0x4, 0x4, "decne32", (outs GPR:$rz, CARRY:$cout),
+ (ins GPR:$rx, uimm5:$imm5), []> {
+ bits<0> cout;
+ }
def SEXT32 : I_5_XZ_U<0x16, (outs GPR:$rz), (ins GPR:$rx, uimm5:$msb, uimm5:$lsb), "sext32", []>;
let isCodeGenOnly = 1 in {
@@ -744,8 +765,9 @@ let Predicates = [iHas2E3] in {
def CMPHS32 : R_YX<0x1, 0x1, "cmphs32">;
def CMPLT32 : R_YX<0x1, 0x2, "cmplt32">;
- def SETC32 : CSKY32Inst<AddrModeNone, 0x31,
- (outs CARRY:$ca), (ins), "setc32", []> {
+ def SETC32 : CSKY32Inst<AddrModeNone, 0x31, (outs CARRY:$ca), (ins), "setc32",
+ []> {
+ bits<0> ca;
let Inst{25 - 21} = 0; //rx
let Inst{20 - 16} = 0; //ry
let Inst{15 - 10} = 0x1;
@@ -753,8 +775,9 @@ let Predicates = [iHas2E3] in {
let Inst{4 - 0} = 0;
let isCompare = 1;
}
- def CLRC32 : CSKY32Inst<AddrModeNone, 0x31,
- (outs CARRY:$ca), (ins), "clrc32", []> {
+ def CLRC32 : CSKY32Inst<AddrModeNone, 0x31, (outs CARRY:$ca), (ins), "clrc32",
+ []> {
+ bits<0> ca;
let Inst{25 - 21} = 0; //rx
let Inst{20 - 16} = 0; //ry
let Inst{15 - 10} = 0x1;
@@ -764,8 +787,10 @@ let Predicates = [iHas2E3] in {
}
def TST32 : R_YX<0x8, 0x4, "tst32">;
- def TSTNBZ32 : R_X<0x8, 0x8,
- (outs CARRY:$ca), (ins GPR:$rx), "tstnbz32", []>;
+ def TSTNBZ32 : R_X<0x8, 0x8, (outs CARRY:$ca), (ins GPR:$rx), "tstnbz32",
+ []> {
+ bits<0> ca;
+ }
}
//===----------------------------------------------------------------------===//
@@ -806,9 +831,14 @@ let isBranch = 1, isTerminator = 1 in {
[(br bb:$imm16)]>;
def BT32 : I_16_L<0x3, (outs), (ins CARRY:$ca, br_symbol:$imm16),
- "bt32\t$imm16", [(brcond CARRY:$ca, bb:$imm16)]>, Requires<[iHasE2]>;
+ "bt32\t$imm16", [(brcond CARRY:$ca, bb:$imm16)]>,
+ Requires<[iHasE2]> {
+ bits<0> ca;
+ }
def BF32 : I_16_L<0x2, (outs), (ins CARRY:$ca, br_symbol:$imm16),
- "bf32\t$imm16", []>, Requires<[iHasE2]>;
+ "bf32\t$imm16", []>, Requires<[iHasE2]> {
+ bits<0> ca;
+ }
}
let Predicates = [iHas2E3] in {
@@ -1030,7 +1060,10 @@ def SE32 : I_5_XZ_PRIVI<0b010110, 0x1, "se32">;
def WSC32 : I_5_XZ_PRIVI<0b001111, 0x1, "wsc32">;
def CPOP32 : I_CPOP<(outs), (ins uimm5:$cpid, uimm20:$usdef), "cpop32 <$cpid, ${usdef}>">;
-def CPRC32 : I_CP<0b0100, (outs CARRY:$ca), (ins uimm5:$cpid, uimm12:$usdef), "cprc32 <$cpid, ${usdef}>">;
+def CPRC32 : I_CP<0b0100, (outs CARRY:$ca), (ins uimm5:$cpid, uimm12:$usdef),
+ "cprc32 <$cpid, ${usdef}>"> {
+ bits<0> ca;
+}
def CPRCR32 : I_CP_Z<0b0010, (outs GPR:$rz), (ins uimm5:$cpid, uimm12:$usdef), "cprcr32 $rz, <$cpid, ${usdef}>">;
def CPRGR32 : I_CP_Z<0b0000, (outs GPR:$rz), (ins uimm5:$cpid, uimm12:$usdef), "cprgr32 $rz, <$cpid, ${usdef}>">;
def CPWCR32 : I_CP_Z<0b0011, (outs), (ins GPR:$rz, uimm5:$cpid, uimm12:$usdef), "cpwcr32 $rz, <$cpid, ${usdef}>">;
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td b/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td
index 3e248019d73f..51645215f32a 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td
@@ -102,7 +102,9 @@ def : Pat<(add GPR:$rs1, (oimm8_neg:$im)),
let isAdd = 1 in
def ADDI16ZSP : I16_Z_8<0b011, (ins GPRSP:$sp, uimm8_2:$imm8),
- "addi16\t$rz, $sp, $imm8">;
+ "addi16\t$rz, $sp, $imm8"> {
+ bits<0> sp;
+}
let isAdd = 1 in
def ADDI16SPSP : I16_SP_IMM7<0b000,"addi16">;
@@ -142,10 +144,14 @@ def ST16H : I16_XZ_LDST<AddrMode16H, 0b101, "st16.h",
def ST16W : I16_XZ_LDST<AddrMode16W, 0b110, "st16.w",
(outs), (ins mGPR:$rz, mGPR:$rx, uimm5_2:$imm)>;
-def LD16WSP : I16_ZSP_LDST<AddrMode16W, 0b011, "ld16.w",
- (outs mGPR:$rz), (ins GPRSP:$sp, uimm8_2:$addr)>;
-def ST16WSP : I16_ZSP_LDST<AddrMode16W, 0b111, "st16.w",
- (outs), (ins mGPR:$rz, GPRSP:$sp, uimm8_2:$addr)>;
+def LD16WSP : I16_ZSP_LDST<AddrMode16W, 0b011, "ld16.w", (outs mGPR:$rz),
+ (ins GPRSP:$sp, uimm8_2:$addr)> {
+ bits<0> sp;
+}
+def ST16WSP : I16_ZSP_LDST<AddrMode16W, 0b111, "st16.w", (outs),
+ (ins mGPR:$rz, GPRSP:$sp, uimm8_2:$addr)> {
+ bits<0> sp;
+}
//===----------------------------------------------------------------------===//
// Compare instructions.
@@ -187,8 +193,9 @@ def MOV16 : CSKY16Inst<AddrModeNone, (outs sGPR:$rz), (ins sGPR:$rx),
}
// MVC16 is not in "cskyv2 instructions reference manul"
-def MVCV16 : CSKY16Inst<AddrModeNone,
- (outs sGPR:$rz), (ins CARRY:$ca), "mvcv16\t$rz", []> {
+def MVCV16 : CSKY16Inst<AddrModeNone, (outs sGPR:$rz), (ins CARRY:$ca),
+ "mvcv16\t$rz", []> {
+ bits<0> ca;
bits<4> rz;
let Inst{15,14} = 0b01;
let Inst{13 - 10} = 0b1001;
@@ -317,11 +324,14 @@ let Constraints = "$rZ = $rz" in {
}
let Predicates = [HasBTST16] in
- def BTSTI16 : I16_Z_5<0b110, (outs CARRY:$ca), (ins mGPR:$rz, uimm5:$imm5),
- "btsti16">;
+def BTSTI16 : I16_Z_5<0b110, (outs CARRY:$ca), (ins mGPR:$rz, uimm5:$imm5),
+ "btsti16"> {
+ bits<0> ca;
+}
def TST16 : CSKY16Inst<AddrModeNone, (outs CARRY:$ca), (ins sGPR:$rx, sGPR:$ry),
- "tst16\t$rx, $ry", []> {
+ "tst16\t$rx, $ry", []> {
+ bits<0> ca;
bits<4> ry;
bits<4> rx;
let Inst{15,14} = 0b01;
@@ -334,6 +344,7 @@ def TST16 : CSKY16Inst<AddrModeNone, (outs CARRY:$ca), (ins sGPR:$rx, sGPR:$ry),
def TSTNBZ16 : CSKY16Inst<AddrModeNone, (outs CARRY:$ca), (ins sGPR:$rx),
"tstnbz16\t$rx", []> {
+ bits<0> ca;
bits<4> rx;
let Inst{15,14} = 0b01;
let Inst{13 - 10} = 0b1010;
diff --git a/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp b/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp
index 749127f4ddc8..887e28127953 100644
--- a/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp
+++ b/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp
@@ -36,8 +36,6 @@ class CSKYDisassembler : public MCDisassembler {
std::unique_ptr<MCInstrInfo const> const MCII;
mutable StringRef symbolName;
- DecodeStatus handleCROperand(MCInst &Instr) const;
-
public:
CSKYDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
MCInstrInfo const *MCII);
@@ -198,15 +196,9 @@ static DecodeStatus DecodemGPRRegisterClass(MCInst &Inst, uint64_t RegNo,
return MCDisassembler::Success;
}
-// TODO
-LLVM_ATTRIBUTE_UNUSED
-static DecodeStatus DecodeGPRSPRegisterClass(MCInst &Inst, uint64_t RegNo,
- uint64_t Address,
+static DecodeStatus DecodeGPRSPRegisterClass(MCInst &Inst,
const MCDisassembler *Decoder) {
- if (RegNo != 14)
- return MCDisassembler::Fail;
-
- Inst.addOperand(MCOperand::createReg(GPRDecoderTable[RegNo]));
+ Inst.addOperand(MCOperand::createReg(CSKY::R14));
return MCDisassembler::Success;
}
@@ -224,6 +216,12 @@ static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, uint64_t RegNo,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeCARRYRegisterClass(MCInst &Inst,
+ const MCDisassembler *Decoder) {
+ Inst.addOperand(MCOperand::createReg(CSKY::C));
+ return MCDisassembler::Success;
+}
+
template <unsigned N, unsigned S>
static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm,
int64_t Address,
@@ -378,121 +376,6 @@ static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm,
#include "CSKYGenDisassemblerTables.inc"
-DecodeStatus CSKYDisassembler::handleCROperand(MCInst &MI) const {
-
- // FIXME: To query instruction info from td file or a table inc file
- switch (MI.getOpcode()) {
- default:
- return MCDisassembler::Success;
- case CSKY::LD16WSP:
- case CSKY::ST16WSP:
- case CSKY::ADDI16ZSP:
- MI.insert(std::next(MI.begin()), MCOperand::createReg(CSKY::R14));
- return MCDisassembler::Success;
- case CSKY::ADDI16SPSP:
- case CSKY::SUBI16SPSP:
- MI.insert(MI.begin(), MCOperand::createReg(CSKY::R14));
- MI.insert(MI.begin(), MCOperand::createReg(CSKY::R14));
- return MCDisassembler::Success;
- case CSKY::FCMPHS_S:
- case CSKY::FCMPHS_D:
- case CSKY::FCMPLT_S:
- case CSKY::FCMPLT_D:
- case CSKY::FCMPNE_S:
- case CSKY::FCMPNE_D:
- case CSKY::FCMPUO_S:
- case CSKY::FCMPUO_D:
- case CSKY::FCMPZHS_S:
- case CSKY::FCMPZHS_D:
- case CSKY::FCMPZLS_S:
- case CSKY::FCMPZLS_D:
- case CSKY::FCMPZNE_S:
- case CSKY::FCMPZNE_D:
- case CSKY::FCMPZUO_S:
- case CSKY::FCMPZUO_D:
- case CSKY::f2FCMPHS_S:
- case CSKY::f2FCMPHS_D:
- case CSKY::f2FCMPLT_S:
- case CSKY::f2FCMPLT_D:
- case CSKY::f2FCMPNE_S:
- case CSKY::f2FCMPNE_D:
- case CSKY::f2FCMPUO_S:
- case CSKY::f2FCMPUO_D:
- case CSKY::f2FCMPHSZ_S:
- case CSKY::f2FCMPHSZ_D:
- case CSKY::f2FCMPHZ_S:
- case CSKY::f2FCMPHZ_D:
- case CSKY::f2FCMPLSZ_S:
- case CSKY::f2FCMPLSZ_D:
- case CSKY::f2FCMPLTZ_S:
- case CSKY::f2FCMPLTZ_D:
- case CSKY::f2FCMPNEZ_S:
- case CSKY::f2FCMPNEZ_D:
- case CSKY::f2FCMPUOZ_S:
- case CSKY::f2FCMPUOZ_D:
-
- case CSKY::BT32:
- case CSKY::BF32:
- case CSKY::BT16:
- case CSKY::BF16:
- case CSKY::CMPNEI32:
- case CSKY::CMPNEI16:
- case CSKY::CMPNE32:
- case CSKY::CMPNE16:
- case CSKY::CMPHSI32:
- case CSKY::CMPHSI16:
- case CSKY::CMPHS32:
- case CSKY::CMPHS16:
- case CSKY::CMPLTI32:
- case CSKY::CMPLTI16:
- case CSKY::CMPLT32:
- case CSKY::CMPLT16:
- case CSKY::BTSTI32:
- case CSKY::BTSTI16:
- case CSKY::TSTNBZ32:
- case CSKY::TSTNBZ16:
- case CSKY::TST32:
- case CSKY::TST16:
- MI.insert(MI.begin(), MCOperand::createReg(CSKY::C));
- return MCDisassembler::Success;
- case CSKY::LSLC32:
- case CSKY::LSRC32:
- case CSKY::ASRC32:
- MI.insert(std::next(MI.begin()), MCOperand::createReg(CSKY::C));
- return MCDisassembler::Success;
- case CSKY::MOVF32:
- case CSKY::MOVT32:
- case CSKY::MVC32:
- case CSKY::MVCV32:
- case CSKY::MVCV16:
- case CSKY::INCT32:
- case CSKY::INCF32:
- case CSKY::DECT32:
- case CSKY::DECF32:
- case CSKY::DECGT32:
- case CSKY::DECLT32:
- case CSKY::DECNE32:
- case CSKY::CLRF32:
- case CSKY::CLRT32:
- case CSKY::f2FSEL_S:
- case CSKY::f2FSEL_D:
- MI.insert(std::next(MI.begin()), MCOperand::createReg(CSKY::C));
- return MCDisassembler::Success;
- case CSKY::ADDC32:
- case CSKY::ADDC16:
- case CSKY::SUBC32:
- case CSKY::SUBC16:
- case CSKY::XSR32:
- MI.insert(std::next(MI.begin()), MCOperand::createReg(CSKY::C));
- MI.insert(MI.end(), MCOperand::createReg(CSKY::C));
- return MCDisassembler::Success;
- case CSKY::INS32:
- MI.getOperand(3).setImm(MI.getOperand(3).getImm() +
- MI.getOperand(4).getImm());
- return MCDisassembler::Success;
- }
-}
-
static bool decodeFPUV3Instruction(MCInst &MI, uint32_t insn, uint64_t Address,
const MCDisassembler *DisAsm,
const MCSubtargetInfo &STI) {
@@ -548,7 +431,10 @@ DecodeStatus CSKYDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Size = 2;
}
- handleCROperand(MI);
+ if (MI.getOpcode() == CSKY::INS32) {
+ MI.getOperand(3).setImm(MI.getOperand(3).getImm() +
+ MI.getOperand(4).getImm());
+ }
return Result;
}
diff --git a/llvm/lib/Target/DirectX/CMakeLists.txt b/llvm/lib/Target/DirectX/CMakeLists.txt
index 8100f941c8d9..6c079517e22d 100644
--- a/llvm/lib/Target/DirectX/CMakeLists.txt
+++ b/llvm/lib/Target/DirectX/CMakeLists.txt
@@ -41,6 +41,7 @@ add_llvm_target(DirectXCodeGen
LINK_COMPONENTS
Analysis
AsmPrinter
+ BinaryFormat
CodeGen
CodeGenTypes
Core
diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
index a1ef2578f00a..ca81d30473c0 100644
--- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
+++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
@@ -158,12 +158,15 @@ void DXContainerGlobals::addRootSignature(Module &M,
if (MMI.ShaderProfile == llvm::Triple::Library)
return;
- assert(MMI.EntryPropertyVec.size() == 1);
-
auto &RSA = getAnalysis<RootSignatureAnalysisWrapper>().getRSInfo();
- const Function *EntryFunction = MMI.EntryPropertyVec[0].Entry;
- const mcdxbc::RootSignatureDesc *RS = RSA.getDescForFunction(EntryFunction);
+ const Function *EntryFunction = nullptr;
+ if (MMI.ShaderProfile != llvm::Triple::RootSignature) {
+ assert(MMI.EntryPropertyVec.size() == 1);
+ EntryFunction = MMI.EntryPropertyVec[0].Entry;
+ }
+
+ const mcdxbc::RootSignatureDesc *RS = RSA.getDescForFunction(EntryFunction);
if (!RS)
return;
@@ -258,7 +261,8 @@ void DXContainerGlobals::addPipelineStateValidationInfo(
dxil::ModuleMetadataInfo &MMI =
getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata();
assert(MMI.EntryPropertyVec.size() == 1 ||
- MMI.ShaderProfile == Triple::Library);
+ MMI.ShaderProfile == Triple::Library ||
+ MMI.ShaderProfile == Triple::RootSignature);
PSV.BaseData.ShaderStage =
static_cast<uint8_t>(MMI.ShaderProfile - Triple::Pixel);
@@ -279,7 +283,8 @@ void DXContainerGlobals::addPipelineStateValidationInfo(
break;
}
- if (MMI.ShaderProfile != Triple::Library)
+ if (MMI.ShaderProfile != Triple::Library &&
+ MMI.ShaderProfile != Triple::RootSignature)
PSV.EntryName = MMI.EntryPropertyVec[0].Entry->getName();
PSV.finalize(MMI.ShaderProfile);
diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
index feecfc0880e2..d507d71b99fc 100644
--- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
+++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
@@ -343,9 +343,7 @@ bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
GOp->replaceAllUsesWith(NewGEP);
- if (auto *CE = dyn_cast<ConstantExpr>(GOp))
- CE->destroyConstant();
- else if (auto *OldGEPI = dyn_cast<GetElementPtrInst>(GOp))
+ if (auto *OldGEPI = dyn_cast<GetElementPtrInst>(GOp))
OldGEPI->eraseFromParent();
return true;
diff --git a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp
index 13e3408815bb..aa16e795dc76 100644
--- a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp
+++ b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp
@@ -22,11 +22,13 @@ static bool finalizeLinkage(Module &M) {
// Convert private globals and external globals with no usage to internal
// linkage.
- for (GlobalVariable &GV : M.globals())
+ for (GlobalVariable &GV : M.globals()) {
+ GV.removeDeadConstantUsers();
if (GV.hasPrivateLinkage() || (GV.hasExternalLinkage() && GV.use_empty())) {
GV.setLinkage(GlobalValue::InternalLinkage);
MadeChange = true;
}
+ }
SmallVector<Function *> Funcs;
diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
index ee1db54446cb..e2469d8df957 100644
--- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -51,6 +51,150 @@ static bool resourceAccessNeeds64BitExpansion(Module *M, Type *OverloadTy,
return ScalarTy->isDoubleTy() || ScalarTy->isIntegerTy(64);
}
+static Value *expand16BitIsInf(CallInst *Orig) {
+ Module *M = Orig->getModule();
+ if (M->getTargetTriple().getDXILVersion() >= VersionTuple(1, 9))
+ return nullptr;
+
+ Value *Val = Orig->getOperand(0);
+ Type *ValTy = Val->getType();
+ if (!ValTy->getScalarType()->isHalfTy())
+ return nullptr;
+
+ IRBuilder<> Builder(Orig);
+ Type *IType = Type::getInt16Ty(M->getContext());
+ Constant *PosInf =
+ ValTy->isVectorTy()
+ ? ConstantVector::getSplat(
+ ElementCount::getFixed(
+ cast<FixedVectorType>(ValTy)->getNumElements()),
+ ConstantInt::get(IType, 0x7c00))
+ : ConstantInt::get(IType, 0x7c00);
+
+ Constant *NegInf =
+ ValTy->isVectorTy()
+ ? ConstantVector::getSplat(
+ ElementCount::getFixed(
+ cast<FixedVectorType>(ValTy)->getNumElements()),
+ ConstantInt::get(IType, 0xfc00))
+ : ConstantInt::get(IType, 0xfc00);
+
+ Value *IVal = Builder.CreateBitCast(Val, PosInf->getType());
+ Value *B1 = Builder.CreateICmpEQ(IVal, PosInf);
+ Value *B2 = Builder.CreateICmpEQ(IVal, NegInf);
+ Value *B3 = Builder.CreateOr(B1, B2);
+ return B3;
+}
+
+static Value *expand16BitIsNaN(CallInst *Orig) {
+ Module *M = Orig->getModule();
+ if (M->getTargetTriple().getDXILVersion() >= VersionTuple(1, 9))
+ return nullptr;
+
+ Value *Val = Orig->getOperand(0);
+ Type *ValTy = Val->getType();
+ if (!ValTy->getScalarType()->isHalfTy())
+ return nullptr;
+
+ IRBuilder<> Builder(Orig);
+ Type *IType = Type::getInt16Ty(M->getContext());
+
+ Constant *ExpBitMask =
+ ValTy->isVectorTy()
+ ? ConstantVector::getSplat(
+ ElementCount::getFixed(
+ cast<FixedVectorType>(ValTy)->getNumElements()),
+ ConstantInt::get(IType, 0x7c00))
+ : ConstantInt::get(IType, 0x7c00);
+ Constant *SigBitMask =
+ ValTy->isVectorTy()
+ ? ConstantVector::getSplat(
+ ElementCount::getFixed(
+ cast<FixedVectorType>(ValTy)->getNumElements()),
+ ConstantInt::get(IType, 0x3ff))
+ : ConstantInt::get(IType, 0x3ff);
+
+ Constant *Zero =
+ ValTy->isVectorTy()
+ ? ConstantVector::getSplat(
+ ElementCount::getFixed(
+ cast<FixedVectorType>(ValTy)->getNumElements()),
+ ConstantInt::get(IType, 0))
+ : ConstantInt::get(IType, 0);
+
+ Value *IVal = Builder.CreateBitCast(Val, ExpBitMask->getType());
+ Value *Exp = Builder.CreateAnd(IVal, ExpBitMask);
+ Value *B1 = Builder.CreateICmpEQ(Exp, ExpBitMask);
+
+ Value *Sig = Builder.CreateAnd(IVal, SigBitMask);
+ Value *B2 = Builder.CreateICmpNE(Sig, Zero);
+ Value *B3 = Builder.CreateAnd(B1, B2);
+ return B3;
+}
+
+static Value *expand16BitIsFinite(CallInst *Orig) {
+ Module *M = Orig->getModule();
+ if (M->getTargetTriple().getDXILVersion() >= VersionTuple(1, 9))
+ return nullptr;
+
+ Value *Val = Orig->getOperand(0);
+ Type *ValTy = Val->getType();
+ if (!ValTy->getScalarType()->isHalfTy())
+ return nullptr;
+
+ IRBuilder<> Builder(Orig);
+ Type *IType = Type::getInt16Ty(M->getContext());
+
+ Constant *ExpBitMask =
+ ValTy->isVectorTy()
+ ? ConstantVector::getSplat(
+ ElementCount::getFixed(
+ cast<FixedVectorType>(ValTy)->getNumElements()),
+ ConstantInt::get(IType, 0x7c00))
+ : ConstantInt::get(IType, 0x7c00);
+
+ Value *IVal = Builder.CreateBitCast(Val, ExpBitMask->getType());
+ Value *Exp = Builder.CreateAnd(IVal, ExpBitMask);
+ Value *B1 = Builder.CreateICmpNE(Exp, ExpBitMask);
+ return B1;
+}
+
+static Value *expand16BitIsNormal(CallInst *Orig) {
+ Module *M = Orig->getModule();
+ if (M->getTargetTriple().getDXILVersion() >= VersionTuple(1, 9))
+ return nullptr;
+
+ Value *Val = Orig->getOperand(0);
+ Type *ValTy = Val->getType();
+ if (!ValTy->getScalarType()->isHalfTy())
+ return nullptr;
+
+ IRBuilder<> Builder(Orig);
+ Type *IType = Type::getInt16Ty(M->getContext());
+
+ Constant *ExpBitMask =
+ ValTy->isVectorTy()
+ ? ConstantVector::getSplat(
+ ElementCount::getFixed(
+ cast<FixedVectorType>(ValTy)->getNumElements()),
+ ConstantInt::get(IType, 0x7c00))
+ : ConstantInt::get(IType, 0x7c00);
+ Constant *Zero =
+ ValTy->isVectorTy()
+ ? ConstantVector::getSplat(
+ ElementCount::getFixed(
+ cast<FixedVectorType>(ValTy)->getNumElements()),
+ ConstantInt::get(IType, 0))
+ : ConstantInt::get(IType, 0);
+
+ Value *IVal = Builder.CreateBitCast(Val, ExpBitMask->getType());
+ Value *Exp = Builder.CreateAnd(IVal, ExpBitMask);
+ Value *NotAllZeroes = Builder.CreateICmpNE(Exp, Zero);
+ Value *NotAllOnes = Builder.CreateICmpNE(Exp, ExpBitMask);
+ Value *B1 = Builder.CreateAnd(NotAllZeroes, NotAllOnes);
+ return B1;
+}
+
static bool isIntrinsicExpansion(Function &F) {
switch (F.getIntrinsicID()) {
case Intrinsic::abs:
@@ -68,6 +212,7 @@ static bool isIntrinsicExpansion(Function &F) {
case Intrinsic::dx_sclamp:
case Intrinsic::dx_nclamp:
case Intrinsic::dx_degrees:
+ case Intrinsic::dx_isinf:
case Intrinsic::dx_lerp:
case Intrinsic::dx_normalize:
case Intrinsic::dx_fdot:
@@ -301,13 +446,16 @@ static Value *expandIsFPClass(CallInst *Orig) {
auto *TCI = dyn_cast<ConstantInt>(T);
// These FPClassTest cases have DXIL opcodes, so they will be handled in
- // DXIL Op Lowering instead.
+ // DXIL Op Lowering instead for all non f16 cases.
switch (TCI->getZExtValue()) {
case FPClassTest::fcInf:
+ return expand16BitIsInf(Orig);
case FPClassTest::fcNan:
+ return expand16BitIsNaN(Orig);
case FPClassTest::fcNormal:
+ return expand16BitIsNormal(Orig);
case FPClassTest::fcFinite:
- return nullptr;
+ return expand16BitIsFinite(Orig);
}
IRBuilder<> Builder(Orig);
@@ -873,6 +1021,9 @@ static bool expandIntrinsic(Function &F, CallInst *Orig) {
case Intrinsic::dx_degrees:
Result = expandDegreesIntrinsic(Orig);
break;
+ case Intrinsic::dx_isinf:
+ Result = expand16BitIsInf(Orig);
+ break;
case Intrinsic::dx_lerp:
Result = expandLerpIntrinsic(Orig);
break;
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index bd421771e8ed..577b4624458b 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -220,7 +220,7 @@ public:
removeResourceGlobals(CI);
- auto *NameGlobal = dyn_cast<llvm::GlobalVariable>(CI->getArgOperand(5));
+ auto *NameGlobal = dyn_cast<llvm::GlobalVariable>(CI->getArgOperand(4));
CI->replaceAllUsesWith(Replacement);
CI->eraseFromParent();
@@ -233,6 +233,7 @@ public:
IRBuilder<> &IRB = OpBuilder.getIRB();
Type *Int8Ty = IRB.getInt8Ty();
Type *Int32Ty = IRB.getInt32Ty();
+ Type *Int1Ty = IRB.getInt1Ty();
return replaceFunction(F, [&](CallInst *CI) -> Error {
IRB.SetInsertPoint(CI);
@@ -249,10 +250,13 @@ public:
IndexOp = IRB.CreateAdd(IndexOp,
ConstantInt::get(Int32Ty, Binding.LowerBound));
+ // FIXME: The last argument is a NonUniform flag which needs to be set
+ // based on resource analysis.
+ // https://github.com/llvm/llvm-project/issues/155701
std::array<Value *, 4> Args{
ConstantInt::get(Int8Ty, llvm::to_underlying(RC)),
ConstantInt::get(Int32Ty, Binding.RecordID), IndexOp,
- CI->getArgOperand(4)};
+ ConstantInt::get(Int1Ty, false)};
Expected<CallInst *> OpCall =
OpBuilder.tryCreateOp(OpCode::CreateHandle, Args, CI->getName());
if (Error E = OpCall.takeError())
@@ -267,6 +271,7 @@ public:
[[nodiscard]] bool lowerToBindAndAnnotateHandle(Function &F) {
IRBuilder<> &IRB = OpBuilder.getIRB();
Type *Int32Ty = IRB.getInt32Ty();
+ Type *Int1Ty = IRB.getInt1Ty();
return replaceFunction(F, [&](CallInst *CI) -> Error {
IRB.SetInsertPoint(CI);
@@ -295,7 +300,11 @@ public:
: Binding.LowerBound + Binding.Size - 1;
Constant *ResBind = OpBuilder.getResBind(Binding.LowerBound, UpperBound,
Binding.Space, RC);
- std::array<Value *, 3> BindArgs{ResBind, IndexOp, CI->getArgOperand(4)};
+ // FIXME: The last argument is a NonUniform flag which needs to be set
+ // based on resource analysis.
+ // https://github.com/llvm/llvm-project/issues/155701
+ Constant *NonUniform = ConstantInt::get(Int1Ty, false);
+ std::array<Value *, 3> BindArgs{ResBind, IndexOp, NonUniform};
Expected<CallInst *> OpBind = OpBuilder.tryCreateOp(
OpCode::CreateHandleFromBinding, BindArgs, CI->getName());
if (Error E = OpBind.takeError())
diff --git a/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp b/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp
index be2c7d1ddff3..d02f4b9f7ebc 100644
--- a/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp
+++ b/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp
@@ -25,21 +25,6 @@
using namespace llvm;
using namespace llvm::dxil;
-static ResourceClass toResourceClass(dxbc::DescriptorRangeType RangeType) {
- using namespace dxbc;
- switch (RangeType) {
- case DescriptorRangeType::SRV:
- return ResourceClass::SRV;
- case DescriptorRangeType::UAV:
- return ResourceClass::UAV;
- case DescriptorRangeType::CBV:
- return ResourceClass::CBuffer;
- case DescriptorRangeType::Sampler:
- return ResourceClass::Sampler;
- }
- llvm_unreachable("Unknown DescriptorRangeType");
-}
-
static ResourceClass toResourceClass(dxbc::RootParameterType Type) {
using namespace dxbc;
switch (Type) {
@@ -95,7 +80,7 @@ static void reportOverlappingError(Module &M, ResourceInfo R1,
}
static void reportOverlappingBinding(Module &M, DXILResourceMap &DRM) {
- bool ErrorFound = false;
+ [[maybe_unused]] bool ErrorFound = false;
for (const auto &ResList :
{DRM.srvs(), DRM.uavs(), DRM.cbuffers(), DRM.samplers()}) {
if (ResList.empty())
@@ -118,10 +103,8 @@ static void reportOverlappingBinding(Module &M, DXILResourceMap &DRM) {
"true, yet no overlapping binding was found");
}
-static void
-reportOverlappingRegisters(Module &M,
- const llvm::hlsl::BindingInfoBuilder::Binding &R1,
- const llvm::hlsl::BindingInfoBuilder::Binding &R2) {
+static void reportOverlappingRegisters(Module &M, const llvm::hlsl::Binding &R1,
+ const llvm::hlsl::Binding &R2) {
SmallString<128> Message;
raw_svector_ostream OS(Message);
@@ -133,6 +116,17 @@ reportOverlappingRegisters(Module &M,
M.getContext().diagnose(DiagnosticInfoGeneric(Message));
}
+/// Emit a generic diagnostic on \p M's context stating that a register has no
+/// binding in the root signature. The message names the resource class, the
+/// lower bound of \p Unbound and its register space.
+static void
+reportRegNotBound(Module &M, ResourceClass Class,
+                  const llvm::dxil::ResourceInfo::ResourceBinding &Unbound) {
+  SmallString<128> Text;
+  raw_svector_ostream Stream(Text);
+  Stream << getResourceClassName(Class);
+  Stream << " register " << Unbound.LowerBound;
+  Stream << " in space " << Unbound.Space;
+  Stream << " does not have a binding in the Root Signature";
+  M.getContext().diagnose(DiagnosticInfoGeneric(Text));
+}
+
static dxbc::ShaderVisibility
tripleToVisibility(llvm::Triple::EnvironmentType ET) {
switch (ET) {
@@ -157,22 +151,23 @@ tripleToVisibility(llvm::Triple::EnvironmentType ET) {
static void validateRootSignature(Module &M,
const mcdxbc::RootSignatureDesc &RSD,
- dxil::ModuleMetadataInfo &MMI) {
+ dxil::ModuleMetadataInfo &MMI,
+ DXILResourceMap &DRM,
+ DXILResourceTypeMap &DRTM) {
hlsl::BindingInfoBuilder Builder;
dxbc::ShaderVisibility Visibility = tripleToVisibility(MMI.ShaderProfile);
for (const mcdxbc::RootParameterInfo &ParamInfo : RSD.ParametersContainer) {
dxbc::ShaderVisibility ParamVisibility =
- static_cast<dxbc::ShaderVisibility>(ParamInfo.Header.ShaderVisibility);
+ dxbc::ShaderVisibility(ParamInfo.Visibility);
if (ParamVisibility != dxbc::ShaderVisibility::All &&
ParamVisibility != Visibility)
continue;
- dxbc::RootParameterType ParamType =
- static_cast<dxbc::RootParameterType>(ParamInfo.Header.ParameterType);
+ dxbc::RootParameterType ParamType = dxbc::RootParameterType(ParamInfo.Type);
switch (ParamType) {
case dxbc::RootParameterType::Constants32Bit: {
- dxbc::RTS0::v1::RootConstants Const =
+ mcdxbc::RootConstants Const =
RSD.ParametersContainer.getConstant(ParamInfo.Location);
Builder.trackBinding(dxil::ResourceClass::CBuffer, Const.RegisterSpace,
Const.ShaderRegister, Const.ShaderRegister,
@@ -183,12 +178,11 @@ static void validateRootSignature(Module &M,
case dxbc::RootParameterType::SRV:
case dxbc::RootParameterType::UAV:
case dxbc::RootParameterType::CBV: {
- dxbc::RTS0::v2::RootDescriptor Desc =
+ mcdxbc::RootDescriptor Desc =
RSD.ParametersContainer.getRootDescriptor(ParamInfo.Location);
- Builder.trackBinding(toResourceClass(static_cast<dxbc::RootParameterType>(
- ParamInfo.Header.ParameterType)),
- Desc.RegisterSpace, Desc.ShaderRegister,
- Desc.ShaderRegister, &ParamInfo);
+ Builder.trackBinding(toResourceClass(ParamInfo.Type), Desc.RegisterSpace,
+ Desc.ShaderRegister, Desc.ShaderRegister,
+ &ParamInfo);
break;
}
@@ -196,16 +190,13 @@ static void validateRootSignature(Module &M,
const mcdxbc::DescriptorTable &Table =
RSD.ParametersContainer.getDescriptorTable(ParamInfo.Location);
- for (const dxbc::RTS0::v2::DescriptorRange &Range : Table.Ranges) {
+ for (const mcdxbc::DescriptorRange &Range : Table.Ranges) {
uint32_t UpperBound =
Range.NumDescriptors == ~0U
? Range.BaseShaderRegister
: Range.BaseShaderRegister + Range.NumDescriptors - 1;
- Builder.trackBinding(
- toResourceClass(
- static_cast<dxbc::DescriptorRangeType>(Range.RangeType)),
- Range.RegisterSpace, Range.BaseShaderRegister, UpperBound,
- &ParamInfo);
+ Builder.trackBinding(Range.RangeType, Range.RegisterSpace,
+ Range.BaseShaderRegister, UpperBound, &ParamInfo);
}
break;
}
@@ -218,11 +209,19 @@ static void validateRootSignature(Module &M,
Builder.calculateBindingInfo(
[&M](const llvm::hlsl::BindingInfoBuilder &Builder,
- const llvm::hlsl::BindingInfoBuilder::Binding &ReportedBinding) {
- const llvm::hlsl::BindingInfoBuilder::Binding &Overlaping =
+ const llvm::hlsl::Binding &ReportedBinding) {
+ const llvm::hlsl::Binding &Overlaping =
Builder.findOverlapping(ReportedBinding);
reportOverlappingRegisters(M, ReportedBinding, Overlaping);
});
+ const hlsl::BoundRegs &BoundRegs = Builder.takeBoundRegs();
+ for (const ResourceInfo &RI : DRM) {
+ const ResourceInfo::ResourceBinding &Binding = RI.getBinding();
+ ResourceClass RC = DRTM[RI.getHandleTy()].getResourceClass();
+ if (!BoundRegs.isBound(RC, Binding.Space, Binding.LowerBound,
+ Binding.LowerBound + Binding.Size - 1))
+ reportRegNotBound(M, RC, Binding);
+ }
}
static mcdxbc::RootSignatureDesc *
@@ -236,7 +235,8 @@ getRootSignature(RootSignatureBindingInfo &RSBI,
static void reportErrors(Module &M, DXILResourceMap &DRM,
DXILResourceBindingInfo &DRBI,
RootSignatureBindingInfo &RSBI,
- dxil::ModuleMetadataInfo &MMI) {
+ dxil::ModuleMetadataInfo &MMI,
+ DXILResourceTypeMap &DRTM) {
if (DRM.hasInvalidCounterDirection())
reportInvalidDirection(M, DRM);
@@ -247,7 +247,7 @@ static void reportErrors(Module &M, DXILResourceMap &DRM,
"DXILResourceImplicitBinding pass");
if (mcdxbc::RootSignatureDesc *RSD = getRootSignature(RSBI, MMI))
- validateRootSignature(M, *RSD, MMI);
+ validateRootSignature(M, *RSD, MMI, DRM, DRTM);
}
PreservedAnalyses
@@ -256,8 +256,9 @@ DXILPostOptimizationValidation::run(Module &M, ModuleAnalysisManager &MAM) {
DXILResourceBindingInfo &DRBI = MAM.getResult<DXILResourceBindingAnalysis>(M);
RootSignatureBindingInfo &RSBI = MAM.getResult<RootSignatureAnalysis>(M);
ModuleMetadataInfo &MMI = MAM.getResult<DXILMetadataAnalysis>(M);
+ DXILResourceTypeMap &DRTM = MAM.getResult<DXILResourceTypeAnalysis>(M);
- reportErrors(M, DRM, DRBI, RSBI, MMI);
+ reportErrors(M, DRM, DRBI, RSBI, MMI, DRTM);
return PreservedAnalyses::all();
}
@@ -273,8 +274,10 @@ public:
getAnalysis<RootSignatureAnalysisWrapper>().getRSInfo();
dxil::ModuleMetadataInfo &MMI =
getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata();
+ DXILResourceTypeMap &DRTM =
+ getAnalysis<DXILResourceTypeWrapperPass>().getResourceTypeMap();
- reportErrors(M, DRM, DRBI, RSBI, MMI);
+ reportErrors(M, DRM, DRBI, RSBI, MMI, DRTM);
return false;
}
StringRef getPassName() const override {
@@ -288,6 +291,7 @@ public:
AU.addRequired<DXILResourceBindingWrapperPass>();
AU.addRequired<DXILMetadataAnalysisWrapperPass>();
AU.addRequired<RootSignatureAnalysisWrapper>();
+ AU.addRequired<DXILResourceTypeWrapperPass>();
AU.addPreserved<DXILResourceWrapperPass>();
AU.addPreserved<DXILResourceBindingWrapperPass>();
AU.addPreserved<DXILMetadataAnalysisWrapperPass>();
@@ -305,6 +309,7 @@ INITIALIZE_PASS_DEPENDENCY(DXILResourceTypeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass)
INITIALIZE_PASS_DEPENDENCY(RootSignatureAnalysisWrapper)
+INITIALIZE_PASS_DEPENDENCY(DXILResourceTypeWrapperPass)
INITIALIZE_PASS_END(DXILPostOptimizationValidationLegacy, DEBUG_TYPE,
"DXIL Post Optimization Validation", false, false)
diff --git a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
index c33ec0efd73c..6579d3405cf3 100644
--- a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
+++ b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
@@ -8,14 +8,19 @@
#include "DXILResourceAccess.h"
#include "DirectX.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/DXILResource.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsDirectX.h"
+#include "llvm/IR/User.h"
#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
#define DEBUG_TYPE "dxil-resource-access"
@@ -198,6 +203,112 @@ static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset,
llvm_unreachable("Unhandled case in switch");
}
+/// Gather the instructions in \p Start's basic block that are connected to
+/// \p Start through use-def edges.
+///
+/// The walk is seeded with the same-block users of \p Start. For every
+/// instruction it visits, it follows both that instruction's users and its
+/// instruction operands, staying inside the block and skipping operands that
+/// are \p Start itself. The visited instructions are returned sorted by their
+/// position within the block.
+static SmallVector<Instruction *> collectBlockUseDef(Instruction *Start) {
+  BasicBlock *Home = Start->getParent();
+  SmallPtrSet<Instruction *, 32> Seen;
+  SmallVector<Instruction *, 32> Pending;
+  SmallVector<Instruction *> Result;
+
+  // Queue V when it is an instruction that lives in Start's block.
+  auto QueueIfLocal = [&](Value *V) {
+    auto *Inst = dyn_cast<Instruction>(V);
+    if (Inst && Inst->getParent() == Home)
+      Pending.push_back(Inst);
+  };
+
+  for (User *U : Start->users())
+    QueueIfLocal(U);
+
+  // Worklist traversal over users and operands; each instruction is recorded
+  // the first time it is popped.
+  while (!Pending.empty()) {
+    Instruction *Cur = Pending.pop_back_val();
+    if (!Seen.insert(Cur).second)
+      continue;
+    Result.push_back(Cur);
+
+    for (User *U : Cur->users())
+      QueueIfLocal(U);
+    for (Use &Op : Cur->operands())
+      if (Op.get() != Start)
+        QueueIfLocal(Op.get());
+  }
+
+  // Number every instruction of the block, then sort the result by that
+  // number so callers see program order.
+  DenseMap<const Instruction *, unsigned> Position;
+  unsigned Next = 0;
+  for (Instruction &I : *Home)
+    Position[&I] = Next++;
+
+  llvm::sort(Result, [&Position](Instruction *L, Instruction *R) {
+    return Position.lookup(L) < Position.lookup(R);
+  });
+
+  return Result;
+}
+
+/// Clone the instructions of \p UsesInBlock into \p BB, inserting them before
+/// BB's last instruction.
+///
+/// \p Phi, and any PHI node found in \p UsesInBlock, is not cloned; it is
+/// mapped to its incoming value for \p BB instead. Each clone has its operands
+/// remapped through that mapping (values missing from the map are left as
+/// they are) and is itself recorded in the map, so later clones refer to
+/// earlier ones.
+static void phiNodeRemapHelper(PHINode *Phi, BasicBlock *BB,
+                               IRBuilder<> &Builder,
+                               SmallVector<Instruction *> &UsesInBlock) {
+  ValueToValueMapTy Remap;
+  Value *Incoming = Phi->getIncomingValueForBlock(BB);
+  Remap[Phi] = Incoming;
+  Builder.SetInsertPoint(&BB->back());
+
+  for (Instruction *Inst : UsesInBlock) {
+    auto *InnerPhi = dyn_cast<PHINode>(Inst);
+    if (!InnerPhi) {
+      Instruction *Copy = Inst->clone();
+      RemapInstruction(Copy, Remap,
+                       RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+      Builder.Insert(Copy);
+      Remap[Inst] = Copy;
+      continue;
+    }
+    // PHIs are substituted by their incoming value rather than cloned.
+    Remap[InnerPhi] = InnerPhi->getIncomingValueForBlock(BB);
+  }
+}
+
+/// Rewrite the PHI users of \p II by duplicating each PHI's same-block
+/// dependents into every incoming block.
+///
+/// For every distinct PHI that uses \p II, the instructions gathered by
+/// collectBlockUseDef are cloned into each incoming block with the PHI replaced
+/// by that block's incoming value (see phiNodeRemapHelper). The PHI and the
+/// gathered originals are then erased.
+///
+/// \param II              Call whose PHI users are rewritten.
+/// \param PrevBBDeadInsts Receives the last instruction of each incoming block
+///                        when the gathered instructions end in a return; the
+///                        caller is expected to erase these.
+/// \param DeadBB          Receives the PHI's own block in that same case; the
+///                        caller is expected to erase it.
+static void phiNodeReplacement(IntrinsicInst *II,
+                               SmallVectorImpl<Instruction *> &PrevBBDeadInsts,
+                               SetVector<BasicBlock *> &DeadBB) {
+  SmallVector<Instruction *> CurrBBDeadInsts;
+  // users() yields a user once per use, so a PHI that receives II along
+  // several edges shows up several times. Handle each PHI only once so nothing
+  // is cloned twice or queued for erasure twice.
+  SmallPtrSet<PHINode *, 4> SeenPhis;
+  for (User *U : II->users()) {
+    auto *Phi = dyn_cast<PHINode>(U);
+    if (!Phi || !SeenPhis.insert(Phi).second)
+      continue;
+
+    IRBuilder<> Builder(Phi);
+    SmallVector<Instruction *> UsesInBlock = collectBlockUseDef(Phi);
+    // The list is empty when the PHI has no dependents in its own block;
+    // back() must not be called in that case.
+    bool HasReturnUse =
+        !UsesInBlock.empty() && isa<ReturnInst>(UsesInBlock.back());
+
+    for (unsigned I = 0, E = Phi->getNumIncomingValues(); I < E; I++) {
+      auto *CurrIncomingBB = Phi->getIncomingBlock(I);
+      phiNodeRemapHelper(Phi, CurrIncomingBB, Builder, UsesInBlock);
+      if (HasReturnUse)
+        PrevBBDeadInsts.push_back(&CurrIncomingBB->back());
+    }
+
+    CurrBBDeadInsts.push_back(Phi);
+    CurrBBDeadInsts.append(UsesInBlock.begin(), UsesInBlock.end());
+
+    if (HasReturnUse)
+      DeadBB.insert(Phi->getParent());
+  }
+  // Erase in the reverse of the order queued: for each PHI the gathered
+  // instructions go last-to-first, and the PHI itself goes after them.
+  for (Instruction *Dead : llvm::reverse(CurrBBDeadInsts))
+    Dead->eraseFromParent();
+}
+
static void replaceAccess(IntrinsicInst *II, dxil::ResourceTypeInfo &RTI) {
// Process users keeping track of indexing accumulated from GEPs.
struct AccessAndOffset {
@@ -229,7 +340,6 @@ static void replaceAccess(IntrinsicInst *II, dxil::ResourceTypeInfo &RTI) {
} else if (auto *LI = dyn_cast<LoadInst>(Current.Access)) {
createLoadIntrinsic(II, LI, Current.Offset, RTI);
DeadInsts.push_back(LI);
-
} else
llvm_unreachable("Unhandled instruction - pointer escaped?");
}
@@ -242,13 +352,27 @@ static void replaceAccess(IntrinsicInst *II, dxil::ResourceTypeInfo &RTI) {
static bool transformResourcePointers(Function &F, DXILResourceTypeMap &DRTM) {
SmallVector<std::pair<IntrinsicInst *, dxil::ResourceTypeInfo>> Resources;
- for (BasicBlock &BB : F)
+ SetVector<BasicBlock *> DeadBB;
+ SmallVector<Instruction *> PrevBBDeadInsts;
+ for (BasicBlock &BB : make_early_inc_range(F)) {
+ for (Instruction &I : make_early_inc_range(BB))
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::dx_resource_getpointer)
+ phiNodeReplacement(II, PrevBBDeadInsts, DeadBB);
+
for (Instruction &I : BB)
if (auto *II = dyn_cast<IntrinsicInst>(&I))
if (II->getIntrinsicID() == Intrinsic::dx_resource_getpointer) {
auto *HandleTy = cast<TargetExtType>(II->getArgOperand(0)->getType());
Resources.emplace_back(II, DRTM[HandleTy]);
}
+ }
+ for (auto *Dead : PrevBBDeadInsts)
+ Dead->eraseFromParent();
+ PrevBBDeadInsts.clear();
+ for (auto *Dead : DeadBB)
+ Dead->eraseFromParent();
+ DeadBB.clear();
for (auto &[II, RI] : Resources)
replaceAccess(II, RI);
@@ -279,7 +403,6 @@ public:
bool runOnFunction(Function &F) override {
DXILResourceTypeMap &DRTM =
getAnalysis<DXILResourceTypeWrapperPass>().getResourceTypeMap();
-
return transformResourcePointers(F, DRTM);
}
StringRef getPassName() const override { return "DXIL Resource Access"; }
diff --git a/llvm/lib/Target/DirectX/DXILResourceImplicitBinding.cpp b/llvm/lib/Target/DirectX/DXILResourceImplicitBinding.cpp
index 6e69c5ac1d63..b0d9ad8da10e 100644
--- a/llvm/lib/Target/DirectX/DXILResourceImplicitBinding.cpp
+++ b/llvm/lib/Target/DirectX/DXILResourceImplicitBinding.cpp
@@ -111,8 +111,7 @@ static bool assignBindings(Module &M, DXILResourceBindingInfo &DRBI,
RegSlotOp, /* register slot */
IB.Call->getOperand(2), /* size */
IB.Call->getOperand(3), /* index */
- IB.Call->getOperand(4), /* non-uniform flag */
- IB.Call->getOperand(5)}); /* name */
+ IB.Call->getOperand(4)}); /* name */
IB.Call->replaceAllUsesWith(NewCall);
IB.Call->eraseFromParent();
Changed = true;
diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp
index a4f5086c2f42..ac3c7dde6b89 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp
@@ -24,9 +24,11 @@
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
+#include "llvm/MC/DXContainerRootSignature.h"
#include "llvm/Pass.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ScopedPrinter.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
@@ -70,6 +72,13 @@ analyzeModule(Module &M) {
if (RootSignatureNode == nullptr)
return RSDMap;
+ bool AllowNullFunctions = false;
+ if (M.getTargetTriple().getEnvironment() ==
+ Triple::EnvironmentType::RootSignature) {
+ assert(RootSignatureNode->getNumOperands() == 1);
+ AllowNullFunctions = true;
+ }
+
for (const auto &RSDefNode : RootSignatureNode->operands()) {
if (RSDefNode->getNumOperands() != 3) {
reportError(Ctx, "Invalid Root Signature metadata - expected function, "
@@ -78,24 +87,28 @@ analyzeModule(Module &M) {
}
// Function was pruned during compilation.
- const MDOperand &FunctionPointerMdNode = RSDefNode->getOperand(0);
- if (FunctionPointerMdNode == nullptr) {
- reportError(
- Ctx, "Function associated with Root Signature definition is null.");
- continue;
- }
+ Function *F = nullptr;
+
+ if (!AllowNullFunctions) {
+ const MDOperand &FunctionPointerMdNode = RSDefNode->getOperand(0);
+ if (FunctionPointerMdNode == nullptr) {
+ reportError(
+ Ctx, "Function associated with Root Signature definition is null.");
+ continue;
+ }
- ValueAsMetadata *VAM =
- llvm::dyn_cast<ValueAsMetadata>(FunctionPointerMdNode.get());
- if (VAM == nullptr) {
- reportError(Ctx, "First element of root signature is not a Value");
- continue;
- }
+ ValueAsMetadata *VAM =
+ llvm::dyn_cast<ValueAsMetadata>(FunctionPointerMdNode.get());
+ if (VAM == nullptr) {
+ reportError(Ctx, "First element of root signature is not a Value");
+ continue;
+ }
- Function *F = dyn_cast<Function>(VAM->getValue());
- if (F == nullptr) {
- reportError(Ctx, "First element of root signature is not a Function");
- continue;
+ F = dyn_cast<Function>(VAM->getValue());
+ if (F == nullptr) {
+ reportError(Ctx, "First element of root signature is not a Function");
+ continue;
+ }
}
Metadata *RootElementListOperand = RSDefNode->getOperand(1).get();
@@ -171,41 +184,41 @@ PreservedAnalyses RootSignatureAnalysisPrinter::run(Module &M,
<< "RootParametersOffset: " << RS.RootParameterOffset << "\n"
<< "NumParameters: " << RS.ParametersContainer.size() << "\n";
for (size_t I = 0; I < RS.ParametersContainer.size(); I++) {
- const auto &[Type, Loc] =
- RS.ParametersContainer.getTypeAndLocForParameter(I);
- const dxbc::RTS0::v1::RootParameterHeader Header =
- RS.ParametersContainer.getHeader(I);
-
- OS << "- Parameter Type: " << Type << "\n"
- << " Shader Visibility: " << Header.ShaderVisibility << "\n";
-
- switch (Type) {
- case llvm::to_underlying(dxbc::RootParameterType::Constants32Bit): {
- const dxbc::RTS0::v1::RootConstants &Constants =
- RS.ParametersContainer.getConstant(Loc);
+ const mcdxbc::RootParameterInfo &Info = RS.ParametersContainer.getInfo(I);
+
+ OS << "- Parameter Type: "
+ << enumToStringRef(Info.Type, dxbc::getRootParameterTypes()) << "\n"
+ << " Shader Visibility: "
+ << enumToStringRef(Info.Visibility, dxbc::getShaderVisibility())
+ << "\n";
+ switch (Info.Type) {
+ case dxbc::RootParameterType::Constants32Bit: {
+ const mcdxbc::RootConstants &Constants =
+ RS.ParametersContainer.getConstant(Info.Location);
OS << " Register Space: " << Constants.RegisterSpace << "\n"
<< " Shader Register: " << Constants.ShaderRegister << "\n"
<< " Num 32 Bit Values: " << Constants.Num32BitValues << "\n";
break;
}
- case llvm::to_underlying(dxbc::RootParameterType::CBV):
- case llvm::to_underlying(dxbc::RootParameterType::UAV):
- case llvm::to_underlying(dxbc::RootParameterType::SRV): {
- const dxbc::RTS0::v2::RootDescriptor &Descriptor =
- RS.ParametersContainer.getRootDescriptor(Loc);
+ case dxbc::RootParameterType::CBV:
+ case dxbc::RootParameterType::UAV:
+ case dxbc::RootParameterType::SRV: {
+ const mcdxbc::RootDescriptor &Descriptor =
+ RS.ParametersContainer.getRootDescriptor(Info.Location);
OS << " Register Space: " << Descriptor.RegisterSpace << "\n"
<< " Shader Register: " << Descriptor.ShaderRegister << "\n";
if (RS.Version > 1)
OS << " Flags: " << Descriptor.Flags << "\n";
break;
}
- case llvm::to_underlying(dxbc::RootParameterType::DescriptorTable): {
+ case dxbc::RootParameterType::DescriptorTable: {
const mcdxbc::DescriptorTable &Table =
- RS.ParametersContainer.getDescriptorTable(Loc);
+ RS.ParametersContainer.getDescriptorTable(Info.Location);
OS << " NumRanges: " << Table.Ranges.size() << "\n";
- for (const dxbc::RTS0::v2::DescriptorRange Range : Table) {
- OS << " - Range Type: " << Range.RangeType << "\n"
+ for (const mcdxbc::DescriptorRange &Range : Table) {
+ OS << " - Range Type: "
+ << dxil::getResourceClassName(Range.RangeType) << "\n"
<< " Register Space: " << Range.RegisterSpace << "\n"
<< " Base Shader Register: " << Range.BaseShaderRegister << "\n"
<< " Num Descriptors: " << Range.NumDescriptors << "\n"
diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
index 82bcacee7a6d..9eebcc9b1306 100644
--- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
+++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
@@ -127,6 +127,8 @@ static StringRef getShortShaderStage(Triple::EnvironmentType Env) {
return "ms";
case Triple::Amplification:
return "as";
+ case Triple::RootSignature:
+ return "rootsig";
default:
break;
}
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
index 1d79c3018439..bc1a3a7995bd 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
@@ -2113,7 +2113,7 @@ void DXILBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
}
break;
case Instruction::GetElementPtr: {
- Code = bitc::CST_CODE_CE_GEP;
+ Code = bitc::CST_CODE_CE_GEP_OLD;
const auto *GO = cast<GEPOperator>(C);
if (GO->isInBounds())
Code = bitc::CST_CODE_CE_INBOUNDS_GEP;
diff --git a/llvm/lib/Target/DirectX/DirectXIRPasses/PointerTypeAnalysis.cpp b/llvm/lib/Target/DirectX/DirectXIRPasses/PointerTypeAnalysis.cpp
index f99bb4f4eaee..c2e139edc6bd 100644
--- a/llvm/lib/Target/DirectX/DirectXIRPasses/PointerTypeAnalysis.cpp
+++ b/llvm/lib/Target/DirectX/DirectXIRPasses/PointerTypeAnalysis.cpp
@@ -15,25 +15,39 @@
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
using namespace llvm;
using namespace llvm::dxil;
namespace {
+Type *classifyFunctionType(const Function &F, PointerTypeMap &Map);
+
// Classifies the type of the value passed in by walking the value's users to
// find a typed instruction to materialize a type from.
Type *classifyPointerType(const Value *V, PointerTypeMap &Map) {
assert(V->getType()->isPointerTy() &&
"classifyPointerType called with non-pointer");
+
+ // A CallInst will trigger this case, and we want to classify its Function
+ // operand as a Function rather than a generic Value.
+ if (const Function *F = dyn_cast<Function>(V))
+ return classifyFunctionType(*F, Map);
+
+ // There can potentially be dead constants hanging off of the globals we do
+ // not want to deal with. So we remove them here.
+ if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+ GV->removeDeadConstantUsers();
+
auto It = Map.find(V);
if (It != Map.end())
return It->second;
Type *PointeeTy = nullptr;
- if (auto *Inst = dyn_cast<GetElementPtrInst>(V)) {
- if (!Inst->getResultElementType()->isPointerTy())
- PointeeTy = Inst->getResultElementType();
+ if (auto *GEP = dyn_cast<GEPOperator>(V)) {
+ if (!GEP->getResultElementType()->isPointerTy())
+ PointeeTy = GEP->getResultElementType();
} else if (auto *Inst = dyn_cast<AllocaInst>(V)) {
PointeeTy = Inst->getAllocatedType();
} else if (auto *GV = dyn_cast<GlobalVariable>(V)) {
@@ -49,8 +63,8 @@ Type *classifyPointerType(const Value *V, PointerTypeMap &Map) {
// When store value is ptr type, cannot get more type info.
if (NewPointeeTy->isPointerTy())
continue;
- } else if (const auto *Inst = dyn_cast<GetElementPtrInst>(User)) {
- NewPointeeTy = Inst->getSourceElementType();
+ } else if (const auto *GEP = dyn_cast<GEPOperator>(User)) {
+ NewPointeeTy = GEP->getSourceElementType();
}
if (NewPointeeTy) {
// HLSL doesn't support pointers, so it is unlikely to get more than one
@@ -204,6 +218,9 @@ PointerTypeMap PointerTypeAnalysis::run(const Module &M) {
for (const auto &I : B) {
if (I.getType()->isPointerTy())
classifyPointerType(&I, Map);
+ for (const auto &O : I.operands())
+ if (O.get()->getType()->isPointerTy())
+ classifyPointerType(O.get(), Map);
}
}
}
diff --git a/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp
index 07b68648f16c..bb2efa43d818 100644
--- a/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp
+++ b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp
@@ -11,10 +11,14 @@
//===----------------------------------------------------------------------===//
#include "DirectXInstrInfo.h"
+#include "DirectXSubtarget.h"
#define GET_INSTRINFO_CTOR_DTOR
#include "DirectXGenInstrInfo.inc"
using namespace llvm;
+DirectXInstrInfo::DirectXInstrInfo(const DirectXSubtarget &STI)
+ : DirectXGenInstrInfo(STI) {} // Only forwards STI to the base class from DirectXGenInstrInfo.inc.
+
DirectXInstrInfo::~DirectXInstrInfo() {}
diff --git a/llvm/lib/Target/DirectX/DirectXInstrInfo.h b/llvm/lib/Target/DirectX/DirectXInstrInfo.h
index e2c7036fc74a..57ede28030b2 100644
--- a/llvm/lib/Target/DirectX/DirectXInstrInfo.h
+++ b/llvm/lib/Target/DirectX/DirectXInstrInfo.h
@@ -20,9 +20,11 @@
#include "DirectXGenInstrInfo.inc"
namespace llvm {
+class DirectXSubtarget;
+
struct DirectXInstrInfo : public DirectXGenInstrInfo {
const DirectXRegisterInfo RI;
- explicit DirectXInstrInfo() : DirectXGenInstrInfo() {}
+ explicit DirectXInstrInfo(const DirectXSubtarget &STI);
const DirectXRegisterInfo &getRegisterInfo() const { return RI; }
~DirectXInstrInfo() override;
};
diff --git a/llvm/lib/Target/DirectX/DirectXSubtarget.cpp b/llvm/lib/Target/DirectX/DirectXSubtarget.cpp
index 526b7d29fb13..f8519177cc2d 100644
--- a/llvm/lib/Target/DirectX/DirectXSubtarget.cpp
+++ b/llvm/lib/Target/DirectX/DirectXSubtarget.cpp
@@ -24,6 +24,7 @@ using namespace llvm;
DirectXSubtarget::DirectXSubtarget(const Triple &TT, StringRef CPU,
StringRef FS, const DirectXTargetMachine &TM)
- : DirectXGenSubtargetInfo(TT, CPU, CPU, FS), FL(*this), TL(TM, *this) {}
+ : DirectXGenSubtargetInfo(TT, CPU, CPU, FS), InstrInfo(*this), FL(*this),
+ TL(TM, *this) {}
void DirectXSubtarget::anchor() {}
diff --git a/llvm/lib/Target/DirectX/DirectXSubtarget.h b/llvm/lib/Target/DirectX/DirectXSubtarget.h
index b2374caaf3cd..f3d71c4c4e3b 100644
--- a/llvm/lib/Target/DirectX/DirectXSubtarget.h
+++ b/llvm/lib/Target/DirectX/DirectXSubtarget.h
@@ -28,9 +28,9 @@ namespace llvm {
class DirectXTargetMachine;
class DirectXSubtarget : public DirectXGenSubtargetInfo {
+ DirectXInstrInfo InstrInfo;
DirectXFrameLowering FL;
DirectXTargetLowering TL;
- DirectXInstrInfo InstrInfo;
virtual void anchor(); // virtual anchor method
diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index de10092cbe3c..0639878c1256 100644
--- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -173,6 +173,19 @@ static DecodeStatus s32_0ImmDecoder(MCInst &MI, unsigned tmp,
const MCDisassembler *Decoder);
static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
const MCDisassembler *Decoder);
+
+/// Decoder hook that appends a constant -1 expression operand to \p MI. The
+/// expression is created in the context owned by \p Decoder. Always reports
+/// success.
+static DecodeStatus n1ConstDecoder(MCInst &MI, const MCDisassembler *Decoder) {
+  const auto *MinusOne = MCConstantExpr::create(-1, Decoder->getContext());
+  MI.addOperand(MCOperand::createExpr(MinusOne));
+  return DecodeStatus::Success;
+}
+
+/// Decoder hook that appends the Hexagon::SGP1_0 register operand to \p MI.
+/// \p Decoder is not consulted. Always reports success.
+static DecodeStatus sgp10ConstDecoder(MCInst &MI,
+                                      const MCDisassembler *Decoder) {
+  MCOperand Sgp10 = MCOperand::createReg(Hexagon::SGP1_0);
+  MI.addOperand(Sgp10);
+  return DecodeStatus::Success;
+}
+
#include "HexagonDepDecoders.inc"
#include "HexagonGenDisassemblerTables.inc"
@@ -349,21 +362,6 @@ void HexagonDisassembler::remapInstruction(MCInst &Instr) const {
}
}
-static void adjustDuplex(MCInst &MI, MCContext &Context) {
- switch (MI.getOpcode()) {
- case Hexagon::SA1_setin1:
- MI.insert(MI.begin() + 1,
- MCOperand::createExpr(MCConstantExpr::create(-1, Context)));
- break;
- case Hexagon::SA1_dec:
- MI.insert(MI.begin() + 2,
- MCOperand::createExpr(MCConstantExpr::create(-1, Context)));
- break;
- default:
- break;
- }
-}
-
DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB,
ArrayRef<uint8_t> Bytes,
uint64_t Address,
@@ -468,12 +466,10 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB,
CurrentExtender = TmpExtender;
if (Result != DecodeStatus::Success)
return DecodeStatus::Fail;
- adjustDuplex(*MILow, getContext());
Result = decodeInstruction(
DecodeHigh, *MIHigh, (Instruction >> 16) & 0x1fff, Address, this, STI);
if (Result != DecodeStatus::Success)
return DecodeStatus::Fail;
- adjustDuplex(*MIHigh, getContext());
MCOperand OPLow = MCOperand::createInst(MILow);
MCOperand OPHigh = MCOperand::createInst(MIHigh);
MI.addOperand(OPLow);
@@ -499,41 +495,6 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB,
}
- switch (MI.getOpcode()) {
- case Hexagon::J4_cmpeqn1_f_jumpnv_nt:
- case Hexagon::J4_cmpeqn1_f_jumpnv_t:
- case Hexagon::J4_cmpeqn1_fp0_jump_nt:
- case Hexagon::J4_cmpeqn1_fp0_jump_t:
- case Hexagon::J4_cmpeqn1_fp1_jump_nt:
- case Hexagon::J4_cmpeqn1_fp1_jump_t:
- case Hexagon::J4_cmpeqn1_t_jumpnv_nt:
- case Hexagon::J4_cmpeqn1_t_jumpnv_t:
- case Hexagon::J4_cmpeqn1_tp0_jump_nt:
- case Hexagon::J4_cmpeqn1_tp0_jump_t:
- case Hexagon::J4_cmpeqn1_tp1_jump_nt:
- case Hexagon::J4_cmpeqn1_tp1_jump_t:
- case Hexagon::J4_cmpgtn1_f_jumpnv_nt:
- case Hexagon::J4_cmpgtn1_f_jumpnv_t:
- case Hexagon::J4_cmpgtn1_fp0_jump_nt:
- case Hexagon::J4_cmpgtn1_fp0_jump_t:
- case Hexagon::J4_cmpgtn1_fp1_jump_nt:
- case Hexagon::J4_cmpgtn1_fp1_jump_t:
- case Hexagon::J4_cmpgtn1_t_jumpnv_nt:
- case Hexagon::J4_cmpgtn1_t_jumpnv_t:
- case Hexagon::J4_cmpgtn1_tp0_jump_nt:
- case Hexagon::J4_cmpgtn1_tp0_jump_t:
- case Hexagon::J4_cmpgtn1_tp1_jump_nt:
- case Hexagon::J4_cmpgtn1_tp1_jump_t:
- MI.insert(MI.begin() + 1,
- MCOperand::createExpr(MCConstantExpr::create(-1, getContext())));
- break;
- case Hexagon::Y4_crswap10:
- MI.addOperand(MCOperand::createReg(Hexagon::SGP1_0));
- break;
- default:
- break;
- }
-
if (HexagonMCInstrInfo::isNewValue(*MCII, MI)) {
unsigned OpIndex = HexagonMCInstrInfo::getNewValueOp(*MCII, MI);
MCOperand &MCO = MI.getOperand(OpIndex);
diff --git a/llvm/lib/Target/Hexagon/Hexagon.td b/llvm/lib/Target/Hexagon/Hexagon.td
index 0dbe743d13ed..6d0529fb4277 100644
--- a/llvm/lib/Target/Hexagon/Hexagon.td
+++ b/llvm/lib/Target/Hexagon/Hexagon.td
@@ -176,8 +176,11 @@ def UseSmallData : Predicate<"HST->useSmallData()">;
def UseCabac : Predicate<"HST->useCabac()">,
AssemblerPredicate<(any_of FeatureCabac)>;
-def Hvx64: HwMode<"+hvx-length64b", [UseHVX64B]>;
-def Hvx128: HwMode<"+hvx-length128b", [UseHVX128B]>;
+def : HwModePredicateProlog<[{
+ const auto *HST = static_cast<const HexagonSubtarget *>(this);
+}]>;
+def Hvx64: HwMode<[UseHVX64B]>;
+def Hvx128: HwMode<[UseHVX128B]>;
//===----------------------------------------------------------------------===//
// Classes used for relation maps.
diff --git a/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td b/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td
index 75e87c95f2c4..f48695c6ebc0 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td
@@ -38,11 +38,7 @@ class Enc_041d7b : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <4> Rs16;
let Inst{19-16} = Rs16{3-0};
- bits <5> n1;
- let Inst{28-28} = n1{4-4};
- let Inst{24-23} = n1{3-2};
- let Inst{13-13} = n1{1-1};
- let Inst{8-8} = n1{0-0};
+ bits <0> n1;
}
class Enc_046afa : OpcodeHexagon {
bits <1> Mu2;
@@ -244,10 +240,7 @@ class Enc_14640c : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <4> Rs16;
let Inst{19-16} = Rs16{3-0};
- bits <5> n1;
- let Inst{28-28} = n1{4-4};
- let Inst{24-22} = n1{3-1};
- let Inst{13-13} = n1{0-0};
+ bits <0> n1;
}
class Enc_14d27a : OpcodeHexagon {
bits <5> II;
@@ -300,11 +293,7 @@ class Enc_178717 : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <4> Rs16;
let Inst{19-16} = Rs16{3-0};
- bits <6> n1;
- let Inst{28-28} = n1{5-5};
- let Inst{25-23} = n1{4-2};
- let Inst{13-13} = n1{1-1};
- let Inst{8-8} = n1{0-0};
+ bits <0> n1;
}
class Enc_179b35 : OpcodeHexagon {
bits <5> Rs32;
@@ -384,9 +373,7 @@ class Enc_1de724 : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <4> Rs16;
let Inst{19-16} = Rs16{3-0};
- bits <4> n1;
- let Inst{28-28} = n1{3-3};
- let Inst{24-22} = n1{2-0};
+ bits <0> n1;
}
class Enc_1ef990 : OpcodeHexagon {
bits <2> Pv4;
@@ -772,10 +759,7 @@ class Enc_3694bd : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <3> Ns8;
let Inst{18-16} = Ns8{2-0};
- bits <5> n1;
- let Inst{29-29} = n1{4-4};
- let Inst{26-25} = n1{3-2};
- let Inst{23-22} = n1{1-0};
+ bits <0> n1;
}
class Enc_372c9d : OpcodeHexagon {
bits <2> Pv4;
@@ -820,10 +804,7 @@ class Enc_3a2484 : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <4> Rs16;
let Inst{19-16} = Rs16{3-0};
- bits <4> n1;
- let Inst{28-28} = n1{3-3};
- let Inst{24-23} = n1{2-1};
- let Inst{13-13} = n1{0-0};
+ bits <0> n1;
}
class Enc_3a3d62 : OpcodeHexagon {
bits <5> Rs32;
@@ -883,10 +864,7 @@ class Enc_3e3989 : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <4> Rs16;
let Inst{19-16} = Rs16{3-0};
- bits <6> n1;
- let Inst{28-28} = n1{5-5};
- let Inst{25-22} = n1{4-1};
- let Inst{8-8} = n1{0-0};
+ bits <0> n1;
}
class Enc_3f97c8 : OpcodeHexagon {
bits <6> Ii;
@@ -916,9 +894,7 @@ class Enc_405228 : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <4> Rs16;
let Inst{19-16} = Rs16{3-0};
- bits <3> n1;
- let Inst{28-28} = n1{2-2};
- let Inst{24-23} = n1{1-0};
+ bits <0> n1;
}
class Enc_412ff0 : OpcodeHexagon {
bits <5> Rss32;
@@ -1046,9 +1022,7 @@ class Enc_4aca3a : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <3> Ns8;
let Inst{18-16} = Ns8{2-0};
- bits <3> n1;
- let Inst{29-29} = n1{2-2};
- let Inst{26-25} = n1{1-0};
+ bits <0> n1;
}
class Enc_4b39e4 : OpcodeHexagon {
bits <3> Ii;
@@ -1265,11 +1239,7 @@ class Enc_5a18b3 : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <3> Ns8;
let Inst{18-16} = Ns8{2-0};
- bits <5> n1;
- let Inst{29-29} = n1{4-4};
- let Inst{26-25} = n1{3-2};
- let Inst{22-22} = n1{1-1};
- let Inst{13-13} = n1{0-0};
+ bits <0> n1;
}
class Enc_5ab2be : OpcodeHexagon {
bits <5> Rs32;
@@ -1445,11 +1415,7 @@ class Enc_6413b6 : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <3> Ns8;
let Inst{18-16} = Ns8{2-0};
- bits <5> n1;
- let Inst{29-29} = n1{4-4};
- let Inst{26-25} = n1{3-2};
- let Inst{23-23} = n1{1-1};
- let Inst{13-13} = n1{0-0};
+ bits <0> n1;
}
class Enc_645d54 : OpcodeHexagon {
bits <2> Ii;
@@ -1490,9 +1456,7 @@ class Enc_668704 : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <4> Rs16;
let Inst{19-16} = Rs16{3-0};
- bits <5> n1;
- let Inst{28-28} = n1{4-4};
- let Inst{25-22} = n1{3-0};
+ bits <0> n1;
}
class Enc_66bce1 : OpcodeHexagon {
bits <11> Ii;
@@ -1650,9 +1614,7 @@ class Enc_736575 : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <4> Rs16;
let Inst{19-16} = Rs16{3-0};
- bits <4> n1;
- let Inst{28-28} = n1{3-3};
- let Inst{25-23} = n1{2-0};
+ bits <0> n1;
}
class Enc_74aef2 : OpcodeHexagon {
bits <4> Ii;
@@ -1718,8 +1680,7 @@ class Enc_79b8c8 : OpcodeHexagon {
class Enc_7a0ea6 : OpcodeHexagon {
bits <4> Rd16;
let Inst{3-0} = Rd16{3-0};
- bits <1> n1;
- let Inst{9-9} = n1{0-0};
+ bits <0> n1;
}
class Enc_7b523d : OpcodeHexagon {
bits <5> Vu32;
@@ -1805,10 +1766,7 @@ class Enc_800e04 : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <4> Rs16;
let Inst{19-16} = Rs16{3-0};
- bits <6> n1;
- let Inst{28-28} = n1{5-5};
- let Inst{25-22} = n1{4-1};
- let Inst{13-13} = n1{0-0};
+ bits <0> n1;
}
class Enc_80296d : OpcodeHexagon {
bits <5> Rs32;
@@ -2067,10 +2025,7 @@ class Enc_8e583a : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <4> Rs16;
let Inst{19-16} = Rs16{3-0};
- bits <5> n1;
- let Inst{28-28} = n1{4-4};
- let Inst{25-23} = n1{3-1};
- let Inst{13-13} = n1{0-0};
+ bits <0> n1;
}
class Enc_8f7633 : OpcodeHexagon {
bits <5> Rs32;
@@ -2361,10 +2316,7 @@ class Enc_a42857 : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <4> Rs16;
let Inst{19-16} = Rs16{3-0};
- bits <5> n1;
- let Inst{28-28} = n1{4-4};
- let Inst{24-22} = n1{3-1};
- let Inst{8-8} = n1{0-0};
+ bits <0> n1;
}
class Enc_a4ef14 : OpcodeHexagon {
bits <5> Rd32;
@@ -2413,11 +2365,7 @@ class Enc_a6853f : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <3> Ns8;
let Inst{18-16} = Ns8{2-0};
- bits <6> n1;
- let Inst{29-29} = n1{5-5};
- let Inst{26-25} = n1{4-3};
- let Inst{23-22} = n1{2-1};
- let Inst{13-13} = n1{0-0};
+ bits <0> n1;
}
class Enc_a6ce9c : OpcodeHexagon {
bits <6> Ii;
@@ -2593,10 +2541,7 @@ class Enc_b1e1fb : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <4> Rs16;
let Inst{19-16} = Rs16{3-0};
- bits <5> n1;
- let Inst{28-28} = n1{4-4};
- let Inst{25-23} = n1{3-1};
- let Inst{8-8} = n1{0-0};
+ bits <0> n1;
}
class Enc_b388cf : OpcodeHexagon {
bits <5> Ii;
@@ -2661,10 +2606,7 @@ class Enc_b78edd : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <4> Rs16;
let Inst{19-16} = Rs16{3-0};
- bits <4> n1;
- let Inst{28-28} = n1{3-3};
- let Inst{24-23} = n1{2-1};
- let Inst{8-8} = n1{0-0};
+ bits <0> n1;
}
class Enc_b7fad3 : OpcodeHexagon {
bits <2> Pv4;
@@ -2715,11 +2657,7 @@ class Enc_b909d2 : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <4> Rs16;
let Inst{19-16} = Rs16{3-0};
- bits <7> n1;
- let Inst{28-28} = n1{6-6};
- let Inst{25-22} = n1{5-2};
- let Inst{13-13} = n1{1-1};
- let Inst{8-8} = n1{0-0};
+ bits <0> n1;
}
class Enc_b91167 : OpcodeHexagon {
bits <2> Ii;
@@ -3335,10 +3273,7 @@ class Enc_e90a15 : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <3> Ns8;
let Inst{18-16} = Ns8{2-0};
- bits <4> n1;
- let Inst{29-29} = n1{3-3};
- let Inst{26-25} = n1{2-1};
- let Inst{22-22} = n1{0-0};
+ bits <0> n1;
}
class Enc_e957fb : OpcodeHexagon {
bits <12> Ii;
@@ -3417,8 +3352,7 @@ class Enc_ee5ed0 : OpcodeHexagon {
let Inst{7-4} = Rs16{3-0};
bits <4> Rd16;
let Inst{3-0} = Rd16{3-0};
- bits <2> n1;
- let Inst{9-8} = n1{1-0};
+ bits <0> n1;
}
class Enc_ef601b : OpcodeHexagon {
bits <4> Ii;
@@ -3531,11 +3465,7 @@ class Enc_f6fe0b : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <4> Rs16;
let Inst{19-16} = Rs16{3-0};
- bits <6> n1;
- let Inst{28-28} = n1{5-5};
- let Inst{24-22} = n1{4-2};
- let Inst{13-13} = n1{1-1};
- let Inst{8-8} = n1{0-0};
+ bits <0> n1;
}
class Enc_f7430e : OpcodeHexagon {
bits <4> Ii;
@@ -3574,10 +3504,7 @@ class Enc_f7ea77 : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <3> Ns8;
let Inst{18-16} = Ns8{2-0};
- bits <4> n1;
- let Inst{29-29} = n1{3-3};
- let Inst{26-25} = n1{2-1};
- let Inst{13-13} = n1{0-0};
+ bits <0> n1;
}
class Enc_f82302 : OpcodeHexagon {
bits <11> Ii;
@@ -3585,10 +3512,7 @@ class Enc_f82302 : OpcodeHexagon {
let Inst{7-1} = Ii{8-2};
bits <3> Ns8;
let Inst{18-16} = Ns8{2-0};
- bits <4> n1;
- let Inst{29-29} = n1{3-3};
- let Inst{26-25} = n1{2-1};
- let Inst{23-23} = n1{0-0};
+ bits <0> n1;
}
class Enc_f82eaf : OpcodeHexagon {
bits <8> Ii;
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 64bc5ca134c8..45d194e944fb 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -117,9 +117,10 @@ const int Hexagon_ADDI_OFFSET_MIN = -32768;
// Pin the vtable to this file.
void HexagonInstrInfo::anchor() {}
-HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST)
- : HexagonGenInstrInfo(Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP),
- Subtarget(ST) {}
+HexagonInstrInfo::HexagonInstrInfo(const HexagonSubtarget &ST)
+ : HexagonGenInstrInfo(ST, Hexagon::ADJCALLSTACKDOWN,
+ Hexagon::ADJCALLSTACKUP),
+ Subtarget(ST) {}
namespace llvm {
namespace HexagonFUnits {
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
index 086cb1fdd8ac..c17e5277ae2e 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -45,7 +45,7 @@ class HexagonInstrInfo : public HexagonGenInstrInfo {
virtual void anchor();
public:
- explicit HexagonInstrInfo(HexagonSubtarget &ST);
+ explicit HexagonInstrInfo(const HexagonSubtarget &ST);
/// TargetInstrInfo overrides.
diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index 72575f2560a3..1057b88530f4 100644
--- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -42,6 +42,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/RuntimeLibcalls.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
@@ -104,9 +105,6 @@ static cl::opt<bool> HexagonVolatileMemcpy(
static cl::opt<unsigned> SimplifyLimit("hlir-simplify-limit", cl::init(10000),
cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR"));
-static const char *HexagonVolatileMemcpyName
- = "hexagon_memcpy_forward_vp4cp4n2";
-
namespace {
class HexagonLoopIdiomRecognize {
@@ -2246,6 +2244,11 @@ CleanupAndExit:
Type *PtrTy = PointerType::get(Ctx, 0);
Type *VoidTy = Type::getVoidTy(Ctx);
Module *M = Func->getParent();
+
+ // FIXME: This should check if the call is supported
+ StringRef HexagonVolatileMemcpyName =
+ RTLIB::RuntimeLibcallsInfo::getLibcallImplName(
+ RTLIB::impl_hexagon_memcpy_forward_vp4cp4n2);
FunctionCallee Fn = M->getOrInsertFunction(
HexagonVolatileMemcpyName, VoidTy, PtrTy, PtrTy, Int32Ty);
diff --git a/llvm/lib/Target/Hexagon/HexagonOperands.td b/llvm/lib/Target/Hexagon/HexagonOperands.td
index 5134626c65c7..df5d32c13a73 100644
--- a/llvm/lib/Target/Hexagon/HexagonOperands.td
+++ b/llvm/lib/Target/Hexagon/HexagonOperands.td
@@ -27,9 +27,15 @@ def u9_0ImmPred : PatLeaf<(i32 imm), [{
def u64_0ImmOperand : AsmOperandClass { let Name = "u64_0Imm"; let RenderMethod = "addImmOperands"; }
def u64_0Imm : Operand<i64> { let ParserMatchClass = u64_0ImmOperand; }
def n1ConstOperand : AsmOperandClass { let Name = "n1Const"; }
-def n1Const : Operand<i32> { let ParserMatchClass = n1ConstOperand; }
+def n1Const : Operand<i32> {
+ let ParserMatchClass = n1ConstOperand;
+ let DecoderMethod = "n1ConstDecoder";
+}
def sgp10ConstOperand : AsmOperandClass { let Name = "sgp10Const"; }
-def sgp10Const : Operand<i32> { let ParserMatchClass = sgp10ConstOperand; }
+def sgp10Const : Operand<i32> {
+ let ParserMatchClass = sgp10ConstOperand;
+ let DecoderMethod = "sgp10ConstDecoder";
+}
def bblabel : Operand<i32>;
def bbl : SDNode<"ISD::BasicBlock", SDTPtrLeaf, [], "BasicBlockSDNode">;
diff --git a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
index c24700b89634..9cd0636306b1 100644
--- a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
+++ b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
@@ -47,34 +47,100 @@ LLVMInitializeLanaiDisassembler() {
LanaiDisassembler::LanaiDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
: MCDisassembler(STI, Ctx) {}
-// Forward declare because the autogenerated code will reference this.
-// Definition is further down.
-static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
+// clang-format off
+static const unsigned GPRDecoderTable[] = {
+ Lanai::R0, Lanai::R1, Lanai::PC, Lanai::R3, Lanai::SP, Lanai::FP,
+ Lanai::R6, Lanai::R7, Lanai::RV, Lanai::R9, Lanai::RR1, Lanai::RR2,
+ Lanai::R12, Lanai::R13, Lanai::R14, Lanai::RCA, Lanai::R16, Lanai::R17,
+ Lanai::R18, Lanai::R19, Lanai::R20, Lanai::R21, Lanai::R22, Lanai::R23,
+ Lanai::R24, Lanai::R25, Lanai::R26, Lanai::R27, Lanai::R28, Lanai::R29,
+ Lanai::R30, Lanai::R31
+};
+// clang-format on
+
+DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t /*Address*/,
+ const MCDisassembler * /*Decoder*/) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+
+ unsigned Reg = GPRDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
static DecodeStatus decodeRiMemoryValue(MCInst &Inst, unsigned Insn,
uint64_t Address,
- const MCDisassembler *Decoder);
+ const MCDisassembler *Decoder) {
+ // RI memory values encoded using 23 bits:
+ // 5 bit register, 16 bit constant
+ unsigned Register = (Insn >> 18) & 0x1f;
+ Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
+ unsigned Offset = (Insn & 0xffff);
+ Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Offset)));
+
+ return MCDisassembler::Success;
+}
static DecodeStatus decodeRrMemoryValue(MCInst &Inst, unsigned Insn,
uint64_t Address,
- const MCDisassembler *Decoder);
+ const MCDisassembler *Decoder) {
+ // RR memory values encoded using 20 bits:
+ // 5 bit register, 5 bit register, 2 bit PQ, 3 bit ALU operator, 5 bit JJJJJ
+ unsigned Register = (Insn >> 15) & 0x1f;
+ Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
+ Register = (Insn >> 10) & 0x1f;
+ Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
+
+ return MCDisassembler::Success;
+}
static DecodeStatus decodeSplsValue(MCInst &Inst, unsigned Insn,
uint64_t Address,
- const MCDisassembler *Decoder);
+ const MCDisassembler *Decoder) {
+ // RI memory values encoded using 17 bits:
+ // 5 bit register, 10 bit constant
+ unsigned Register = (Insn >> 12) & 0x1f;
+ Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
+ unsigned Offset = (Insn & 0x3ff);
+ Inst.addOperand(MCOperand::createImm(SignExtend32<10>(Offset)));
-static DecodeStatus decodeBranch(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
+ return MCDisassembler::Success;
+}
-static DecodeStatus decodePredicateOperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
+static bool tryAddingSymbolicOperand(int64_t Value, bool IsBranch,
+ uint64_t Address, uint64_t Offset,
+ uint64_t Width, MCInst &MI,
+ const MCDisassembler *Decoder) {
+ return Decoder->tryAddingSymbolicOperand(MI, Value, Address, IsBranch, Offset,
+ Width, /*InstSize=*/0);
+}
+
+static DecodeStatus decodeBranch(MCInst &MI, unsigned Insn, uint64_t Address,
+ const MCDisassembler *Decoder) {
+ if (!tryAddingSymbolicOperand(Insn + Address, false, Address, 2, 23, MI,
+ Decoder))
+ MI.addOperand(MCOperand::createImm(Insn));
+ return MCDisassembler::Success;
+}
static DecodeStatus decodeShiftImm(MCInst &Inst, unsigned Insn,
uint64_t Address,
- const MCDisassembler *Decoder);
+ const MCDisassembler *Decoder) {
+ unsigned Offset = (Insn & 0xffff);
+ Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Offset)));
+
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodePredicateOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ if (Val >= LPCC::UNKNOWN)
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::createImm(Val));
+ return MCDisassembler::Success;
+}
#include "LanaiGenDisassemblerTables.inc"
@@ -157,95 +223,3 @@ LanaiDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
return MCDisassembler::Fail;
}
-
-static const unsigned GPRDecoderTable[] = {
- Lanai::R0, Lanai::R1, Lanai::PC, Lanai::R3, Lanai::SP, Lanai::FP,
- Lanai::R6, Lanai::R7, Lanai::RV, Lanai::R9, Lanai::RR1, Lanai::RR2,
- Lanai::R12, Lanai::R13, Lanai::R14, Lanai::RCA, Lanai::R16, Lanai::R17,
- Lanai::R18, Lanai::R19, Lanai::R20, Lanai::R21, Lanai::R22, Lanai::R23,
- Lanai::R24, Lanai::R25, Lanai::R26, Lanai::R27, Lanai::R28, Lanai::R29,
- Lanai::R30, Lanai::R31};
-
-DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t /*Address*/,
- const MCDisassembler * /*Decoder*/) {
- if (RegNo > 31)
- return MCDisassembler::Fail;
-
- unsigned Reg = GPRDecoderTable[RegNo];
- Inst.addOperand(MCOperand::createReg(Reg));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeRiMemoryValue(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- // RI memory values encoded using 23 bits:
- // 5 bit register, 16 bit constant
- unsigned Register = (Insn >> 18) & 0x1f;
- Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
- unsigned Offset = (Insn & 0xffff);
- Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Offset)));
-
- return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeRrMemoryValue(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- // RR memory values encoded using 20 bits:
- // 5 bit register, 5 bit register, 2 bit PQ, 3 bit ALU operator, 5 bit JJJJJ
- unsigned Register = (Insn >> 15) & 0x1f;
- Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
- Register = (Insn >> 10) & 0x1f;
- Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
-
- return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeSplsValue(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- // RI memory values encoded using 17 bits:
- // 5 bit register, 10 bit constant
- unsigned Register = (Insn >> 12) & 0x1f;
- Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
- unsigned Offset = (Insn & 0x3ff);
- Inst.addOperand(MCOperand::createImm(SignExtend32<10>(Offset)));
-
- return MCDisassembler::Success;
-}
-
-static bool tryAddingSymbolicOperand(int64_t Value, bool IsBranch,
- uint64_t Address, uint64_t Offset,
- uint64_t Width, MCInst &MI,
- const MCDisassembler *Decoder) {
- return Decoder->tryAddingSymbolicOperand(MI, Value, Address, IsBranch, Offset,
- Width, /*InstSize=*/0);
-}
-
-static DecodeStatus decodeBranch(MCInst &MI, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder) {
- if (!tryAddingSymbolicOperand(Insn + Address, false, Address, 2, 23, MI,
- Decoder))
- MI.addOperand(MCOperand::createImm(Insn));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeShiftImm(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned Offset = (Insn & 0xffff);
- Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Offset)));
-
- return MCDisassembler::Success;
-}
-
-static DecodeStatus decodePredicateOperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- if (Val >= LPCC::UNKNOWN)
- return MCDisassembler::Fail;
- Inst.addOperand(MCOperand::createImm(Val));
- return MCDisassembler::Success;
-} \ No newline at end of file
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
index 4ca97da16cde..02ed1001cd0d 100644
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -13,6 +13,7 @@
#include "LanaiInstrInfo.h"
#include "LanaiAluCode.h"
#include "LanaiCondCode.h"
+#include "LanaiSubtarget.h"
#include "MCTargetDesc/LanaiBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
@@ -25,8 +26,8 @@ using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
#include "LanaiGenInstrInfo.inc"
-LanaiInstrInfo::LanaiInstrInfo()
- : LanaiGenInstrInfo(Lanai::ADJCALLSTACKDOWN, Lanai::ADJCALLSTACKUP),
+LanaiInstrInfo::LanaiInstrInfo(const LanaiSubtarget &STI)
+ : LanaiGenInstrInfo(STI, Lanai::ADJCALLSTACKDOWN, Lanai::ADJCALLSTACKUP),
RegisterInfo() {}
void LanaiInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.h b/llvm/lib/Target/Lanai/LanaiInstrInfo.h
index 07b1e87dc8b2..d98276243dc3 100644
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.h
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.h
@@ -22,11 +22,13 @@
namespace llvm {
+class LanaiSubtarget;
+
class LanaiInstrInfo : public LanaiGenInstrInfo {
const LanaiRegisterInfo RegisterInfo;
public:
- LanaiInstrInfo();
+ LanaiInstrInfo(const LanaiSubtarget &STI);
// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
// such, whenever a client has an instance of instruction info, it should
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.td b/llvm/lib/Target/Lanai/LanaiInstrInfo.td
index 1d968fa391c2..e0cd79ca22ff 100644
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.td
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.td
@@ -212,7 +212,6 @@ def MemImmAsmOperand : AsmOperandClass {
let ParserMethod = "parseMemoryOperand";
}
def MEMi : Operand<i32> {
- let MIOperandInfo = (ops i32lo21:$offset);
let ParserMatchClass = MemImmAsmOperand;
let PrintMethod = "printMemImmOperand";
}
@@ -402,7 +401,7 @@ def : Pat<(LanaiSubbF GPR:$Rs1, i32lo16z:$imm),
def : Pat<(LanaiSubbF GPR:$Rs1, i32hi16:$imm),
(SUBB_F_I_HI GPR:$Rs1, i32hi16:$imm)>;
-def : InstAlias<"mov $src, $dst", (ADD_R GPR:$dst, GPR:$src, R0, 0)>;
+def : InstAlias<"mov $src, $dst", (ADD_R GPR:$dst, GPR:$src, R0, (pred 0))>;
let isAsCheapAsAMove = 1, Rs1 = R0.Num, isCodeGenOnly = 1, H = 1, F = 0,
isReMaterializable = 1 in
diff --git a/llvm/lib/Target/Lanai/LanaiSubtarget.cpp b/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
index 24aa8553279f..f99e88373edf 100644
--- a/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
+++ b/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
@@ -40,5 +40,5 @@ LanaiSubtarget::LanaiSubtarget(const Triple &TargetTriple, StringRef Cpu,
CodeModel::Model /*CodeModel*/,
CodeGenOptLevel /*OptLevel*/)
: LanaiGenSubtargetInfo(TargetTriple, Cpu, /*TuneCPU*/ Cpu, FeatureString),
- FrameLowering(initializeSubtargetDependencies(Cpu, FeatureString)),
- TLInfo(TM, *this) {}
+ InstrInfo(initializeSubtargetDependencies(Cpu, FeatureString)),
+ FrameLowering(*this), TLInfo(TM, *this) {}
diff --git a/llvm/lib/Target/Lanai/LanaiSubtarget.h b/llvm/lib/Target/Lanai/LanaiSubtarget.h
index 0a229063ab7b..233c89e881d5 100644
--- a/llvm/lib/Target/Lanai/LanaiSubtarget.h
+++ b/llvm/lib/Target/Lanai/LanaiSubtarget.h
@@ -64,8 +64,8 @@ public:
}
private:
- LanaiFrameLowering FrameLowering;
LanaiInstrInfo InstrInfo;
+ LanaiFrameLowering FrameLowering;
LanaiTargetLowering TLInfo;
LanaiSelectionDAGInfo TSInfo;
};
diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td
index 39948b31fb9b..6497ff999f6f 100644
--- a/llvm/lib/Target/LoongArch/LoongArch.td
+++ b/llvm/lib/Target/LoongArch/LoongArch.td
@@ -39,7 +39,7 @@ def IsLA32
"LA32 Basic Integer and Privilege Instruction Set">;
defvar LA32 = DefaultMode;
-def LA64 : HwMode<"+64bit", [IsLA64]>;
+def LA64 : HwMode<[IsLA64]>;
// Single Precision floating point
def FeatureBasicF
diff --git a/llvm/lib/Target/LoongArch/LoongArchCallingConv.td b/llvm/lib/Target/LoongArch/LoongArchCallingConv.td
index 9844163163a5..7dcf65ce2b82 100644
--- a/llvm/lib/Target/LoongArch/LoongArchCallingConv.td
+++ b/llvm/lib/Target/LoongArch/LoongArchCallingConv.td
@@ -21,3 +21,7 @@ def CSR_ILP32D_LP64D
// Needed for implementation of LoongArchRegisterInfo::getNoPreservedMask()
def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
+def CSR_MostRegs : CalleeSavedRegs<(add CSR_ILP32S_LP64S,
+ (sequence "R%u", 4, 11),
+ (sequence "R%u", 16, 19))>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
index 36c3011be2b9..c45975431d83 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
@@ -10,6 +10,9 @@
//
//===----------------------------------------------------------------------===//
+def NotBoolXor : PatFrags<(ops node:$val),
+ [(xor node:$val, -1), (xor node:$val, 1)]>;
+
//===----------------------------------------------------------------------===//
// LoongArch specific DAG Nodes.
//===----------------------------------------------------------------------===//
@@ -22,6 +25,9 @@ def SDT_LoongArchFTINT : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>;
def SDT_LoongArchFRECIPE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>;
def SDT_LoongArchFRSQRTE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>;
+// ISD::BRCOND is custom-lowered to LoongArchISD::BRCOND for floating-point
+// comparisons to prevent recursive lowering.
+def loongarch_brcond : SDNode<"LoongArchISD::BRCOND", SDTBrcond, [SDNPHasChain]>;
def loongarch_movgr2fr_w_la64
: SDNode<"LoongArchISD::MOVGR2FR_W_LA64", SDT_LoongArchMOVGR2FR_W_LA64>;
def loongarch_movfr2gr_s_la64
@@ -208,16 +214,18 @@ def : PatFPSetcc<SETUO, FCMP_CUN_S, FPR32>;
def : PatFPSetcc<SETLT, FCMP_CLT_S, FPR32>;
multiclass PatFPBrcond<CondCode cc, LAInst CmpInst, RegisterClass RegTy> {
- def : Pat<(brcond (xor (GRLenVT (setcc RegTy:$fj, RegTy:$fk, cc)), -1),
- bb:$imm21),
+ def : Pat<(loongarch_brcond (NotBoolXor (GRLenVT (setcc RegTy:$fj, RegTy:$fk, cc))),
+ bb:$imm21),
(BCEQZ (CmpInst RegTy:$fj, RegTy:$fk), bb:$imm21)>;
- def : Pat<(brcond (GRLenVT (setcc RegTy:$fj, RegTy:$fk, cc)), bb:$imm21),
+ def : Pat<(loongarch_brcond (GRLenVT (setcc RegTy:$fj, RegTy:$fk, cc)), bb:$imm21),
(BCNEZ (CmpInst RegTy:$fj, RegTy:$fk), bb:$imm21)>;
}
defm : PatFPBrcond<SETOEQ, FCMP_CEQ_S, FPR32>;
+defm : PatFPBrcond<SETEQ , FCMP_CEQ_S, FPR32>;
defm : PatFPBrcond<SETOLT, FCMP_CLT_S, FPR32>;
defm : PatFPBrcond<SETOLE, FCMP_CLE_S, FPR32>;
+defm : PatFPBrcond<SETLE, FCMP_CLE_S, FPR32>;
defm : PatFPBrcond<SETONE, FCMP_CNE_S, FPR32>;
defm : PatFPBrcond<SETO, FCMP_COR_S, FPR32>;
defm : PatFPBrcond<SETUEQ, FCMP_CUEQ_S, FPR32>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
index 616640152c8d..965ad8a0a35c 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
@@ -184,8 +184,10 @@ def : PatFPSetcc<SETUO, FCMP_CUN_D, FPR64>;
def : PatFPSetcc<SETLT, FCMP_CLT_D, FPR64>;
defm : PatFPBrcond<SETOEQ, FCMP_CEQ_D, FPR64>;
+defm : PatFPBrcond<SETEQ, FCMP_CEQ_D, FPR64>;
defm : PatFPBrcond<SETOLT, FCMP_CLT_D, FPR64>;
defm : PatFPBrcond<SETOLE, FCMP_CLE_D, FPR64>;
+defm : PatFPBrcond<SETLE, FCMP_CLE_D, FPR64>;
defm : PatFPBrcond<SETONE, FCMP_CNE_D, FPR64>;
defm : PatFPBrcond<SETO, FCMP_COR_D, FPR64>;
defm : PatFPBrcond<SETUEQ, FCMP_CUEQ_D, FPR64>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp
index 71d0263fe376..07e722b9a659 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp
@@ -114,7 +114,7 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) {
unsigned SplatBitSize;
bool HasAnyUndefs;
unsigned Op;
- EVT ViaVecTy;
+ EVT ResTy = BVN->getValueType(0);
bool Is128Vec = BVN->getValueType(0).is128BitVector();
bool Is256Vec = BVN->getValueType(0).is256BitVector();
@@ -129,28 +129,25 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) {
break;
case 8:
Op = Is256Vec ? LoongArch::PseudoXVREPLI_B : LoongArch::PseudoVREPLI_B;
- ViaVecTy = Is256Vec ? MVT::v32i8 : MVT::v16i8;
break;
case 16:
Op = Is256Vec ? LoongArch::PseudoXVREPLI_H : LoongArch::PseudoVREPLI_H;
- ViaVecTy = Is256Vec ? MVT::v16i16 : MVT::v8i16;
break;
case 32:
Op = Is256Vec ? LoongArch::PseudoXVREPLI_W : LoongArch::PseudoVREPLI_W;
- ViaVecTy = Is256Vec ? MVT::v8i32 : MVT::v4i32;
break;
case 64:
Op = Is256Vec ? LoongArch::PseudoXVREPLI_D : LoongArch::PseudoVREPLI_D;
- ViaVecTy = Is256Vec ? MVT::v4i64 : MVT::v2i64;
break;
}
SDNode *Res;
// If we have a signed 10 bit integer, we can splat it directly.
if (SplatValue.isSignedIntN(10)) {
- SDValue Imm = CurDAG->getTargetConstant(SplatValue, DL,
- ViaVecTy.getVectorElementType());
- Res = CurDAG->getMachineNode(Op, DL, ViaVecTy, Imm);
+ EVT EleType = ResTy.getVectorElementType();
+ APInt Val = SplatValue.sextOrTrunc(EleType.getSizeInBits());
+ SDValue Imm = CurDAG->getTargetConstant(Val, DL, EleType);
+ Res = CurDAG->getMachineNode(Op, DL, ResTy, Imm);
ReplaceNode(Node, Res);
return;
}
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 5b2d185594f4..634914d3b3fd 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -127,6 +127,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::BR_CC, GRLenVT, Expand);
+ setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::SELECT_CC, GRLenVT, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, GRLenVT, Expand);
@@ -340,6 +341,14 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
{MVT::v16i8, MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v8i16, MVT::v4i16,
MVT::v2i16, MVT::v4i32, MVT::v2i32, MVT::v2i64}) {
setOperationAction(ISD::TRUNCATE, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
}
}
@@ -377,6 +386,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::ABDS, VT, Legal);
setOperationAction(ISD::ABDU, VT, Legal);
+ setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
}
for (MVT VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32})
setOperationAction(ISD::BITREVERSE, VT, Custom);
@@ -413,6 +423,11 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::BITCAST);
}
+ // Set DAG combine for 'LASX' feature.
+
+ if (Subtarget.hasExtLASX())
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+
// Compute derived properties from the register classes.
computeRegisterProperties(Subtarget.getRegisterInfo());
@@ -514,6 +529,8 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
return lowerPREFETCH(Op, DAG);
case ISD::SELECT:
return lowerSELECT(Op, DAG);
+ case ISD::BRCOND:
+ return lowerBRCOND(Op, DAG);
case ISD::FP_TO_FP16:
return lowerFP_TO_FP16(Op, DAG);
case ISD::FP16_TO_FP:
@@ -522,10 +539,109 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
return lowerFP_TO_BF16(Op, DAG);
case ISD::BF16_TO_FP:
return lowerBF16_TO_FP(Op, DAG);
+ case ISD::VECREDUCE_ADD:
+ return lowerVECREDUCE_ADD(Op, DAG);
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ return lowerVECREDUCE(Op, DAG);
}
return SDValue();
}
+// Lower vecreduce_add using vhaddw instructions.
+// For Example:
+// call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
+// can be lowered to:
+// VHADDW_D_W vr0, vr0, vr0
+// VHADDW_Q_D vr0, vr0, vr0
+// VPICKVE2GR_D a0, vr0, 0
+// ADDI_W a0, a0, 0
+SDValue LoongArchTargetLowering::lowerVECREDUCE_ADD(SDValue Op,
+ SelectionDAG &DAG) const {
+
+ SDLoc DL(Op);
+ MVT OpVT = Op.getSimpleValueType();
+ SDValue Val = Op.getOperand(0);
+
+ unsigned NumEles = Val.getSimpleValueType().getVectorNumElements();
+ unsigned EleBits = Val.getSimpleValueType().getScalarSizeInBits();
+
+ unsigned LegalVecSize = 128;
+ bool isLASX256Vector =
+ Subtarget.hasExtLASX() && Val.getValueSizeInBits() == 256;
+
+ // Ensure the operand type is legal, widening the vector until it is.
+ while (!isTypeLegal(Val.getSimpleValueType())) {
+ Val = DAG.WidenVector(Val, DL);
+ }
+
+ // NumEles determines the iteration count; v4i32 for LSX
+ // and v8i32 for LASX should have the same count.
+ if (isLASX256Vector) {
+ NumEles /= 2;
+ LegalVecSize = 256;
+ }
+
+ for (unsigned i = 1; i < NumEles; i *= 2, EleBits *= 2) {
+ MVT IntTy = MVT::getIntegerVT(EleBits);
+ MVT VecTy = MVT::getVectorVT(IntTy, LegalVecSize / EleBits);
+ Val = DAG.getNode(LoongArchISD::VHADDW, DL, VecTy, Val, Val);
+ }
+
+ if (isLASX256Vector) {
+ SDValue Tmp = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, Val,
+ DAG.getConstant(2, DL, MVT::i64));
+ Val = DAG.getNode(ISD::ADD, DL, MVT::v4i64, Tmp, Val);
+ }
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Val,
+ DAG.getConstant(0, DL, Subtarget.getGRLenVT()));
+}
+
+// Lower vecreduce_and/or/xor/[s/u]max/[s/u]min.
+// For Example:
+// call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
+// can be lowered to:
+// VBSRL_V vr1, vr0, 8
+// VMAX_W vr0, vr1, vr0
+// VBSRL_V vr1, vr0, 4
+// VMAX_W vr0, vr1, vr0
+// VPICKVE2GR_W a0, vr0, 0
+// For a 256-bit vector, the type is illegal and will be split into
+// two 128-bit vectors by default, which are then processed by this.
+SDValue LoongArchTargetLowering::lowerVECREDUCE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+
+ MVT OpVT = Op.getSimpleValueType();
+ SDValue Val = Op.getOperand(0);
+
+ unsigned NumEles = Val.getSimpleValueType().getVectorNumElements();
+ unsigned EleBits = Val.getSimpleValueType().getScalarSizeInBits();
+
+ // Ensure the operand type is legal, widening the vector until it is.
+ while (!isTypeLegal(Val.getSimpleValueType())) {
+ Val = DAG.WidenVector(Val, DL);
+ }
+
+ unsigned Opcode = ISD::getVecReduceBaseOpcode(Op.getOpcode());
+ MVT VecTy = Val.getSimpleValueType();
+
+ for (int i = NumEles; i > 1; i /= 2) {
+ SDValue ShiftAmt = DAG.getConstant(i * EleBits / 16, DL, MVT::i64);
+ SDValue Tmp = DAG.getNode(LoongArchISD::VBSRL, DL, VecTy, Val, ShiftAmt);
+ Val = DAG.getNode(Opcode, DL, VecTy, Tmp, Val);
+ }
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Val,
+ DAG.getConstant(0, DL, Subtarget.getGRLenVT()));
+}
+
SDValue LoongArchTargetLowering::lowerPREFETCH(SDValue Op,
SelectionDAG &DAG) const {
unsigned IsData = Op.getConstantOperandVal(4);
@@ -859,6 +975,35 @@ SDValue LoongArchTargetLowering::lowerSELECT(SDValue Op,
return DAG.getNode(LoongArchISD::SELECT_CC, DL, VT, Ops);
}
+SDValue LoongArchTargetLowering::lowerBRCOND(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue CondV = Op.getOperand(1);
+ SDLoc DL(Op);
+ MVT GRLenVT = Subtarget.getGRLenVT();
+
+ if (CondV.getOpcode() == ISD::SETCC) {
+ if (CondV.getOperand(0).getValueType() == GRLenVT) {
+ SDValue LHS = CondV.getOperand(0);
+ SDValue RHS = CondV.getOperand(1);
+ ISD::CondCode CCVal = cast<CondCodeSDNode>(CondV.getOperand(2))->get();
+
+ translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG);
+
+ SDValue TargetCC = DAG.getCondCode(CCVal);
+ return DAG.getNode(LoongArchISD::BR_CC, DL, Op.getValueType(),
+ Op.getOperand(0), LHS, RHS, TargetCC,
+ Op.getOperand(2));
+ } else if (CondV.getOperand(0).getValueType().isFloatingPoint()) {
+ return DAG.getNode(LoongArchISD::BRCOND, DL, Op.getValueType(),
+ Op.getOperand(0), CondV, Op.getOperand(2));
+ }
+ }
+
+ return DAG.getNode(LoongArchISD::BR_CC, DL, Op.getValueType(),
+ Op.getOperand(0), CondV, DAG.getConstant(0, DL, GRLenVT),
+ DAG.getCondCode(ISD::SETNE), Op.getOperand(2));
+}
+
SDValue
LoongArchTargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
@@ -1031,6 +1176,7 @@ static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
static SDValue lowerVECTOR_SHUFFLEAsShift(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
SelectionDAG &DAG,
+ const LoongArchSubtarget &Subtarget,
const APInt &Zeroable) {
int Size = Mask.size();
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
@@ -1057,7 +1203,7 @@ static SDValue lowerVECTOR_SHUFFLEAsShift(const SDLoc &DL, ArrayRef<int> Mask,
"Illegal integer vector type");
V = DAG.getBitcast(ShiftVT, V);
V = DAG.getNode(Opcode, DL, ShiftVT, V,
- DAG.getConstant(ShiftAmt, DL, MVT::i64));
+ DAG.getConstant(ShiftAmt, DL, Subtarget.getGRLenVT()));
return DAG.getBitcast(VT, V);
}
@@ -1226,10 +1372,10 @@ static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
/// (VBSRL_V $v1, $v1, 8)
/// (VBSLL_V $v0, $v0, 8)
/// (VOR_V $v0, $V0, $v1)
-static SDValue lowerVECTOR_SHUFFLEAsByteRotate(const SDLoc &DL,
- ArrayRef<int> Mask, MVT VT,
- SDValue V1, SDValue V2,
- SelectionDAG &DAG) {
+static SDValue
+lowerVECTOR_SHUFFLEAsByteRotate(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+ SDValue V1, SDValue V2, SelectionDAG &DAG,
+ const LoongArchSubtarget &Subtarget) {
SDValue Lo = V1, Hi = V2;
int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
@@ -1242,11 +1388,12 @@ static SDValue lowerVECTOR_SHUFFLEAsByteRotate(const SDLoc &DL,
int LoByteShift = 16 - ByteRotation;
int HiByteShift = ByteRotation;
+ MVT GRLenVT = Subtarget.getGRLenVT();
SDValue LoShift = DAG.getNode(LoongArchISD::VBSLL, DL, ByteVT, Lo,
- DAG.getConstant(LoByteShift, DL, MVT::i64));
+ DAG.getConstant(LoByteShift, DL, GRLenVT));
SDValue HiShift = DAG.getNode(LoongArchISD::VBSRL, DL, ByteVT, Hi,
- DAG.getConstant(HiByteShift, DL, MVT::i64));
+ DAG.getConstant(HiByteShift, DL, GRLenVT));
return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, ByteVT, LoShift, HiShift));
}
@@ -1351,9 +1498,10 @@ static SDValue lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(const SDLoc &DL,
///
/// When undef's appear in the mask they are treated as if they were whatever
/// value is necessary in order to fit the above form.
-static SDValue lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- SelectionDAG &DAG) {
+static SDValue
+lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+ SDValue V1, SDValue V2, SelectionDAG &DAG,
+ const LoongArchSubtarget &Subtarget) {
int SplatIndex = -1;
for (const auto &M : Mask) {
if (M != -1) {
@@ -1369,7 +1517,7 @@ static SDValue lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask,
if (fitsRegularPattern<int>(Mask.begin(), 1, Mask.end(), SplatIndex, 0)) {
APInt Imm(64, SplatIndex);
return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1,
- DAG.getConstant(Imm, DL, MVT::i64));
+ DAG.getConstant(Imm, DL, Subtarget.getGRLenVT()));
}
return SDValue();
@@ -1393,9 +1541,10 @@ static SDValue lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask,
/// (VSHUF4I_H $v0, $v1, 27)
/// where the 27 comes from:
/// 3 + (2 << 2) + (1 << 4) + (0 << 6)
-static SDValue lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- SelectionDAG &DAG) {
+static SDValue
+lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+ SDValue V1, SDValue V2, SelectionDAG &DAG,
+ const LoongArchSubtarget &Subtarget) {
unsigned SubVecSize = 4;
if (VT == MVT::v2f64 || VT == MVT::v2i64)
@@ -1437,13 +1586,15 @@ static SDValue lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask,
Imm |= M & 0x3;
}
+ MVT GRLenVT = Subtarget.getGRLenVT();
+
// Return vshuf4i.d
if (VT == MVT::v2f64 || VT == MVT::v2i64)
return DAG.getNode(LoongArchISD::VSHUF4I, DL, VT, V1, V2,
- DAG.getConstant(Imm, DL, MVT::i64));
+ DAG.getConstant(Imm, DL, GRLenVT));
return DAG.getNode(LoongArchISD::VSHUF4I, DL, VT, V1,
- DAG.getConstant(Imm, DL, MVT::i64));
+ DAG.getConstant(Imm, DL, GRLenVT));
}
/// Lower VECTOR_SHUFFLE into VPACKEV (if possible).
@@ -1723,7 +1874,8 @@ static SDValue lowerVECTOR_SHUFFLE_VSHUF(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
- SDValue V1, SDValue V2, SelectionDAG &DAG) {
+ SDValue V1, SDValue V2, SelectionDAG &DAG,
+ const LoongArchSubtarget &Subtarget) {
assert((VT.SimpleTy == MVT::v16i8 || VT.SimpleTy == MVT::v8i16 ||
VT.SimpleTy == MVT::v4i32 || VT.SimpleTy == MVT::v2i64 ||
VT.SimpleTy == MVT::v4f32 || VT.SimpleTy == MVT::v2f64) &&
@@ -1741,9 +1893,11 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
SDValue Result;
// TODO: Add more comparison patterns.
if (V2.isUndef()) {
- if ((Result = lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, V2, DAG)))
+ if ((Result = lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, V2, DAG,
+ Subtarget)))
return Result;
- if ((Result = lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG)))
+ if ((Result =
+ lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget)))
return Result;
// TODO: This comment may be enabled in the future to better match the
@@ -1766,15 +1920,17 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
if ((Result = lowerVECTOR_SHUFFLE_VPICKOD(DL, Mask, VT, V1, V2, DAG)))
return Result;
if ((VT.SimpleTy == MVT::v2i64 || VT.SimpleTy == MVT::v2f64) &&
- (Result = lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG)))
+ (Result =
+ lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget)))
return Result;
if ((Result = lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(DL, Mask, VT, V1, V2, DAG,
Zeroable)))
return Result;
- if ((Result =
- lowerVECTOR_SHUFFLEAsShift(DL, Mask, VT, V1, V2, DAG, Zeroable)))
+ if ((Result = lowerVECTOR_SHUFFLEAsShift(DL, Mask, VT, V1, V2, DAG, Subtarget,
+ Zeroable)))
return Result;
- if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, Mask, VT, V1, V2, DAG)))
+ if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, Mask, VT, V1, V2, DAG,
+ Subtarget)))
return Result;
if (SDValue NewShuffle = widenShuffleMask(DL, Mask, VT, V1, V2, DAG))
return NewShuffle;
@@ -1791,10 +1947,10 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
///
/// When undef's appear in the mask they are treated as if they were whatever
/// value is necessary in order to fit the above form.
-static SDValue lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL,
- ArrayRef<int> Mask, MVT VT,
- SDValue V1, SDValue V2,
- SelectionDAG &DAG) {
+static SDValue
+lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+ SDValue V1, SDValue V2, SelectionDAG &DAG,
+ const LoongArchSubtarget &Subtarget) {
int SplatIndex = -1;
for (const auto &M : Mask) {
if (M != -1) {
@@ -1816,21 +1972,64 @@ static SDValue lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL,
0)) {
APInt Imm(64, SplatIndex);
return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1,
- DAG.getConstant(Imm, DL, MVT::i64));
+ DAG.getConstant(Imm, DL, Subtarget.getGRLenVT()));
}
return SDValue();
}
/// Lower VECTOR_SHUFFLE into XVSHUF4I (if possible).
-static SDValue lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- SelectionDAG &DAG) {
+static SDValue
+lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+ SDValue V1, SDValue V2, SelectionDAG &DAG,
+ const LoongArchSubtarget &Subtarget) {
// When the size is less than or equal to 4, lower cost instructions may be
// used.
if (Mask.size() <= 4)
return SDValue();
- return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG);
+ return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget);
+}
+
+/// Lower VECTOR_SHUFFLE into XVPERM (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVPERM(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ SelectionDAG &DAG) {
+ // LoongArch LASX only has XVPERM_W.
+ if (Mask.size() != 8 || (VT != MVT::v8i32 && VT != MVT::v8f32))
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfSize = NumElts / 2;
+ bool FrontLo = true, FrontHi = true;
+ bool BackLo = true, BackHi = true;
+
+ auto inRange = [](int val, int low, int high) {
+ return (val == -1) || (val >= low && val < high);
+ };
+
+ for (unsigned i = 0; i < HalfSize; ++i) {
+ int Fronti = Mask[i];
+ int Backi = Mask[i + HalfSize];
+
+ FrontLo &= inRange(Fronti, 0, HalfSize);
+ FrontHi &= inRange(Fronti, HalfSize, NumElts);
+ BackLo &= inRange(Backi, 0, HalfSize);
+ BackHi &= inRange(Backi, HalfSize, NumElts);
+ }
+
+ // If both the lower and upper 128-bit parts access only one half of the
+ // vector (either lower or upper), avoid using xvperm.w. The latency of
+ // xvperm.w(3) is higher than using xvshuf(1) and xvori(1).
+ if ((FrontLo || FrontHi) && (BackLo || BackHi))
+ return SDValue();
+
+ SmallVector<SDValue, 8> Masks;
+ for (unsigned i = 0; i < NumElts; ++i)
+ Masks.push_back(Mask[i] == -1 ? DAG.getUNDEF(MVT::i64)
+ : DAG.getConstant(Mask[i], DL, MVT::i64));
+ SDValue MaskVec = DAG.getBuildVector(MVT::v8i32, DL, Masks);
+
+ return DAG.getNode(LoongArchISD::XVPERM, DL, VT, V1, MaskVec);
}
/// Lower VECTOR_SHUFFLE into XVPACKEV (if possible).
@@ -2060,15 +2259,15 @@ static SDValue lowerVECTOR_SHUFFLE_XVSHUF(const SDLoc &DL, ArrayRef<int> Mask,
/// cases need to be converted to it for processing.
///
/// This function may modify V1, V2 and Mask
-static void canonicalizeShuffleVectorByLane(const SDLoc &DL,
- MutableArrayRef<int> Mask, MVT VT,
- SDValue &V1, SDValue &V2,
- SelectionDAG &DAG) {
+static void canonicalizeShuffleVectorByLane(
+ const SDLoc &DL, MutableArrayRef<int> Mask, MVT VT, SDValue &V1,
+ SDValue &V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget) {
enum HalfMaskType { HighLaneTy, LowLaneTy, None };
int MaskSize = Mask.size();
int HalfSize = Mask.size() / 2;
+ MVT GRLenVT = Subtarget.getGRLenVT();
HalfMaskType preMask = None, postMask = None;
@@ -2106,13 +2305,13 @@ static void canonicalizeShuffleVectorByLane(const SDLoc &DL,
if (preMask == LowLaneTy && postMask == HighLaneTy) {
V1 = DAG.getBitcast(MVT::v4i64, V1);
V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1,
- DAG.getConstant(0b01001110, DL, MVT::i64));
+ DAG.getConstant(0b01001110, DL, GRLenVT));
V1 = DAG.getBitcast(VT, V1);
if (!V2.isUndef()) {
V2 = DAG.getBitcast(MVT::v4i64, V2);
V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2,
- DAG.getConstant(0b01001110, DL, MVT::i64));
+ DAG.getConstant(0b01001110, DL, GRLenVT));
V2 = DAG.getBitcast(VT, V2);
}
@@ -2125,13 +2324,13 @@ static void canonicalizeShuffleVectorByLane(const SDLoc &DL,
} else if (preMask == LowLaneTy && postMask == LowLaneTy) {
V1 = DAG.getBitcast(MVT::v4i64, V1);
V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1,
- DAG.getConstant(0b11101110, DL, MVT::i64));
+ DAG.getConstant(0b11101110, DL, GRLenVT));
V1 = DAG.getBitcast(VT, V1);
if (!V2.isUndef()) {
V2 = DAG.getBitcast(MVT::v4i64, V2);
V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2,
- DAG.getConstant(0b11101110, DL, MVT::i64));
+ DAG.getConstant(0b11101110, DL, GRLenVT));
V2 = DAG.getBitcast(VT, V2);
}
@@ -2141,13 +2340,13 @@ static void canonicalizeShuffleVectorByLane(const SDLoc &DL,
} else if (preMask == HighLaneTy && postMask == HighLaneTy) {
V1 = DAG.getBitcast(MVT::v4i64, V1);
V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1,
- DAG.getConstant(0b01000100, DL, MVT::i64));
+ DAG.getConstant(0b01000100, DL, GRLenVT));
V1 = DAG.getBitcast(VT, V1);
if (!V2.isUndef()) {
V2 = DAG.getBitcast(MVT::v4i64, V2);
V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2,
- DAG.getConstant(0b01000100, DL, MVT::i64));
+ DAG.getConstant(0b01000100, DL, GRLenVT));
V2 = DAG.getBitcast(VT, V2);
}
@@ -2209,7 +2408,8 @@ static SDValue lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(const SDLoc &DL,
/// This routine breaks down the specific type of 256-bit shuffle and
/// dispatches to the lowering routines accordingly.
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
- SDValue V1, SDValue V2, SelectionDAG &DAG) {
+ SDValue V1, SDValue V2, SelectionDAG &DAG,
+ const LoongArchSubtarget &Subtarget) {
assert((VT.SimpleTy == MVT::v32i8 || VT.SimpleTy == MVT::v16i16 ||
VT.SimpleTy == MVT::v8i32 || VT.SimpleTy == MVT::v4i64 ||
VT.SimpleTy == MVT::v8f32 || VT.SimpleTy == MVT::v4f64) &&
@@ -2223,7 +2423,7 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
// canonicalize non cross-lane shuffle vector
SmallVector<int> NewMask(Mask);
- canonicalizeShuffleVectorByLane(DL, NewMask, VT, V1, V2, DAG);
+ canonicalizeShuffleVectorByLane(DL, NewMask, VT, V1, V2, DAG, Subtarget);
APInt KnownUndef, KnownZero;
computeZeroableShuffleElements(NewMask, V1, V2, KnownUndef, KnownZero);
@@ -2232,9 +2432,13 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
SDValue Result;
// TODO: Add more comparison patterns.
if (V2.isUndef()) {
- if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, NewMask, VT, V1, V2, DAG)))
+ if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, NewMask, VT, V1, V2, DAG,
+ Subtarget)))
+ return Result;
+ if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG,
+ Subtarget)))
return Result;
- if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG)))
+ if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, NewMask, VT, V1, V2, DAG)))
return Result;
if ((Result = lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(DL, NewMask, VT,
V1, V2, DAG)))
@@ -2259,10 +2463,11 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
return Result;
if ((Result = lowerVECTOR_SHUFFLE_XVPICKOD(DL, NewMask, VT, V1, V2, DAG)))
return Result;
- if ((Result =
- lowerVECTOR_SHUFFLEAsShift(DL, NewMask, VT, V1, V2, DAG, Zeroable)))
+ if ((Result = lowerVECTOR_SHUFFLEAsShift(DL, NewMask, VT, V1, V2, DAG,
+ Subtarget, Zeroable)))
return Result;
- if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, NewMask, VT, V1, V2, DAG)))
+ if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, NewMask, VT, V1, V2, DAG,
+ Subtarget)))
return Result;
if (SDValue NewShuffle = widenShuffleMask(DL, NewMask, VT, V1, V2, DAG))
return NewShuffle;
@@ -2314,10 +2519,10 @@ SDValue LoongArchTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
// For each vector width, delegate to a specialized lowering routine.
if (VT.is128BitVector())
- return lower128BitShuffle(DL, OrigMask, VT, V1, V2, DAG);
+ return lower128BitShuffle(DL, OrigMask, VT, V1, V2, DAG, Subtarget);
if (VT.is256BitVector())
- return lower256BitShuffle(DL, OrigMask, VT, V1, V2, DAG);
+ return lower256BitShuffle(DL, OrigMask, VT, V1, V2, DAG, Subtarget);
return SDValue();
}
@@ -2414,11 +2619,14 @@ static SDValue lowerBUILD_VECTORAsBroadCastLoad(BuildVectorSDNode *BVOp,
}
// make sure that this load is valid and only has one user.
- if (!IdentitySrc || !BVOp->isOnlyUserOf(IdentitySrc.getNode()))
+ if (!IsIdeneity || !IdentitySrc || !BVOp->isOnlyUserOf(IdentitySrc.getNode()))
return SDValue();
- if (IsIdeneity) {
- auto *LN = cast<LoadSDNode>(IdentitySrc);
+ auto *LN = cast<LoadSDNode>(IdentitySrc);
+ auto ExtType = LN->getExtensionType();
+
+ if ((ExtType == ISD::EXTLOAD || ExtType == ISD::NON_EXTLOAD) &&
+ VT.getScalarSizeInBits() == LN->getMemoryVT().getScalarSizeInBits()) {
SDVTList Tys =
LN->isIndexed()
? DAG.getVTList(VT, LN->getBasePtr().getValueType(), MVT::Other)
@@ -2461,6 +2669,16 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
SplatBitSize != 64)
return SDValue();
+ if (SplatBitSize == 64 && !Subtarget.is64Bit()) {
+ // We can only handle 64-bit elements that are within
+ // the signed 32-bit range on 32-bit targets.
+ if (!SplatValue.isSignedIntN(32))
+ return SDValue();
+ if ((Is128Vec && ResTy == MVT::v4i32) ||
+ (Is256Vec && ResTy == MVT::v8i32))
+ return Op;
+ }
+
EVT ViaVecTy;
switch (SplatBitSize) {
@@ -2609,14 +2827,58 @@ SDValue LoongArchTargetLowering::lowerCONCAT_VECTORS(SDValue Op,
SDValue
LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
- EVT VecTy = Op->getOperand(0)->getValueType(0);
+ MVT EltVT = Op.getSimpleValueType();
+ SDValue Vec = Op->getOperand(0);
+ EVT VecTy = Vec->getValueType(0);
SDValue Idx = Op->getOperand(1);
- unsigned NumElts = VecTy.getVectorNumElements();
+ SDLoc DL(Op);
+ MVT GRLenVT = Subtarget.getGRLenVT();
+
+ assert(VecTy.is256BitVector() && "Unexpected EXTRACT_VECTOR_ELT vector type");
- if (isa<ConstantSDNode>(Idx) && Idx->getAsZExtVal() < NumElts)
+ if (isa<ConstantSDNode>(Idx))
return Op;
- return SDValue();
+ switch (VecTy.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("Unexpected type");
+ case MVT::v32i8:
+ case MVT::v16i16:
+ case MVT::v4i64:
+ case MVT::v4f64: {
+ // Extract the high half subvector and place it to the low half of a new
+ // vector. It doesn't matter what the high half of the new vector is.
+ EVT HalfTy = VecTy.getHalfNumVectorElementsVT(*DAG.getContext());
+ SDValue VecHi =
+ DAG.getExtractSubvector(DL, HalfTy, Vec, HalfTy.getVectorNumElements());
+ SDValue TmpVec =
+ DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecTy, DAG.getUNDEF(VecTy),
+ VecHi, DAG.getConstant(0, DL, GRLenVT));
+
+ // Shuffle the original Vec and the TmpVec using MaskVec. The lowest
+ // element of MaskVec is Idx; the rest do not matter. ResVec[0] will hold
+ // the desired element.
+ SDValue IdxCp =
+ DAG.getNode(LoongArchISD::MOVGR2FR_W_LA64, DL, MVT::f32, Idx);
+ SDValue IdxVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f32, IdxCp);
+ SDValue MaskVec =
+ DAG.getBitcast((VecTy == MVT::v4f64) ? MVT::v4i64 : VecTy, IdxVec);
+ SDValue ResVec =
+ DAG.getNode(LoongArchISD::VSHUF, DL, VecTy, MaskVec, TmpVec, Vec);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ResVec,
+ DAG.getConstant(0, DL, GRLenVT));
+ }
+ case MVT::v8i32:
+ case MVT::v8f32: {
+ SDValue SplatIdx = DAG.getSplatBuildVector(MVT::v8i32, DL, Idx);
+ SDValue SplatValue =
+ DAG.getNode(LoongArchISD::XVPERM, DL, VecTy, Vec, SplatIdx);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SplatValue,
+ DAG.getConstant(0, DL, GRLenVT));
+ }
+ }
}
SDValue
@@ -4740,13 +5002,29 @@ static SDValue performBITCASTCombine(SDNode *N, SelectionDAG &DAG,
UseLASX = true;
break;
};
- if (UseLASX && !(Subtarget.has32S() && Subtarget.hasExtLASX()))
- return SDValue();
Src = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
: DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
- Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
- SDValue V = DAG.getNode(Opc, DL, MVT::i64, Src);
+ SDValue V;
+ if (!Subtarget.has32S() || !Subtarget.hasExtLASX()) {
+ if (Src.getSimpleValueType() == MVT::v32i8) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Src, DL);
+ Lo = DAG.getNode(LoongArchISD::VMSKLTZ, DL, MVT::i64, Lo);
+ Hi = DAG.getNode(LoongArchISD::VMSKLTZ, DL, MVT::i64, Hi);
+ Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
+ DAG.getConstant(16, DL, MVT::i8));
+ V = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
+ } else if (UseLASX) {
+ return SDValue();
+ }
+ }
+
+ if (!V) {
+ Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
+ V = DAG.getNode(Opc, DL, MVT::i64, Src);
+ }
+
EVT T = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
V = DAG.getZExtOrTrunc(V, DL, T);
return DAG.getBitcast(VT, V);
@@ -5154,6 +5432,145 @@ static SDValue performBITREV_WCombine(SDNode *N, SelectionDAG &DAG,
Src.getOperand(0));
}
+// Perform common combines for BR_CC and SELECT_CC conditions.
+static bool combine_CC(SDValue &LHS, SDValue &RHS, SDValue &CC, const SDLoc &DL,
+ SelectionDAG &DAG, const LoongArchSubtarget &Subtarget) {
+ ISD::CondCode CCVal = cast<CondCodeSDNode>(CC)->get();
+
+ // Since an arithmetic right shift always preserves the sign,
+ // the shift can be omitted.
+ // Fold setlt (sra X, N), 0 -> setlt X, 0 and
+ // setge (sra X, N), 0 -> setge X, 0
+ if (isNullConstant(RHS) && (CCVal == ISD::SETGE || CCVal == ISD::SETLT) &&
+ LHS.getOpcode() == ISD::SRA) {
+ LHS = LHS.getOperand(0);
+ return true;
+ }
+
+ if (!ISD::isIntEqualitySetCC(CCVal))
+ return false;
+
+ // Fold ((setlt X, Y), 0, ne) -> (X, Y, lt)
+ // Sometimes the setcc is introduced after br_cc/select_cc has been formed.
+ if (LHS.getOpcode() == ISD::SETCC && isNullConstant(RHS) &&
+ LHS.getOperand(0).getValueType() == Subtarget.getGRLenVT()) {
+ // If we're looking for eq 0 instead of ne 0, we need to invert the
+ // condition.
+ bool Invert = CCVal == ISD::SETEQ;
+ CCVal = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
+ if (Invert)
+ CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
+
+ RHS = LHS.getOperand(1);
+ LHS = LHS.getOperand(0);
+ translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG);
+
+ CC = DAG.getCondCode(CCVal);
+ return true;
+ }
+
+ // Fold ((srl (and X, 1<<C), C), 0, eq/ne) -> ((shl X, GRLen-1-C), 0, ge/lt)
+ if (isNullConstant(RHS) && LHS.getOpcode() == ISD::SRL && LHS.hasOneUse() &&
+ LHS.getOperand(1).getOpcode() == ISD::Constant) {
+ SDValue LHS0 = LHS.getOperand(0);
+ if (LHS0.getOpcode() == ISD::AND &&
+ LHS0.getOperand(1).getOpcode() == ISD::Constant) {
+ uint64_t Mask = LHS0.getConstantOperandVal(1);
+ uint64_t ShAmt = LHS.getConstantOperandVal(1);
+ if (isPowerOf2_64(Mask) && Log2_64(Mask) == ShAmt) {
+ CCVal = CCVal == ISD::SETEQ ? ISD::SETGE : ISD::SETLT;
+ CC = DAG.getCondCode(CCVal);
+
+ ShAmt = LHS.getValueSizeInBits() - 1 - ShAmt;
+ LHS = LHS0.getOperand(0);
+ if (ShAmt != 0)
+ LHS =
+ DAG.getNode(ISD::SHL, DL, LHS.getValueType(), LHS0.getOperand(0),
+ DAG.getConstant(ShAmt, DL, LHS.getValueType()));
+ return true;
+ }
+ }
+ }
+
+ // (X, 1, setne) -> (X, 0, seteq) if we can prove X is 0/1.
+ // This can occur when legalizing some floating point comparisons.
+ APInt Mask = APInt::getBitsSetFrom(LHS.getValueSizeInBits(), 1);
+ if (isOneConstant(RHS) && DAG.MaskedValueIsZero(LHS, Mask)) {
+ CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
+ CC = DAG.getCondCode(CCVal);
+ RHS = DAG.getConstant(0, DL, LHS.getValueType());
+ return true;
+ }
+
+ return false;
+}
+
+static SDValue performBR_CCCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const LoongArchSubtarget &Subtarget) {
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ SDValue CC = N->getOperand(3);
+ SDLoc DL(N);
+
+ if (combine_CC(LHS, RHS, CC, DL, DAG, Subtarget))
+ return DAG.getNode(LoongArchISD::BR_CC, DL, N->getValueType(0),
+ N->getOperand(0), LHS, RHS, CC, N->getOperand(4));
+
+ return SDValue();
+}
+
+static SDValue performSELECT_CCCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const LoongArchSubtarget &Subtarget) {
+ // Transform
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue CC = N->getOperand(2);
+ ISD::CondCode CCVal = cast<CondCodeSDNode>(CC)->get();
+ SDValue TrueV = N->getOperand(3);
+ SDValue FalseV = N->getOperand(4);
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ // If the True and False values are the same, we don't need a select_cc.
+ if (TrueV == FalseV)
+ return TrueV;
+
+ // (select (x < 0), y, z) -> x >> (GRLEN - 1) & (y - z) + z
+ // (select (x >= 0), y, z) -> x >> (GRLEN - 1) & (z - y) + y
+ if (isa<ConstantSDNode>(TrueV) && isa<ConstantSDNode>(FalseV) &&
+ isNullConstant(RHS) &&
+ (CCVal == ISD::CondCode::SETLT || CCVal == ISD::CondCode::SETGE)) {
+ if (CCVal == ISD::CondCode::SETGE)
+ std::swap(TrueV, FalseV);
+
+ int64_t TrueSImm = cast<ConstantSDNode>(TrueV)->getSExtValue();
+ int64_t FalseSImm = cast<ConstantSDNode>(FalseV)->getSExtValue();
+ // Only handle simm12; if it is not in this range, it can be considered as
+ // a register.
+ if (isInt<12>(TrueSImm) && isInt<12>(FalseSImm) &&
+ isInt<12>(TrueSImm - FalseSImm)) {
+ SDValue SRA =
+ DAG.getNode(ISD::SRA, DL, VT, LHS,
+ DAG.getConstant(Subtarget.getGRLen() - 1, DL, VT));
+ SDValue AND =
+ DAG.getNode(ISD::AND, DL, VT, SRA,
+ DAG.getSignedConstant(TrueSImm - FalseSImm, DL, VT));
+ return DAG.getNode(ISD::ADD, DL, VT, AND, FalseV);
+ }
+
+ if (CCVal == ISD::CondCode::SETGE)
+ std::swap(TrueV, FalseV);
+ }
+
+ if (combine_CC(LHS, RHS, CC, DL, DAG, Subtarget))
+ return DAG.getNode(LoongArchISD::SELECT_CC, DL, N->getValueType(0),
+ {LHS, RHS, CC, TrueV, FalseV});
+
+ return SDValue();
+}
+
template <unsigned N>
static SDValue legalizeIntrinsicImmArg(SDNode *Node, unsigned ImmOp,
SelectionDAG &DAG,
@@ -5828,6 +6245,42 @@ performSPLIT_PAIR_F64Combine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue
+performEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const LoongArchSubtarget &Subtarget) {
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+
+ MVT EltVT = N->getSimpleValueType(0);
+ SDValue Vec = N->getOperand(0);
+ EVT VecTy = Vec->getValueType(0);
+ SDValue Idx = N->getOperand(1);
+ unsigned IdxOp = Idx.getOpcode();
+ SDLoc DL(N);
+
+ if (!VecTy.is256BitVector() || isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ // Combine:
+ // t2 = truncate t1
+ // t3 = {zero/sign/any}_extend t2
+ // t4 = extract_vector_elt t0, t3
+ // to:
+ // t4 = extract_vector_elt t0, t1
+ if (IdxOp == ISD::ZERO_EXTEND || IdxOp == ISD::SIGN_EXTEND ||
+ IdxOp == ISD::ANY_EXTEND) {
+ SDValue IdxOrig = Idx.getOperand(0);
+ if (!(IdxOrig.getOpcode() == ISD::TRUNCATE))
+ return SDValue();
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vec,
+ IdxOrig.getOperand(0));
+ }
+
+ return SDValue();
+}
+
SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -5846,6 +6299,10 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
return performBITCASTCombine(N, DAG, DCI, Subtarget);
case LoongArchISD::BITREV_W:
return performBITREV_WCombine(N, DAG, DCI, Subtarget);
+ case LoongArchISD::BR_CC:
+ return performBR_CCCombine(N, DAG, DCI, Subtarget);
+ case LoongArchISD::SELECT_CC:
+ return performSELECT_CCCombine(N, DAG, DCI, Subtarget);
case ISD::INTRINSIC_WO_CHAIN:
return performINTRINSIC_WO_CHAINCombine(N, DAG, DCI, Subtarget);
case LoongArchISD::MOVGR2FR_W_LA64:
@@ -5857,6 +6314,8 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
return performVMSKLTZCombine(N, DAG, DCI, Subtarget);
case LoongArchISD::SPLIT_PAIR_F64:
return performSPLIT_PAIR_F64Combine(N, DAG, DCI, Subtarget);
+ case ISD::EXTRACT_VECTOR_ELT:
+ return performEXTRACT_VECTOR_ELTCombine(N, DAG, DCI, Subtarget);
}
return SDValue();
}
@@ -6575,6 +7034,8 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(TAIL_MEDIUM)
NODE_NAME_CASE(TAIL_LARGE)
NODE_NAME_CASE(SELECT_CC)
+ NODE_NAME_CASE(BR_CC)
+ NODE_NAME_CASE(BRCOND)
NODE_NAME_CASE(SLL_W)
NODE_NAME_CASE(SRA_W)
NODE_NAME_CASE(SRL_W)
@@ -6637,6 +7098,7 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(VREPLVEI)
NODE_NAME_CASE(VREPLGR2VR)
NODE_NAME_CASE(XVPERMI)
+ NODE_NAME_CASE(XVPERM)
NODE_NAME_CASE(VPICK_SEXT_ELT)
NODE_NAME_CASE(VPICK_ZEXT_ELT)
NODE_NAME_CASE(VREPLVE)
@@ -6659,6 +7121,7 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(XVMSKGEZ)
NODE_NAME_CASE(XVMSKEQZ)
NODE_NAME_CASE(XVMSKNEZ)
+ NODE_NAME_CASE(VHADDW)
}
#undef NODE_NAME_CASE
return nullptr;
@@ -7132,6 +7595,7 @@ SDValue LoongArchTargetLowering::LowerFormalArguments(
llvm_unreachable("Unsupported calling convention");
case CallingConv::C:
case CallingConv::Fast:
+ case CallingConv::PreserveMost:
break;
case CallingConv::GHC:
if (!MF.getSubtarget().hasFeature(LoongArch::FeatureBasicF) ||
@@ -7893,7 +8357,7 @@ LoongArchTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
if (Size < 32 && (AI->getOperation() == AtomicRMWInst::And ||
AI->getOperation() == AtomicRMWInst::Or ||
AI->getOperation() == AtomicRMWInst::Xor))
- return AtomicExpansionKind::Expand;
+ return AtomicExpansionKind::CustomExpand;
if (AI->getOperation() == AtomicRMWInst::Nand || Size < 32)
return AtomicExpansionKind::CmpXChg;
}
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index f79ba7450cc3..9d14934a9d36 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -37,6 +37,10 @@ enum NodeType : unsigned {
// Select
SELECT_CC,
+ // Branch
+ BR_CC,
+ BRCOND,
+
// 32-bit shifts, directly matching the semantics of the named LoongArch
// instructions.
SLL_W,
@@ -141,6 +145,7 @@ enum NodeType : unsigned {
VREPLVEI,
VREPLGR2VR,
XVPERMI,
+ XVPERM,
// Extended vector element extraction
VPICK_SEXT_ELT,
@@ -177,6 +182,9 @@ enum NodeType : unsigned {
XVMSKEQZ,
XVMSKNEZ,
+ // Vector Horizontal Addition with Widening
+ VHADDW
+
// Intrinsic operations end =============================================
};
} // end namespace LoongArchISD
@@ -382,10 +390,13 @@ private:
SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBF16_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVECREDUCE_ADD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
bool isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const override;
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
index 26d36f1c5058..c89212dae72d 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
@@ -25,8 +25,8 @@ using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
#include "LoongArchGenInstrInfo.inc"
-LoongArchInstrInfo::LoongArchInstrInfo(LoongArchSubtarget &STI)
- : LoongArchGenInstrInfo(LoongArch::ADJCALLSTACKDOWN,
+LoongArchInstrInfo::LoongArchInstrInfo(const LoongArchSubtarget &STI)
+ : LoongArchGenInstrInfo(STI, LoongArch::ADJCALLSTACKDOWN,
LoongArch::ADJCALLSTACKUP),
STI(STI) {}
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
index 63b7112b8b40..f25958a32bec 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
@@ -25,7 +25,7 @@ class LoongArchSubtarget;
class LoongArchInstrInfo : public LoongArchGenInstrInfo {
public:
- explicit LoongArchInstrInfo(LoongArchSubtarget &STI);
+ explicit LoongArchInstrInfo(const LoongArchSubtarget &STI);
MCInst getNop() const override;
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index 2b94e65cac0e..20ccc622f58d 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -31,6 +31,10 @@ def SDT_LoongArchSelectCC : SDTypeProfile<1, 5, [SDTCisSameAs<1, 2>,
SDTCisSameAs<0, 4>,
SDTCisSameAs<4, 5>]>;
+def SDT_LoongArchBrCC : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>,
+ SDTCisVT<2, OtherVT>,
+ SDTCisVT<3, OtherVT>]>;
+
def SDT_LoongArchBStrIns: SDTypeProfile<1, 4, [
SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<3>,
SDTCisSameAs<3, 4>
@@ -94,6 +98,8 @@ def loongarch_tail_large : SDNode<"LoongArchISD::TAIL_LARGE", SDT_LoongArchCall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]>;
def loongarch_selectcc : SDNode<"LoongArchISD::SELECT_CC", SDT_LoongArchSelectCC>;
+def loongarch_brcc : SDNode<"LoongArchISD::BR_CC", SDT_LoongArchBrCC,
+ [SDNPHasChain]>;
def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>;
def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>;
def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>;
@@ -1537,47 +1543,29 @@ def : Pat<(select GPR:$cond, GPR:$t, GPR:$f),
/// Branches and jumps
-class BccPat<PatFrag CondOp, LAInst Inst>
- : Pat<(brcond (GRLenVT (CondOp GPR:$rj, GPR:$rd)), bb:$imm16),
- (Inst GPR:$rj, GPR:$rd, bb:$imm16)>;
-
-def : BccPat<seteq, BEQ>;
-def : BccPat<setne, BNE>;
-def : BccPat<setlt, BLT>;
-def : BccPat<setge, BGE>;
-def : BccPat<setult, BLTU>;
-def : BccPat<setuge, BGEU>;
-
-class BccSwapPat<PatFrag CondOp, LAInst InstBcc>
- : Pat<(brcond (GRLenVT (CondOp GPR:$rd, GPR:$rj)), bb:$imm16),
- (InstBcc GPR:$rj, GPR:$rd, bb:$imm16)>;
-
-// Condition codes that don't have matching LoongArch branch instructions, but
-// are trivially supported by swapping the two input operands.
-def : BccSwapPat<setgt, BLT>;
-def : BccSwapPat<setle, BGE>;
-def : BccSwapPat<setugt, BLTU>;
-def : BccSwapPat<setule, BGEU>;
-
let Predicates = [Has32S] in {
-// An extra pattern is needed for a brcond without a setcc (i.e. where the
-// condition was calculated elsewhere).
-def : Pat<(brcond GPR:$rj, bb:$imm21), (BNEZ GPR:$rj, bb:$imm21)>;
-
-def : Pat<(brcond (GRLenVT (seteq GPR:$rj, 0)), bb:$imm21),
- (BEQZ GPR:$rj, bb:$imm21)>;
-def : Pat<(brcond (GRLenVT (setne GPR:$rj, 0)), bb:$imm21),
- (BNEZ GPR:$rj, bb:$imm21)>;
+class BccZeroPat<CondCode Cond, LAInst Inst>
+ : Pat<(loongarch_brcc (GRLenVT GPR:$rj), 0, Cond, bb:$imm21),
+ (Inst GPR:$rj, bb:$imm21)>;
+
+def : BccZeroPat<SETEQ, BEQZ>;
+def : BccZeroPat<SETNE, BNEZ>;
} // Predicates = [Has32S]
-// An extra pattern is needed for a brcond without a setcc (i.e. where the
-// condition was calculated elsewhere).
-def : Pat<(brcond GPR:$rj, bb:$imm16), (BNE GPR:$rj, R0, bb:$imm16)>;
+multiclass BccPat<CondCode Cond, LAInst Inst> {
+ def : Pat<(loongarch_brcc (GRLenVT GPR:$rj), GPR:$rd, Cond, bb:$imm16),
+ (Inst GPR:$rj, GPR:$rd, bb:$imm16)>;
+ // Explicitly select 0 to R0. The register coalescer doesn't always do it.
+ def : Pat<(loongarch_brcc (GRLenVT GPR:$rj), 0, Cond, bb:$imm16),
+ (Inst GPR:$rj, (GRLenVT R0), bb:$imm16)>;
+}
-def : Pat<(brcond (GRLenVT (seteq GPR:$rj, 0)), bb:$imm16),
- (BEQ GPR:$rj, R0, bb:$imm16)>;
-def : Pat<(brcond (GRLenVT (setne GPR:$rj, 0)), bb:$imm16),
- (BNE GPR:$rj, R0, bb:$imm16)>;
+defm : BccPat<SETEQ, BEQ>;
+defm : BccPat<SETNE, BNE>;
+defm : BccPat<SETLT, BLT>;
+defm : BccPat<SETGE, BGE>;
+defm : BccPat<SETULT, BLTU>;
+defm : BccPat<SETUGE, BGEU>;
let isBarrier = 1, isBranch = 1, isTerminator = 1 in
def PseudoBR : Pseudo<(outs), (ins simm26_b:$imm26), [(br bb:$imm26)]>,
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 0696b11d62ac..a79c01cbe577 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -10,8 +10,12 @@
//
//===----------------------------------------------------------------------===//
+def SDT_LoongArchXVPERM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+ SDTCisVec<2>, SDTCisInt<2>]>;
+
// Target nodes.
def loongarch_xvpermi: SDNode<"LoongArchISD::XVPERMI", SDT_LoongArchV1RUimm>;
+def loongarch_xvperm: SDNode<"LoongArchISD::XVPERM", SDT_LoongArchXVPERM>;
def loongarch_xvmskltz: SDNode<"LoongArchISD::XVMSKLTZ", SDT_LoongArchVMSKCOND>;
def loongarch_xvmskgez: SDNode<"LoongArchISD::XVMSKGEZ", SDT_LoongArchVMSKCOND>;
def loongarch_xvmskeqz: SDNode<"LoongArchISD::XVMSKEQZ", SDT_LoongArchVMSKCOND>;
@@ -1186,6 +1190,17 @@ multiclass PatXrXrXr<SDPatternOperator OpNode, string Inst> {
(!cast<LAInst>(Inst#"_D") LASX256:$xd, LASX256:$xj, LASX256:$xk)>;
}
+multiclass PatXrXrW<SDPatternOperator OpNode, string Inst> {
+ def : Pat<(OpNode(v32i8 LASX256:$vj), (v32i8 LASX256:$vk)),
+ (!cast<LAInst>(Inst#"_H_B") LASX256:$vj, LASX256:$vk)>;
+ def : Pat<(OpNode(v16i16 LASX256:$vj), (v16i16 LASX256:$vk)),
+ (!cast<LAInst>(Inst#"_W_H") LASX256:$vj, LASX256:$vk)>;
+ def : Pat<(OpNode(v8i32 LASX256:$vj), (v8i32 LASX256:$vk)),
+ (!cast<LAInst>(Inst#"_D_W") LASX256:$vj, LASX256:$vk)>;
+ def : Pat<(OpNode(v4i64 LASX256:$vj), (v4i64 LASX256:$vk)),
+ (!cast<LAInst>(Inst#"_Q_D") LASX256:$vj, LASX256:$vk)>;
+}
+
multiclass PatShiftXrXr<SDPatternOperator OpNode, string Inst> {
def : Pat<(OpNode (v32i8 LASX256:$xj), (and vsplati8_imm_eq_7,
(v32i8 LASX256:$xk))),
@@ -1513,6 +1528,9 @@ def : Pat<(bswap (v8i32 LASX256:$xj)), (XVSHUF4I_B LASX256:$xj, 0b00011011)>;
def : Pat<(bswap (v4i64 LASX256:$xj)),
(XVSHUF4I_W (XVSHUF4I_B LASX256:$xj, 0b00011011), 0b10110001)>;
+// XVHADDW_{H_B/W_H/D_W/Q_D}
+defm : PatXrXrW<loongarch_vhaddw, "XVHADDW">;
+
// XVFADD_{S/D}
defm : PatXrXrF<fadd, "XVFADD">;
@@ -1852,6 +1870,12 @@ def : Pat<(loongarch_xvpermi v4i64:$xj, immZExt8: $ui8),
def : Pat<(loongarch_xvpermi v4f64:$xj, immZExt8: $ui8),
(XVPERMI_D v4f64:$xj, immZExt8: $ui8)>;
+// XVPERM_W
+def : Pat<(loongarch_xvperm v8i32:$xj, v8i32:$xk),
+ (XVPERM_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_xvperm v8f32:$xj, v8i32:$xk),
+ (XVPERM_W v8f32:$xj, v8i32:$xk)>;
+
// XVREPLVE0_{W/D}
def : Pat<(lasxsplatf32 FPR32:$fj),
(XVREPLVE0_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32))>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 3c9defb0366f..eb7120ffb41a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -22,7 +22,7 @@ def SDT_LoongArchVShuf : SDTypeProfile<1, 3, [SDTCisVec<0>,
def SDT_LoongArchV2R : SDTypeProfile<1, 2, [SDTCisVec<0>,
SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>]>;
def SDT_LoongArchV1RUimm: SDTypeProfile<1, 2, [SDTCisVec<0>,
- SDTCisSameAs<0,1>, SDTCisVT<2, i64>]>;
+ SDTCisSameAs<0,1>, SDTCisVT<2, GRLenVT>]>;
def SDT_LoongArchV2RUimm
: SDTypeProfile<1, 3,
[SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
@@ -71,6 +71,8 @@ def loongarch_vsrli : SDNode<"LoongArchISD::VSRLI", SDT_LoongArchV1RUimm>;
def loongarch_vbsll : SDNode<"LoongArchISD::VBSLL", SDT_LoongArchV1RUimm>;
def loongarch_vbsrl : SDNode<"LoongArchISD::VBSRL", SDT_LoongArchV1RUimm>;
+def loongarch_vhaddw : SDNode<"LoongArchISD::VHADDW", SDT_LoongArchV2R>;
+
def loongarch_vldrepl
: SDNode<"LoongArchISD::VLDREPL",
SDT_LoongArchVLDREPL, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
@@ -1364,6 +1366,17 @@ multiclass PatVrVrVr<SDPatternOperator OpNode, string Inst> {
(!cast<LAInst>(Inst#"_D") LSX128:$vd, LSX128:$vj, LSX128:$vk)>;
}
+multiclass PatVrVrW<SDPatternOperator OpNode, string Inst> {
+ def : Pat<(OpNode(v16i8 LSX128:$vj), (v16i8 LSX128:$vk)),
+ (!cast<LAInst>(Inst#"_H_B") LSX128:$vj, LSX128:$vk)>;
+ def : Pat<(OpNode(v8i16 LSX128:$vj), (v8i16 LSX128:$vk)),
+ (!cast<LAInst>(Inst#"_W_H") LSX128:$vj, LSX128:$vk)>;
+ def : Pat<(OpNode(v4i32 LSX128:$vj), (v4i32 LSX128:$vk)),
+ (!cast<LAInst>(Inst#"_D_W") LSX128:$vj, LSX128:$vk)>;
+ def : Pat<(OpNode(v2i64 LSX128:$vj), (v2i64 LSX128:$vk)),
+ (!cast<LAInst>(Inst#"_Q_D") LSX128:$vj, LSX128:$vk)>;
+}
+
multiclass PatShiftVrVr<SDPatternOperator OpNode, string Inst> {
def : Pat<(OpNode (v16i8 LSX128:$vj), (and vsplati8_imm_eq_7,
(v16i8 LSX128:$vk))),
@@ -1709,6 +1722,9 @@ def : Pat<(bswap (v4i32 LSX128:$vj)), (VSHUF4I_B LSX128:$vj, 0b00011011)>;
def : Pat<(bswap (v2i64 LSX128:$vj)),
(VSHUF4I_W (VSHUF4I_B LSX128:$vj, 0b00011011), 0b10110001)>;
+// VHADDW_{H_B/W_H/D_W/Q_D}
+defm : PatVrVrW<loongarch_vhaddw, "VHADDW">;
+
// VFADD_{S/D}
defm : PatVrVrF<fadd, "VFADD">;
diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp
index 47fce37ce59f..9c5f8edfaf66 100644
--- a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp
@@ -41,6 +41,8 @@ LoongArchRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
if (MF->getFunction().getCallingConv() == CallingConv::GHC)
return CSR_NoRegs_SaveList;
+ if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost)
+ return CSR_MostRegs_SaveList;
switch (Subtarget.getTargetABI()) {
default:
llvm_unreachable("Unrecognized ABI");
@@ -63,6 +65,8 @@ LoongArchRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
if (CC == CallingConv::GHC)
return CSR_NoRegs_RegMask;
+ if (CC == CallingConv::PreserveMost)
+ return CSR_MostRegs_RegMask;
switch (Subtarget.getTargetABI()) {
default:
llvm_unreachable("Unrecognized ABI");
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
index ede5477f04bd..f548a8dd0532 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
@@ -95,4 +95,20 @@ unsigned LoongArchTTIImpl::getPrefetchDistance() const { return 200; }
bool LoongArchTTIImpl::enableWritePrefetching() const { return true; }
+bool LoongArchTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
+ switch (II->getIntrinsicID()) {
+ default:
+ return true;
+ case Intrinsic::vector_reduce_add:
+ case Intrinsic::vector_reduce_and:
+ case Intrinsic::vector_reduce_or:
+ case Intrinsic::vector_reduce_smax:
+ case Intrinsic::vector_reduce_smin:
+ case Intrinsic::vector_reduce_umax:
+ case Intrinsic::vector_reduce_umin:
+ case Intrinsic::vector_reduce_xor:
+ return false;
+ }
+}
+
// TODO: Implement more hooks to provide TTI machinery for LoongArch.
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h
index d43d2cb0eb12..e3f16c780499 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h
@@ -53,6 +53,8 @@ public:
unsigned getPrefetchDistance() const override;
bool enableWritePrefetching() const override;
+ bool shouldExpandReduction(const IntrinsicInst *II) const override;
+
// TODO: Implement more hooks to provide TTI machinery for LoongArch.
};
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
index 35277ce094a7..e5bd1c91edec 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
@@ -26,6 +26,7 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Compiler.h"
+#include <bitset>
#define GET_INSTRINFO_MC_DESC
#define ENABLE_INSTR_PREDICATE_VERIFIER
@@ -95,10 +96,81 @@ createLoongArchAsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS,
namespace {
class LoongArchMCInstrAnalysis : public MCInstrAnalysis {
+ int64_t GPRState[31] = {};
+ std::bitset<31> GPRValidMask;
+
+ static bool isGPR(MCRegister Reg) {
+ return Reg >= LoongArch::R0 && Reg <= LoongArch::R31;
+ }
+
+ static unsigned getRegIndex(MCRegister Reg) {
+ assert(isGPR(Reg) && Reg != LoongArch::R0 && "Invalid GPR reg");
+ return Reg - LoongArch::R1;
+ }
+
+ void setGPRState(MCRegister Reg, std::optional<int64_t> Value) {
+ if (Reg == LoongArch::R0)
+ return;
+
+ auto Index = getRegIndex(Reg);
+
+ if (Value) {
+ GPRState[Index] = *Value;
+ GPRValidMask.set(Index);
+ } else {
+ GPRValidMask.reset(Index);
+ }
+ }
+
+ std::optional<int64_t> getGPRState(MCRegister Reg) const {
+ if (Reg == LoongArch::R0)
+ return 0;
+
+ auto Index = getRegIndex(Reg);
+
+ if (GPRValidMask.test(Index))
+ return GPRState[Index];
+ return std::nullopt;
+ }
+
public:
explicit LoongArchMCInstrAnalysis(const MCInstrInfo *Info)
: MCInstrAnalysis(Info) {}
+ void resetState() override { GPRValidMask.reset(); }
+
+ void updateState(const MCInst &Inst, uint64_t Addr) override {
+ // Terminators mark the end of a basic block which means the sequentially
+ // next instruction will be the first of another basic block and the current
+ // state will typically not be valid anymore. For calls, we assume all
+ // registers may be clobbered by the callee (TODO: should we take the
+ // calling convention into account?).
+ if (isTerminator(Inst) || isCall(Inst)) {
+ resetState();
+ return;
+ }
+
+ switch (Inst.getOpcode()) {
+ default: {
+ // Clear the state of all defined registers for instructions that we don't
+ // explicitly support.
+ auto NumDefs = Info->get(Inst.getOpcode()).getNumDefs();
+ for (unsigned I = 0; I < NumDefs; ++I) {
+ auto DefReg = Inst.getOperand(I).getReg();
+ if (isGPR(DefReg))
+ setGPRState(DefReg, std::nullopt);
+ }
+ break;
+ }
+ case LoongArch::PCADDU18I:
+ setGPRState(
+ Inst.getOperand(0).getReg(),
+ Addr + SignExtend64<38>(
+ static_cast<uint64_t>(Inst.getOperand(1).getImm()) << 18));
+ break;
+ }
+ }
+
bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
uint64_t &Target) const override {
unsigned NumOps = Inst.getNumOperands();
@@ -108,6 +180,14 @@ public:
return true;
}
+ if (Inst.getOpcode() == LoongArch::JIRL) {
+ if (auto TargetRegState = getGPRState(Inst.getOperand(1).getReg())) {
+ Target = *TargetRegState + Inst.getOperand(2).getImm();
+ return true;
+ }
+ return false;
+ }
+
return false;
}
diff --git a/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp b/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp
index d3ad65390143..4992f1abe5a0 100644
--- a/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp
+++ b/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp
@@ -107,6 +107,18 @@ static DecodeStatus DecodeFPCSCRegisterClass(MCInst &Inst, uint64_t RegNo,
}
#define DecodeFPICRegisterClass DecodeFPCSCRegisterClass
+static DecodeStatus DecodeCCRCRegisterClass(MCInst &Inst,
+ const MCDisassembler *Decoder) {
+ Inst.addOperand(MCOperand::createReg(M68k::CCR));
+ return DecodeStatus::Success;
+}
+
+static DecodeStatus DecodeSRCRegisterClass(MCInst &Inst,
+ const MCDisassembler *Decoder) {
+ Inst.addOperand(MCOperand::createReg(M68k::SR));
+ return DecodeStatus::Success;
+}
+
static DecodeStatus DecodeImm32(MCInst &Inst, uint64_t Imm, uint64_t Address,
const void *Decoder) {
Inst.addOperand(MCOperand::createImm(M68k::swapWord<uint32_t>(Imm)));
diff --git a/llvm/lib/Target/M68k/M68kInstrAtomics.td b/llvm/lib/Target/M68k/M68kInstrAtomics.td
index 867afbefe68f..b2b64ca85322 100644
--- a/llvm/lib/Target/M68k/M68kInstrAtomics.td
+++ b/llvm/lib/Target/M68k/M68kInstrAtomics.td
@@ -67,7 +67,8 @@ class MxCASARIDOp<bits<2> size_encoding, MxType type>
"cas."#type.Prefix#" $dc, $du, $mem"> {
let Inst = (ascend
(descend 0b00001, size_encoding, 0b011, MxEncAddrMode_p<"mem">.EA),
- (descend 0b0000000, (operand "$du", 3), 0b000, (operand "$dc", 3))
+ (descend 0b0000000, (operand "$du", 3), 0b000, (operand "$dc", 3)),
+ MxEncAddrMode_p<"mem">.Supplement
);
let Constraints = "$out = $dc";
let mayLoad = 1;
@@ -84,7 +85,8 @@ class MxCASARIIOp<bits<2> size_encoding, MxType type>
"cas."#type.Prefix#" $dc, $du, $mem"> {
let Inst = (ascend
(descend 0b00001, size_encoding, 0b011, MxEncAddrMode_f<"mem">.EA),
- (descend 0b0000000, (operand "$du", 3), 0b000, (operand "$dc", 3))
+ (descend 0b0000000, (operand "$du", 3), 0b000, (operand "$dc", 3)),
+ MxEncAddrMode_f<"mem">.Supplement
);
let Constraints = "$out = $dc";
let mayLoad = 1;
@@ -100,8 +102,9 @@ class MxCASALOp<bits<2> size_encoding, MxType type>
(ins type.ROp:$dc, type.ROp:$du, !cast<MxMemOp>("MxAL"#type.Size):$mem),
"cas."#type.Prefix#" $dc, $du, $mem"> {
let Inst = (ascend
- (descend 0b00001, size_encoding, 0b011, MxEncAddrMode_abs<"mem">.EA),
- (descend 0b0000000, (operand "$du", 3), 0b000, (operand "$dc", 3))
+ (descend 0b00001, size_encoding, 0b011, MxEncAddrMode_abs<"mem", true>.EA),
+ (descend 0b0000000, (operand "$du", 3), 0b000, (operand "$dc", 3)),
+ MxEncAddrMode_abs<"mem", true>.Supplement
);
let Constraints = "$out = $dc";
let mayLoad = 1;
diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.cpp b/llvm/lib/Target/M68k/M68kInstrInfo.cpp
index 21e9319aaf0b..c6be190bd124 100644
--- a/llvm/lib/Target/M68k/M68kInstrInfo.cpp
+++ b/llvm/lib/Target/M68k/M68kInstrInfo.cpp
@@ -43,7 +43,7 @@ using namespace llvm;
void M68kInstrInfo::anchor() {}
M68kInstrInfo::M68kInstrInfo(const M68kSubtarget &STI)
- : M68kGenInstrInfo(M68k::ADJCALLSTACKDOWN, M68k::ADJCALLSTACKUP, 0,
+ : M68kGenInstrInfo(STI, M68k::ADJCALLSTACKDOWN, M68k::ADJCALLSTACKUP, 0,
M68k::RET),
Subtarget(STI), RI(STI) {}
diff --git a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
index 38d22eda5f17..a31c8ec1b2bb 100644
--- a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
+++ b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
@@ -36,7 +36,6 @@ namespace {
/// Parses MSP430 assembly from a stream.
class MSP430AsmParser : public MCTargetAsmParser {
- const MCSubtargetInfo &STI;
MCAsmParser &Parser;
const MCRegisterInfo *MRI;
@@ -79,7 +78,7 @@ class MSP430AsmParser : public MCTargetAsmParser {
public:
MSP430AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
const MCInstrInfo &MII, const MCTargetOptions &Options)
- : MCTargetAsmParser(Options, STI, MII), STI(STI), Parser(Parser) {
+ : MCTargetAsmParser(Options, STI, MII), Parser(Parser) {
MCAsmParserExtension::Initialize(Parser);
MRI = getContext().getRegisterInfo();
@@ -264,7 +263,7 @@ bool MSP430AsmParser::matchAndEmitInstruction(SMLoc Loc, unsigned &Opcode,
switch (MatchResult) {
case Match_Success:
Inst.setLoc(Loc);
- Out.emitInstruction(Inst, STI);
+ Out.emitInstruction(Inst, *STI);
return false;
case Match_MnemonicFail:
return Error(Loc, "invalid instruction mnemonic");
diff --git a/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp b/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
index c8094a8eeb36..e6666e8cafdf 100644
--- a/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
+++ b/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
@@ -103,15 +103,6 @@ static DecodeStatus DecodeGR16RegisterClass(MCInst &MI, uint64_t RegNo,
}
static DecodeStatus DecodeCGImm(MCInst &MI, uint64_t Bits, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMemOperand(MCInst &MI, uint64_t Bits,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-#include "MSP430GenDisassemblerTables.inc"
-
-static DecodeStatus DecodeCGImm(MCInst &MI, uint64_t Bits, uint64_t Address,
const MCDisassembler *Decoder) {
int64_t Imm;
switch (Bits) {
@@ -142,6 +133,8 @@ static DecodeStatus DecodeMemOperand(MCInst &MI, uint64_t Bits,
return MCDisassembler::Success;
}
+#include "MSP430GenDisassemblerTables.inc"
+
enum AddrMode {
amInvalid = 0,
amRegister,
diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
index 6da5e66be4ad..5653099431b1 100644
--- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -154,9 +154,9 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
const RTLIB::LibcallImpl Impl;
} LibraryCalls[] = {
// Integer Multiply - EABI Table 9
- {RTLIB::MUL_I16, RTLIB::__mspabi_mpyi_hw},
- {RTLIB::MUL_I32, RTLIB::__mspabi_mpyl_hw},
- {RTLIB::MUL_I64, RTLIB::__mspabi_mpyll_hw},
+ {RTLIB::MUL_I16, RTLIB::impl___mspabi_mpyi_hw},
+ {RTLIB::MUL_I32, RTLIB::impl___mspabi_mpyl_hw},
+ {RTLIB::MUL_I64, RTLIB::impl___mspabi_mpyll_hw},
// TODO The __mspabi_mpysl*_hw functions ARE implemented in libgcc
// TODO The __mspabi_mpyul*_hw functions ARE implemented in libgcc
};
@@ -169,9 +169,9 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
const RTLIB::LibcallImpl Impl;
} LibraryCalls[] = {
// Integer Multiply - EABI Table 9
- {RTLIB::MUL_I16, RTLIB::__mspabi_mpyi_hw},
- {RTLIB::MUL_I32, RTLIB::__mspabi_mpyl_hw32},
- {RTLIB::MUL_I64, RTLIB::__mspabi_mpyll_hw32},
+ {RTLIB::MUL_I16, RTLIB::impl___mspabi_mpyi_hw},
+ {RTLIB::MUL_I32, RTLIB::impl___mspabi_mpyl_hw32},
+ {RTLIB::MUL_I64, RTLIB::impl___mspabi_mpyll_hw32},
// TODO The __mspabi_mpysl*_hw32 functions ARE implemented in libgcc
// TODO The __mspabi_mpyul*_hw32 functions ARE implemented in libgcc
};
@@ -184,9 +184,9 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
const RTLIB::LibcallImpl Impl;
} LibraryCalls[] = {
// Integer Multiply - EABI Table 9
- {RTLIB::MUL_I16, RTLIB::__mspabi_mpyi_f5hw},
- {RTLIB::MUL_I32, RTLIB::__mspabi_mpyl_f5hw},
- {RTLIB::MUL_I64, RTLIB::__mspabi_mpyll_f5hw},
+ {RTLIB::MUL_I16, RTLIB::impl___mspabi_mpyi_f5hw},
+ {RTLIB::MUL_I32, RTLIB::impl___mspabi_mpyl_f5hw},
+ {RTLIB::MUL_I64, RTLIB::impl___mspabi_mpyll_f5hw},
// TODO The __mspabi_mpysl*_f5hw functions ARE implemented in libgcc
// TODO The __mspabi_mpyul*_f5hw functions ARE implemented in libgcc
};
@@ -199,9 +199,9 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
const RTLIB::LibcallImpl Impl;
} LibraryCalls[] = {
// Integer Multiply - EABI Table 9
- {RTLIB::MUL_I16, RTLIB::__mspabi_mpyi},
- {RTLIB::MUL_I32, RTLIB::__mspabi_mpyl},
- {RTLIB::MUL_I64, RTLIB::__mspabi_mpyll},
+ {RTLIB::MUL_I16, RTLIB::impl___mspabi_mpyi},
+ {RTLIB::MUL_I32, RTLIB::impl___mspabi_mpyl},
+ {RTLIB::MUL_I64, RTLIB::impl___mspabi_mpyll},
// The __mspabi_mpysl* functions are NOT implemented in libgcc
// The __mspabi_mpyul* functions are NOT implemented in libgcc
};
diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
index 8bc6387e6a7e..65b4820752c9 100644
--- a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -12,6 +12,7 @@
#include "MSP430InstrInfo.h"
#include "MSP430.h"
+#include "MSP430Subtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/ErrorHandling.h"
@@ -24,9 +25,9 @@ using namespace llvm;
// Pin the vtable to this file.
void MSP430InstrInfo::anchor() {}
-MSP430InstrInfo::MSP430InstrInfo(MSP430Subtarget &STI)
- : MSP430GenInstrInfo(MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP),
- RI() {}
+MSP430InstrInfo::MSP430InstrInfo(const MSP430Subtarget &STI)
+ : MSP430GenInstrInfo(STI, MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP),
+ RI() {}
void MSP430InstrInfo::storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.h b/llvm/lib/Target/MSP430/MSP430InstrInfo.h
index 58be64336f26..316c136890bf 100644
--- a/llvm/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.h
@@ -27,7 +27,7 @@ class MSP430InstrInfo : public MSP430GenInstrInfo {
const MSP430RegisterInfo RI;
virtual void anchor();
public:
- explicit MSP430InstrInfo(MSP430Subtarget &STI);
+ explicit MSP430InstrInfo(const MSP430Subtarget &STI);
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
/// such, whenever a client has an instance of instruction info, it should
diff --git a/llvm/lib/Target/Mips/CMakeLists.txt b/llvm/lib/Target/Mips/CMakeLists.txt
index 21d1765107ae..4a2277e9a80d 100644
--- a/llvm/lib/Target/Mips/CMakeLists.txt
+++ b/llvm/lib/Target/Mips/CMakeLists.txt
@@ -6,7 +6,8 @@ tablegen(LLVM MipsGenAsmMatcher.inc -gen-asm-matcher)
tablegen(LLVM MipsGenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM MipsGenCallingConv.inc -gen-callingconv)
tablegen(LLVM MipsGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM MipsGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM MipsGenDisassemblerTables.inc -gen-disassembler
+ -ignore-non-decodable-operands)
tablegen(LLVM MipsGenFastISel.inc -gen-fast-isel)
tablegen(LLVM MipsGenGlobalISel.inc -gen-global-isel)
tablegen(LLVM MipsGenPostLegalizeGICombiner.inc -gen-global-isel-combiner
diff --git a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index 0c98c4da2ede..fa6cc0e3f018 100644
--- a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -78,451 +78,216 @@ public:
} // end anonymous namespace
-// Forward declare these because the autogenerated code will reference them.
-// Definitions are further down.
-static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeGPRMM16RegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus
-DecodeGPRMM16ZeroRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus
-DecodeGPRMM16MovePRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodePtrRegisterClass(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeDSPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
+static MCDisassembler *createMipsDisassembler(const Target &T,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new MipsDisassembler(STI, Ctx, true);
+}
-static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
+static MCDisassembler *createMipselDisassembler(const Target &T,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new MipsDisassembler(STI, Ctx, false);
+}
-static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeMipsDisassembler() {
+ // Register the disassembler.
+ TargetRegistry::RegisterMCDisassembler(getTheMipsTarget(),
+ createMipsDisassembler);
+ TargetRegistry::RegisterMCDisassembler(getTheMipselTarget(),
+ createMipselDisassembler);
+ TargetRegistry::RegisterMCDisassembler(getTheMips64Target(),
+ createMipsDisassembler);
+ TargetRegistry::RegisterMCDisassembler(getTheMips64elTarget(),
+ createMipselDisassembler);
+}
-static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst, unsigned Insn,
+static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) {
+ const MCRegisterInfo *RegInfo = D->getContext().getRegisterInfo();
+ return RegInfo->getRegClass(RC).getRegister(RegNo);
+}
+static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const MCDisassembler *Decoder);
+ const MCDisassembler *Decoder) {
+ // Currently only hardware register 29 is supported.
+ if (RegNo != 29)
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::createReg(Mips::HWR29));
+ return MCDisassembler::Success;
+}
static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const MCDisassembler *Decoder);
+ const MCDisassembler *Decoder) {
+ if (RegNo > 30 || RegNo % 2)
+ return MCDisassembler::Fail;
+
+ unsigned Reg = getReg(Decoder, Mips::AFGR64RegClassID, RegNo / 2);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const MCDisassembler *Decoder);
+ const MCDisassembler *Decoder) {
+ if (RegNo >= 4)
+ return MCDisassembler::Fail;
+
+ unsigned Reg = getReg(Decoder, Mips::ACC64DSPRegClassID, RegNo);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const MCDisassembler *Decoder);
+ const MCDisassembler *Decoder) {
+ if (RegNo >= 4)
+ return MCDisassembler::Fail;
+
+ unsigned Reg = getReg(Decoder, Mips::HI32DSPRegClassID, RegNo);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const MCDisassembler *Decoder);
+ const MCDisassembler *Decoder) {
+ if (RegNo >= 4)
+ return MCDisassembler::Fail;
+
+ unsigned Reg = getReg(Decoder, Mips::LO32DSPRegClassID, RegNo);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const MCDisassembler *Decoder);
+ const MCDisassembler *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+
+ unsigned Reg = getReg(Decoder, Mips::MSA128BRegClassID, RegNo);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const MCDisassembler *Decoder);
+ const MCDisassembler *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+
+ unsigned Reg = getReg(Decoder, Mips::MSA128HRegClassID, RegNo);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const MCDisassembler *Decoder);
+ const MCDisassembler *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+
+ unsigned Reg = getReg(Decoder, Mips::MSA128WRegClassID, RegNo);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const MCDisassembler *Decoder);
+ const MCDisassembler *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+
+ unsigned Reg = getReg(Decoder, Mips::MSA128DRegClassID, RegNo);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const MCDisassembler *Decoder);
+ const MCDisassembler *Decoder) {
+ if (RegNo > 7)
+ return MCDisassembler::Fail;
+
+ unsigned Reg = getReg(Decoder, Mips::MSACtrlRegClassID, RegNo);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
static DecodeStatus DecodeCOP0RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const MCDisassembler *Decoder);
+ const MCDisassembler *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+
+ unsigned Reg = getReg(Decoder, Mips::COP0RegClassID, RegNo);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeBranchTarget(MCInst &Inst, unsigned Offset,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeBranchTarget1SImm16(MCInst &Inst, unsigned Offset,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeJumpTarget(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeBranchTarget21(MCInst &Inst, unsigned Offset,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeBranchTarget21MM(MCInst &Inst, unsigned Offset,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeBranchTarget26(MCInst &Inst, unsigned Offset,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-// DecodeBranchTarget7MM - Decode microMIPS branch offset, which is
-// shifted left by 1 bit.
-static DecodeStatus DecodeBranchTarget7MM(MCInst &Inst, unsigned Offset,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-// DecodeBranchTarget10MM - Decode microMIPS branch offset, which is
-// shifted left by 1 bit.
-static DecodeStatus DecodeBranchTarget10MM(MCInst &Inst, unsigned Offset,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-// DecodeBranchTargetMM - Decode microMIPS branch offset, which is
-// shifted left by 1 bit.
-static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, unsigned Offset,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-// DecodeBranchTarget26MM - Decode microMIPS branch offset, which is
-// shifted left by 1 bit.
-static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst, unsigned Offset,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-// DecodeJumpTargetMM - Decode microMIPS jump target, which is
-// shifted left by 1 bit.
-static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-// DecodeJumpTargetXMM - Decode microMIPS jump and link exchange target,
-// which is shifted left by 2 bit.
-static DecodeStatus DecodeJumpTargetXMM(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMem(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMemEVA(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeLoadByte15(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeCacheOp(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeCacheOpMM(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodePrefeOpMM(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeSyncI(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeSyncI_MM(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeSynciR6(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMemMMImm4(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMemMMSPImm5Lsl2(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMemMMGPImm7Lsl2(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMemMMImm9(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMemMMImm12(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMemMMImm16(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeFMem(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeFMemMMR2(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeFMem2(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeFMem3(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeFMemCop2R6(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeFMemCop2MMR6(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeAddiur2Simm7(MCInst &Inst, unsigned Value,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeLi16Imm(MCInst &Inst, unsigned Value,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodePOOL16BEncodedField(MCInst &Inst, unsigned Value,
- uint64_t Address,
- const MCDisassembler *Decoder);
+ const MCDisassembler *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
-template <unsigned Bits, int Offset, int Scale>
-static DecodeStatus DecodeUImmWithOffsetAndScale(MCInst &Inst, unsigned Value,
- uint64_t Address,
- const MCDisassembler *Decoder);
+ unsigned Reg = getReg(Decoder, Mips::COP2RegClassID, RegNo);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
-template <unsigned Bits, int Offset>
-static DecodeStatus DecodeUImmWithOffset(MCInst &Inst, unsigned Value,
+static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
- return DecodeUImmWithOffsetAndScale<Bits, Offset, 1>(Inst, Value, Address,
- Decoder);
-}
-
-template <unsigned Bits, int Offset = 0, int ScaleBy = 1>
-static DecodeStatus DecodeSImmWithOffsetAndScale(MCInst &Inst, unsigned Value,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeInsSize(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeSimm18Lsl3(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeSimm9SP(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeANDI16Imm(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeSimm23Lsl2(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-/// INSVE_[BHWD] have an implicit operand that the generated decoder doesn't
-/// handle.
-template <typename InsnType>
-static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodeDAHIDATIMMR6(MCInst &MI, InsnType insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodeDAHIDATI(MCInst &MI, InsnType insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodeAddiGroupBranch(MCInst &MI, InsnType insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodePOP35GroupBranchMMR6(MCInst &MI, InsnType insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodeDaddiGroupBranch(MCInst &MI, InsnType insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodePOP37GroupBranchMMR6(MCInst &MI, InsnType insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodePOP65GroupBranchMMR6(MCInst &MI, InsnType insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodePOP75GroupBranchMMR6(MCInst &MI, InsnType insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodeBlezlGroupBranch(MCInst &MI, InsnType insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodeBgtzGroupBranch(MCInst &MI, InsnType insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
+ unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4,
+ Mips::S5, Mips::S6, Mips::S7, Mips::FP};
+ unsigned RegNum;
-template <typename InsnType>
-static DecodeStatus DecodeBlezGroupBranch(MCInst &MI, InsnType insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
+ unsigned RegLst = fieldFromInstruction(Insn, 21, 5);
-template <typename InsnType>
-static DecodeStatus DecodeBgtzGroupBranchMMR6(MCInst &MI, InsnType insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
+ // Empty register lists are not allowed.
+ if (RegLst == 0)
+ return MCDisassembler::Fail;
-template <typename InsnType>
-static DecodeStatus DecodeBlezGroupBranchMMR6(MCInst &MI, InsnType insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
+ RegNum = RegLst & 0xf;
-template <typename InsnType>
-static DecodeStatus DecodeDINS(MCInst &MI, InsnType Insn, uint64_t Address,
- const MCDisassembler *Decoder);
+ // RegLst values 10-15, and 26-31 are reserved.
+ if (RegNum > 9)
+ return MCDisassembler::Fail;
-template <typename InsnType>
-static DecodeStatus DecodeDEXT(MCInst &MI, InsnType Insn, uint64_t Address,
- const MCDisassembler *Decoder);
+ for (unsigned i = 0; i < RegNum; i++)
+ Inst.addOperand(MCOperand::createReg(Regs[i]));
-template <typename InsnType>
-static DecodeStatus DecodeCRC(MCInst &MI, InsnType Insn, uint64_t Address,
- const MCDisassembler *Decoder);
+ if (RegLst & 0x10)
+ Inst.addOperand(MCOperand::createReg(Mips::RA));
-static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
+ return MCDisassembler::Success;
+}
static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn,
uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned RegPair,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMovePOperands(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeFIXMEInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static MCDisassembler *createMipsDisassembler(
- const Target &T,
- const MCSubtargetInfo &STI,
- MCContext &Ctx) {
- return new MipsDisassembler(STI, Ctx, true);
-}
-
-static MCDisassembler *createMipselDisassembler(
- const Target &T,
- const MCSubtargetInfo &STI,
- MCContext &Ctx) {
- return new MipsDisassembler(STI, Ctx, false);
-}
+ const MCDisassembler *Decoder) {
+ unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3};
+ unsigned RegLst;
+ switch (Inst.getOpcode()) {
+ default:
+ RegLst = fieldFromInstruction(Insn, 4, 2);
+ break;
+ case Mips::LWM16_MMR6:
+ case Mips::SWM16_MMR6:
+ RegLst = fieldFromInstruction(Insn, 8, 2);
+ break;
+ }
+ unsigned RegNum = RegLst & 0x3;
-extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
-LLVMInitializeMipsDisassembler() {
- // Register the disassembler.
- TargetRegistry::RegisterMCDisassembler(getTheMipsTarget(),
- createMipsDisassembler);
- TargetRegistry::RegisterMCDisassembler(getTheMipselTarget(),
- createMipselDisassembler);
- TargetRegistry::RegisterMCDisassembler(getTheMips64Target(),
- createMipsDisassembler);
- TargetRegistry::RegisterMCDisassembler(getTheMips64elTarget(),
- createMipselDisassembler);
-}
+ for (unsigned i = 0; i <= RegNum; i++)
+ Inst.addOperand(MCOperand::createReg(Regs[i]));
-#include "MipsGenDisassemblerTables.inc"
+ Inst.addOperand(MCOperand::createReg(Mips::RA));
-static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) {
- const MCRegisterInfo *RegInfo = D->getContext().getRegisterInfo();
- return *(RegInfo->getRegClass(RC).begin() + RegNo);
+ return MCDisassembler::Success;
}
template <typename InsnType>
@@ -1095,247 +860,15 @@ static DecodeStatus DecodeCRC(MCInst &MI, InsnType Insn, uint64_t Address,
const MCDisassembler *Decoder) {
InsnType Rs = fieldFromInstruction(Insn, 21, 5);
InsnType Rt = fieldFromInstruction(Insn, 16, 5);
- MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
- Rt)));
- MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
- Rs)));
- MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
- Rt)));
- return MCDisassembler::Success;
-}
-
-/// Read two bytes from the ArrayRef and return 16 bit halfword sorted
-/// according to the given endianness.
-static DecodeStatus readInstruction16(ArrayRef<uint8_t> Bytes, uint64_t Address,
- uint64_t &Size, uint32_t &Insn,
- bool IsBigEndian) {
- // We want to read exactly 2 Bytes of data.
- if (Bytes.size() < 2) {
- Size = 0;
- return MCDisassembler::Fail;
- }
-
- if (IsBigEndian) {
- Insn = (Bytes[0] << 8) | Bytes[1];
- } else {
- Insn = (Bytes[1] << 8) | Bytes[0];
- }
-
- return MCDisassembler::Success;
-}
-
-/// Read four bytes from the ArrayRef and return 32 bit word sorted
-/// according to the given endianness.
-static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
- uint64_t &Size, uint32_t &Insn,
- bool IsBigEndian, bool IsMicroMips) {
- // We want to read exactly 4 Bytes of data.
- if (Bytes.size() < 4) {
- Size = 0;
- return MCDisassembler::Fail;
- }
-
- // High 16 bits of a 32-bit microMIPS instruction (where the opcode is)
- // always precede the low 16 bits in the instruction stream (that is, they
- // are placed at lower addresses in the instruction stream).
- //
- // microMIPS byte ordering:
- // Big-endian: 0 | 1 | 2 | 3
- // Little-endian: 1 | 0 | 3 | 2
-
- if (IsBigEndian) {
- // Encoded as a big-endian 32-bit word in the stream.
- Insn =
- (Bytes[3] << 0) | (Bytes[2] << 8) | (Bytes[1] << 16) | (Bytes[0] << 24);
- } else {
- if (IsMicroMips) {
- Insn = (Bytes[2] << 0) | (Bytes[3] << 8) | (Bytes[0] << 16) |
- (Bytes[1] << 24);
- } else {
- Insn = (Bytes[0] << 0) | (Bytes[1] << 8) | (Bytes[2] << 16) |
- (Bytes[3] << 24);
- }
- }
-
+ MI.addOperand(
+ MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, Rt)));
+ MI.addOperand(
+ MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, Rs)));
+ MI.addOperand(
+ MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, Rt)));
return MCDisassembler::Success;
}
-DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
- ArrayRef<uint8_t> Bytes,
- uint64_t Address,
- raw_ostream &CStream) const {
- uint32_t Insn;
- DecodeStatus Result;
- Size = 0;
-
- if (IsMicroMips) {
- Result = readInstruction16(Bytes, Address, Size, Insn, IsBigEndian);
- if (Result == MCDisassembler::Fail)
- return MCDisassembler::Fail;
-
- if (hasMips32r6()) {
- LLVM_DEBUG(
- dbgs() << "Trying MicroMipsR616 table (16-bit instructions):\n");
- // Calling the auto-generated decoder function for microMIPS32R6
- // 16-bit instructions.
- Result = decodeInstruction(DecoderTableMicroMipsR616, Instr, Insn,
- Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 2;
- return Result;
- }
- }
-
- LLVM_DEBUG(dbgs() << "Trying MicroMips16 table (16-bit instructions):\n");
- // Calling the auto-generated decoder function for microMIPS 16-bit
- // instructions.
- Result = decodeInstruction(DecoderTableMicroMips16, Instr, Insn, Address,
- this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 2;
- return Result;
- }
-
- Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, true);
- if (Result == MCDisassembler::Fail)
- return MCDisassembler::Fail;
-
- if (hasMips32r6()) {
- LLVM_DEBUG(
- dbgs() << "Trying MicroMips32r632 table (32-bit instructions):\n");
- // Calling the auto-generated decoder function.
- Result = decodeInstruction(DecoderTableMicroMipsR632, Instr, Insn,
- Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- return Result;
- }
- }
-
- LLVM_DEBUG(dbgs() << "Trying MicroMips32 table (32-bit instructions):\n");
- // Calling the auto-generated decoder function.
- Result = decodeInstruction(DecoderTableMicroMips32, Instr, Insn, Address,
- this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- return Result;
- }
-
- if (isFP64()) {
- LLVM_DEBUG(dbgs() << "Trying MicroMipsFP64 table (32-bit opcodes):\n");
- Result = decodeInstruction(DecoderTableMicroMipsFP6432, Instr, Insn,
- Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- return Result;
- }
- }
-
- // This is an invalid instruction. Claim that the Size is 2 bytes. Since
- // microMIPS instructions have a minimum alignment of 2, the next 2 bytes
- // could form a valid instruction. The two bytes we rejected as an
- // instruction could have actually beeen an inline constant pool that is
- // unconditionally branched over.
- Size = 2;
- return MCDisassembler::Fail;
- }
-
- // Attempt to read the instruction so that we can attempt to decode it. If
- // the buffer is not 4 bytes long, let the higher level logic figure out
- // what to do with a size of zero and MCDisassembler::Fail.
- Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, false);
- if (Result == MCDisassembler::Fail)
- return MCDisassembler::Fail;
-
- // The only instruction size for standard encoded MIPS.
- Size = 4;
-
- if (hasCOP3()) {
- LLVM_DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n");
- Result =
- decodeInstruction(DecoderTableCOP3_32, Instr, Insn, Address, this, STI);
- if (Result != MCDisassembler::Fail)
- return Result;
- }
-
- if (hasMips32r6() && isGP64()) {
- LLVM_DEBUG(
- dbgs() << "Trying Mips32r6_64r6 (GPR64) table (32-bit opcodes):\n");
- Result = decodeInstruction(DecoderTableMips32r6_64r6_GP6432, Instr, Insn,
- Address, this, STI);
- if (Result != MCDisassembler::Fail)
- return Result;
- }
-
- if (hasMips32r6() && isPTR64()) {
- LLVM_DEBUG(
- dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
- Result = decodeInstruction(DecoderTableMips32r6_64r6_PTR6432, Instr, Insn,
- Address, this, STI);
- if (Result != MCDisassembler::Fail)
- return Result;
- }
-
- if (hasMips32r6()) {
- LLVM_DEBUG(dbgs() << "Trying Mips32r6_64r6 table (32-bit opcodes):\n");
- Result = decodeInstruction(DecoderTableMips32r6_64r632, Instr, Insn,
- Address, this, STI);
- if (Result != MCDisassembler::Fail)
- return Result;
- }
-
- if (hasMips2() && isPTR64()) {
- LLVM_DEBUG(
- dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
- Result = decodeInstruction(DecoderTableMips32_64_PTR6432, Instr, Insn,
- Address, this, STI);
- if (Result != MCDisassembler::Fail)
- return Result;
- }
-
- if (hasCnMips()) {
- LLVM_DEBUG(dbgs() << "Trying CnMips table (32-bit opcodes):\n");
- Result = decodeInstruction(DecoderTableCnMips32, Instr, Insn,
- Address, this, STI);
- if (Result != MCDisassembler::Fail)
- return Result;
- }
-
- if (hasCnMipsP()) {
- LLVM_DEBUG(dbgs() << "Trying CnMipsP table (32-bit opcodes):\n");
- Result = decodeInstruction(DecoderTableCnMipsP32, Instr, Insn,
- Address, this, STI);
- if (Result != MCDisassembler::Fail)
- return Result;
- }
-
- if (isGP64()) {
- LLVM_DEBUG(dbgs() << "Trying Mips64 (GPR64) table (32-bit opcodes):\n");
- Result = decodeInstruction(DecoderTableMips6432, Instr, Insn,
- Address, this, STI);
- if (Result != MCDisassembler::Fail)
- return Result;
- }
-
- if (isFP64()) {
- LLVM_DEBUG(
- dbgs() << "Trying MipsFP64 (64 bit FPU) table (32-bit opcodes):\n");
- Result = decodeInstruction(DecoderTableMipsFP6432, Instr, Insn,
- Address, this, STI);
- if (Result != MCDisassembler::Fail)
- return Result;
- }
-
- LLVM_DEBUG(dbgs() << "Trying Mips table (32-bit opcodes):\n");
- // Calling the auto-generated decoder function.
- Result =
- decodeInstruction(DecoderTableMips32, Instr, Insn, Address, this, STI);
- if (Result != MCDisassembler::Fail)
- return Result;
-
- return MCDisassembler::Fail;
-}
-
static DecodeStatus
DecodeCPU16RegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
const MCDisassembler *Decoder) {
@@ -1971,137 +1504,6 @@ static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst, unsigned Insn,
return MCDisassembler::Success;
}
-static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- // Currently only hardware register 29 is supported.
- if (RegNo != 29)
- return MCDisassembler::Fail;
- Inst.addOperand(MCOperand::createReg(Mips::HWR29));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- if (RegNo > 30 || RegNo %2)
- return MCDisassembler::Fail;
-
- unsigned Reg = getReg(Decoder, Mips::AFGR64RegClassID, RegNo /2);
- Inst.addOperand(MCOperand::createReg(Reg));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- if (RegNo >= 4)
- return MCDisassembler::Fail;
-
- unsigned Reg = getReg(Decoder, Mips::ACC64DSPRegClassID, RegNo);
- Inst.addOperand(MCOperand::createReg(Reg));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- if (RegNo >= 4)
- return MCDisassembler::Fail;
-
- unsigned Reg = getReg(Decoder, Mips::HI32DSPRegClassID, RegNo);
- Inst.addOperand(MCOperand::createReg(Reg));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- if (RegNo >= 4)
- return MCDisassembler::Fail;
-
- unsigned Reg = getReg(Decoder, Mips::LO32DSPRegClassID, RegNo);
- Inst.addOperand(MCOperand::createReg(Reg));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- if (RegNo > 31)
- return MCDisassembler::Fail;
-
- unsigned Reg = getReg(Decoder, Mips::MSA128BRegClassID, RegNo);
- Inst.addOperand(MCOperand::createReg(Reg));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- if (RegNo > 31)
- return MCDisassembler::Fail;
-
- unsigned Reg = getReg(Decoder, Mips::MSA128HRegClassID, RegNo);
- Inst.addOperand(MCOperand::createReg(Reg));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- if (RegNo > 31)
- return MCDisassembler::Fail;
-
- unsigned Reg = getReg(Decoder, Mips::MSA128WRegClassID, RegNo);
- Inst.addOperand(MCOperand::createReg(Reg));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- if (RegNo > 31)
- return MCDisassembler::Fail;
-
- unsigned Reg = getReg(Decoder, Mips::MSA128DRegClassID, RegNo);
- Inst.addOperand(MCOperand::createReg(Reg));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- if (RegNo > 7)
- return MCDisassembler::Fail;
-
- unsigned Reg = getReg(Decoder, Mips::MSACtrlRegClassID, RegNo);
- Inst.addOperand(MCOperand::createReg(Reg));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeCOP0RegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- if (RegNo > 31)
- return MCDisassembler::Fail;
-
- unsigned Reg = getReg(Decoder, Mips::COP0RegClassID, RegNo);
- Inst.addOperand(MCOperand::createReg(Reg));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- if (RegNo > 31)
- return MCDisassembler::Fail;
-
- unsigned Reg = getReg(Decoder, Mips::COP2RegClassID, RegNo);
- Inst.addOperand(MCOperand::createReg(Reg));
- return MCDisassembler::Success;
-}
-
static DecodeStatus DecodeBranchTarget(MCInst &Inst, unsigned Offset,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -2241,7 +1643,7 @@ DecodeUImmWithOffsetAndScale(MCInst &Inst, unsigned Value, uint64_t Address,
return MCDisassembler::Success;
}
-template <unsigned Bits, int Offset, int ScaleBy>
+template <unsigned Bits, int Offset = 0, int ScaleBy = 1>
static DecodeStatus
DecodeSImmWithOffsetAndScale(MCInst &Inst, unsigned Value, uint64_t Address,
const MCDisassembler *Decoder) {
@@ -2250,6 +1652,14 @@ DecodeSImmWithOffsetAndScale(MCInst &Inst, unsigned Value, uint64_t Address,
return MCDisassembler::Success;
}
+template <unsigned Bits, int Offset>
+static DecodeStatus DecodeUImmWithOffset(MCInst &Inst, unsigned Value,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ return DecodeUImmWithOffsetAndScale<Bits, Offset, 1>(Inst, Value, Address,
+ Decoder);
+}
+
static DecodeStatus DecodeInsSize(MCInst &Inst, unsigned Insn, uint64_t Address,
const MCDisassembler *Decoder) {
// First we need to grab the pos(lsb) from MCInst.
@@ -2294,90 +1704,12 @@ static DecodeStatus DecodeANDI16Imm(MCInst &Inst, unsigned Insn,
const MCDisassembler *Decoder) {
// Insn must be >= 0, since it is unsigned that condition is always true.
assert(Insn < 16);
- int32_t DecodedValues[] = {128, 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64,
- 255, 32768, 65535};
+ int32_t DecodedValues[] = {128, 1, 2, 3, 4, 7, 8, 15,
+ 16, 31, 32, 63, 64, 255, 32768, 65535};
Inst.addOperand(MCOperand::createImm(DecodedValues[Insn]));
return MCDisassembler::Success;
}
-static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4, Mips::S5,
- Mips::S6, Mips::S7, Mips::FP};
- unsigned RegNum;
-
- unsigned RegLst = fieldFromInstruction(Insn, 21, 5);
-
- // Empty register lists are not allowed.
- if (RegLst == 0)
- return MCDisassembler::Fail;
-
- RegNum = RegLst & 0xf;
-
- // RegLst values 10-15, and 26-31 are reserved.
- if (RegNum > 9)
- return MCDisassembler::Fail;
-
- for (unsigned i = 0; i < RegNum; i++)
- Inst.addOperand(MCOperand::createReg(Regs[i]));
-
- if (RegLst & 0x10)
- Inst.addOperand(MCOperand::createReg(Mips::RA));
-
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3};
- unsigned RegLst;
- switch(Inst.getOpcode()) {
- default:
- RegLst = fieldFromInstruction(Insn, 4, 2);
- break;
- case Mips::LWM16_MMR6:
- case Mips::SWM16_MMR6:
- RegLst = fieldFromInstruction(Insn, 8, 2);
- break;
- }
- unsigned RegNum = RegLst & 0x3;
-
- for (unsigned i = 0; i <= RegNum; i++)
- Inst.addOperand(MCOperand::createReg(Regs[i]));
-
- Inst.addOperand(MCOperand::createReg(Mips::RA));
-
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeMovePOperands(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned RegPair = fieldFromInstruction(Insn, 7, 3);
- if (DecodeMovePRegPair(Inst, RegPair, Address, Decoder) ==
- MCDisassembler::Fail)
- return MCDisassembler::Fail;
-
- unsigned RegRs;
- if (static_cast<const MipsDisassembler*>(Decoder)->hasMips32r6())
- RegRs = fieldFromInstruction(Insn, 0, 2) |
- (fieldFromInstruction(Insn, 3, 1) << 2);
- else
- RegRs = fieldFromInstruction(Insn, 1, 3);
- if (DecodeGPRMM16MovePRegisterClass(Inst, RegRs, Address, Decoder) ==
- MCDisassembler::Fail)
- return MCDisassembler::Fail;
-
- unsigned RegRt = fieldFromInstruction(Insn, 4, 3);
- if (DecodeGPRMM16MovePRegisterClass(Inst, RegRt, Address, Decoder) ==
- MCDisassembler::Fail)
- return MCDisassembler::Fail;
-
- return MCDisassembler::Success;
-}
-
static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned RegPair,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -2421,6 +1753,32 @@ static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned RegPair,
return MCDisassembler::Success;
}
+/// Decode the three operands handled by the MoveP decoders: a register pair
+/// (via DecodeMovePRegPair) followed by two GPRMM16MoveP registers (rs, rt).
+///
+/// Operands are appended to \p Inst in that order. Decoding stops and
+/// MCDisassembler::Fail is returned as soon as one of the operand decoders
+/// reports Fail; otherwise MCDisassembler::Success is returned.
+static DecodeStatus DecodeMovePOperands(MCInst &Inst, unsigned Insn,
+                                        uint64_t Address,
+                                        const MCDisassembler *Decoder) {
+  // Register pair selector: fieldFromInstruction(Insn, 7, 3).
+  const unsigned PairField = fieldFromInstruction(Insn, 7, 3);
+  if (DecodeMovePRegPair(Inst, PairField, Address, Decoder) ==
+      MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  // The rs field is assembled from two pieces when the subtarget has
+  // Mips32r6, and read as one contiguous field otherwise.
+  const bool HasR6 =
+      static_cast<const MipsDisassembler *>(Decoder)->hasMips32r6();
+  const unsigned RsField = HasR6
+                               ? (fieldFromInstruction(Insn, 0, 2) |
+                                  (fieldFromInstruction(Insn, 3, 1) << 2))
+                               : fieldFromInstruction(Insn, 1, 3);
+  if (DecodeGPRMM16MovePRegisterClass(Inst, RsField, Address, Decoder) ==
+      MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  const unsigned RtField = fieldFromInstruction(Insn, 4, 3);
+  if (DecodeGPRMM16MovePRegisterClass(Inst, RtField, Address, Decoder) ==
+      MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeSimm23Lsl2(MCInst &Inst, unsigned Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -2528,3 +1886,237 @@ static DecodeStatus DecodeFIXMEInstruction(MCInst &Inst, unsigned Insn,
const MCDisassembler *Decoder) {
return MCDisassembler::Fail;
}
+
+#include "MipsGenDisassemblerTables.inc"
+
+/// Assemble a 16-bit halfword from the first two bytes of \p Bytes, ordered
+/// according to \p IsBigEndian, and store it in \p Insn.
+///
+/// If fewer than two bytes are available, \p Size is set to 0, \p Insn is left
+/// untouched and MCDisassembler::Fail is returned. On success \p Size is not
+/// modified. \p Address is not used.
+static DecodeStatus readInstruction16(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                      uint64_t &Size, uint32_t &Insn,
+                                      bool IsBigEndian) {
+  if (Bytes.size() < 2) {
+    // Not enough data for a halfword.
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // For big-endian the first byte is the high half; for little-endian it is
+  // the second one.
+  const unsigned HighIdx = IsBigEndian ? 0 : 1;
+  const unsigned LowIdx = 1 - HighIdx;
+  Insn = (Bytes[HighIdx] << 8) | Bytes[LowIdx];
+
+  return MCDisassembler::Success;
+}
+
+/// Assemble a 32-bit word from the first four bytes of \p Bytes and store it
+/// in \p Insn, using the byte order selected by \p IsBigEndian and
+/// \p IsMicroMips.
+///
+/// If fewer than four bytes are available, \p Size is set to 0, \p Insn is
+/// left untouched and MCDisassembler::Fail is returned. On success \p Size is
+/// not modified. \p Address is not used.
+static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                      uint64_t &Size, uint32_t &Insn,
+                                      bool IsBigEndian, bool IsMicroMips) {
+  if (Bytes.size() < 4) {
+    // Not enough data for a full word.
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // High 16 bits of a 32-bit microMIPS instruction (where the opcode is)
+  // always precede the low 16 bits in the instruction stream (that is, they
+  // are placed at lower addresses in the instruction stream).
+  //
+  // Each table lists, from least to most significant result byte, the index
+  // into Bytes that supplies it.
+  const unsigned BigEndianOrder[4] = {3, 2, 1, 0};
+  const unsigned LittleMicroMipsOrder[4] = {2, 3, 0, 1};
+  const unsigned LittleEndianOrder[4] = {0, 1, 2, 3};
+
+  const unsigned *Order = LittleEndianOrder;
+  if (IsBigEndian)
+    Order = BigEndianOrder; // IsMicroMips is irrelevant for big-endian.
+  else if (IsMicroMips)
+    Order = LittleMicroMipsOrder;
+
+  uint32_t Word = 0;
+  for (unsigned I = 0; I < 4; ++I)
+    Word |= static_cast<uint32_t>(Bytes[Order[I]]) << (8 * I);
+  Insn = Word;
+
+  return MCDisassembler::Success;
+}
+
+/// Decode a single instruction from \p Bytes into \p Instr.
+///
+/// For microMIPS the 16-bit decoder tables are tried first and then the
+/// 32-bit ones; for the standard encoding a fixed sequence of 32-bit tables is
+/// tried, each gated on a subtarget predicate. The first table whose result is
+/// not MCDisassembler::Fail determines the return value.
+///
+/// \p Size reports the number of bytes consumed: 2 or 4 on a successful
+/// decode, 0 when the buffer is too short to read the required bytes, 2 for an
+/// undecodable microMIPS instruction and 4 for an undecodable standard
+/// instruction. \p CStream is not used by this function.
+DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
+                                              ArrayRef<uint8_t> Bytes,
+                                              uint64_t Address,
+                                              raw_ostream &CStream) const {
+  uint32_t Insn;
+  DecodeStatus Result;
+  Size = 0;
+
+  if (IsMicroMips) {
+    Result = readInstruction16(Bytes, Address, Size, Insn, IsBigEndian);
+    if (Result == MCDisassembler::Fail)
+      return MCDisassembler::Fail;
+
+    if (hasMips32r6()) {
+      LLVM_DEBUG(
+          dbgs() << "Trying MicroMipsR616 table (16-bit instructions):\n");
+      // Calling the auto-generated decoder function for microMIPS32R6
+      // 16-bit instructions.
+      Result = decodeInstruction(DecoderTableMicroMipsR616, Instr, Insn,
+                                 Address, this, STI);
+      if (Result != MCDisassembler::Fail) {
+        Size = 2;
+        return Result;
+      }
+    }
+
+    LLVM_DEBUG(dbgs() << "Trying MicroMips16 table (16-bit instructions):\n");
+    // Calling the auto-generated decoder function for microMIPS 16-bit
+    // instructions.
+    Result = decodeInstruction(DecoderTableMicroMips16, Instr, Insn, Address,
+                               this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 2;
+      return Result;
+    }
+
+    Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, true);
+    if (Result == MCDisassembler::Fail)
+      return MCDisassembler::Fail;
+
+    if (hasMips32r6()) {
+      LLVM_DEBUG(
+          dbgs() << "Trying MicroMips32r632 table (32-bit instructions):\n");
+      // Calling the auto-generated decoder function.
+      Result = decodeInstruction(DecoderTableMicroMipsR632, Instr, Insn,
+                                 Address, this, STI);
+      if (Result != MCDisassembler::Fail) {
+        Size = 4;
+        return Result;
+      }
+    }
+
+    LLVM_DEBUG(dbgs() << "Trying MicroMips32 table (32-bit instructions):\n");
+    // Calling the auto-generated decoder function.
+    Result = decodeInstruction(DecoderTableMicroMips32, Instr, Insn, Address,
+                               this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result;
+    }
+
+    if (isFP64()) {
+      LLVM_DEBUG(dbgs() << "Trying MicroMipsFP64 table (32-bit opcodes):\n");
+      Result = decodeInstruction(DecoderTableMicroMipsFP6432, Instr, Insn,
+                                 Address, this, STI);
+      if (Result != MCDisassembler::Fail) {
+        Size = 4;
+        return Result;
+      }
+    }
+
+    // This is an invalid instruction. Claim that the Size is 2 bytes. Since
+    // microMIPS instructions have a minimum alignment of 2, the next 2 bytes
+    // could form a valid instruction. The two bytes we rejected as an
+    // instruction could have actually been an inline constant pool that is
+    // unconditionally branched over.
+    Size = 2;
+    return MCDisassembler::Fail;
+  }
+
+  // Attempt to read the instruction so that we can attempt to decode it. If
+  // the buffer is not 4 bytes long, let the higher level logic figure out
+  // what to do with a size of zero and MCDisassembler::Fail.
+  Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, false);
+  if (Result == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  // The only instruction size for standard encoded MIPS.
+  Size = 4;
+
+  if (hasCOP3()) {
+    LLVM_DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n");
+    Result =
+        decodeInstruction(DecoderTableCOP3_32, Instr, Insn, Address, this, STI);
+    if (Result != MCDisassembler::Fail)
+      return Result;
+  }
+
+  if (hasMips32r6() && isGP64()) {
+    LLVM_DEBUG(
+        dbgs() << "Trying Mips32r6_64r6 (GPR64) table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableMips32r6_64r6_GP6432, Instr, Insn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail)
+      return Result;
+  }
+
+  if (hasMips32r6() && isPTR64()) {
+    LLVM_DEBUG(
+        dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableMips32r6_64r6_PTR6432, Instr, Insn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail)
+      return Result;
+  }
+
+  if (hasMips32r6()) {
+    LLVM_DEBUG(dbgs() << "Trying Mips32r6_64r6 table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableMips32r6_64r632, Instr, Insn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail)
+      return Result;
+  }
+
+  if (hasMips2() && isPTR64()) {
+    // Name the table actually being tried (Mips32_64_PTR64) so that debug
+    // traces can tell this attempt apart from the r6 PTR64 one above.
+    LLVM_DEBUG(
+        dbgs() << "Trying Mips32_64 (PTR64) table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableMips32_64_PTR6432, Instr, Insn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail)
+      return Result;
+  }
+
+  if (hasCnMips()) {
+    LLVM_DEBUG(dbgs() << "Trying CnMips table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableCnMips32, Instr, Insn, Address, this,
+                               STI);
+    if (Result != MCDisassembler::Fail)
+      return Result;
+  }
+
+  if (hasCnMipsP()) {
+    LLVM_DEBUG(dbgs() << "Trying CnMipsP table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableCnMipsP32, Instr, Insn, Address,
+                               this, STI);
+    if (Result != MCDisassembler::Fail)
+      return Result;
+  }
+
+  if (isGP64()) {
+    LLVM_DEBUG(dbgs() << "Trying Mips64 (GPR64) table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableMips6432, Instr, Insn, Address, this,
+                               STI);
+    if (Result != MCDisassembler::Fail)
+      return Result;
+  }
+
+  if (isFP64()) {
+    LLVM_DEBUG(
+        dbgs() << "Trying MipsFP64 (64 bit FPU) table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableMipsFP6432, Instr, Insn, Address,
+                               this, STI);
+    if (Result != MCDisassembler::Fail)
+      return Result;
+  }
+
+  LLVM_DEBUG(dbgs() << "Trying Mips table (32-bit opcodes):\n");
+  // Calling the auto-generated decoder function.
+  Result =
+      decodeInstruction(DecoderTableMips32, Instr, Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail)
+    return Result;
+
+  return MCDisassembler::Fail;
+}
diff --git a/llvm/lib/Target/Mips/Mips16ISelLowering.cpp b/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
index 330cb4e0e206..7bd96b571bc6 100644
--- a/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -56,48 +56,52 @@ struct Mips16IntrinsicHelperType{
// Libcalls for which no helper is generated. Sorted by name for binary search.
static const Mips16Libcall HardFloatLibCalls[] = {
- {RTLIB::ADD_F64, RTLIB::__mips16_adddf3, "__mips16_adddf3"},
- {RTLIB::ADD_F32, RTLIB::__mips16_addsf3, "__mips16_addsf3"},
- {RTLIB::DIV_F64, RTLIB::__mips16_divdf3, "__mips16_divdf3"},
- {RTLIB::DIV_F32, RTLIB::__mips16_divsf3, "__mips16_divsf3"},
- {RTLIB::OEQ_F64, RTLIB::__mips16_eqdf2, "__mips16_eqdf2"},
- {RTLIB::OEQ_F32, RTLIB::__mips16_eqsf2, "__mips16_eqsf2"},
- {RTLIB::FPEXT_F32_F64, RTLIB::__mips16_extendsfdf2, "__mips16_extendsfdf2"},
- {RTLIB::FPTOSINT_F64_I32, RTLIB::__mips16_fix_truncdfsi,
+ {RTLIB::ADD_F64, RTLIB::impl___mips16_adddf3, "__mips16_adddf3"},
+ {RTLIB::ADD_F32, RTLIB::impl___mips16_addsf3, "__mips16_addsf3"},
+ {RTLIB::DIV_F64, RTLIB::impl___mips16_divdf3, "__mips16_divdf3"},
+ {RTLIB::DIV_F32, RTLIB::impl___mips16_divsf3, "__mips16_divsf3"},
+ {RTLIB::OEQ_F64, RTLIB::impl___mips16_eqdf2, "__mips16_eqdf2"},
+ {RTLIB::OEQ_F32, RTLIB::impl___mips16_eqsf2, "__mips16_eqsf2"},
+ {RTLIB::FPEXT_F32_F64, RTLIB::impl___mips16_extendsfdf2,
+ "__mips16_extendsfdf2"},
+ {RTLIB::FPTOSINT_F64_I32, RTLIB::impl___mips16_fix_truncdfsi,
"__mips16_fix_truncdfsi"},
- {RTLIB::FPTOSINT_F32_I32, RTLIB::__mips16_fix_truncsfsi,
+ {RTLIB::FPTOSINT_F32_I32, RTLIB::impl___mips16_fix_truncsfsi,
"__mips16_fix_truncsfsi"},
- {RTLIB::SINTTOFP_I32_F64, RTLIB::__mips16_floatsidf, "__mips16_floatsidf"},
- {RTLIB::SINTTOFP_I32_F32, RTLIB::__mips16_floatsisf, "__mips16_floatsisf"},
- {RTLIB::UINTTOFP_I32_F64, RTLIB::__mips16_floatunsidf,
+ {RTLIB::SINTTOFP_I32_F64, RTLIB::impl___mips16_floatsidf,
+ "__mips16_floatsidf"},
+ {RTLIB::SINTTOFP_I32_F32, RTLIB::impl___mips16_floatsisf,
+ "__mips16_floatsisf"},
+ {RTLIB::UINTTOFP_I32_F64, RTLIB::impl___mips16_floatunsidf,
"__mips16_floatunsidf"},
- {RTLIB::UINTTOFP_I32_F32, RTLIB::__mips16_floatunsisf,
+ {RTLIB::UINTTOFP_I32_F32, RTLIB::impl___mips16_floatunsisf,
"__mips16_floatunsisf"},
- {RTLIB::OGE_F64, RTLIB::__mips16_gedf2, "__mips16_gedf2"},
- {RTLIB::OGE_F32, RTLIB::__mips16_gesf2, "__mips16_gesf2"},
- {RTLIB::OGT_F64, RTLIB::__mips16_gtdf2, "__mips16_gtdf2"},
- {RTLIB::OGT_F32, RTLIB::__mips16_gtsf2, "__mips16_gtsf2"},
- {RTLIB::OLE_F64, RTLIB::__mips16_ledf2, "__mips16_ledf2"},
- {RTLIB::OLE_F32, RTLIB::__mips16_lesf2, "__mips16_lesf2"},
- {RTLIB::OLT_F64, RTLIB::__mips16_ltdf2, "__mips16_ltdf2"},
- {RTLIB::OLT_F32, RTLIB::__mips16_ltsf2, "__mips16_ltsf2"},
- {RTLIB::MUL_F64, RTLIB::__mips16_muldf3, "__mips16_muldf3"},
- {RTLIB::MUL_F32, RTLIB::__mips16_mulsf3, "__mips16_mulsf3"},
- {RTLIB::UNE_F64, RTLIB::__mips16_nedf2, "__mips16_nedf2"},
- {RTLIB::UNE_F32, RTLIB::__mips16_nesf2, "__mips16_nesf2"},
- {RTLIB::UNKNOWN_LIBCALL, RTLIB::__mips16_ret_dc,
+ {RTLIB::OGE_F64, RTLIB::impl___mips16_gedf2, "__mips16_gedf2"},
+ {RTLIB::OGE_F32, RTLIB::impl___mips16_gesf2, "__mips16_gesf2"},
+ {RTLIB::OGT_F64, RTLIB::impl___mips16_gtdf2, "__mips16_gtdf2"},
+ {RTLIB::OGT_F32, RTLIB::impl___mips16_gtsf2, "__mips16_gtsf2"},
+ {RTLIB::OLE_F64, RTLIB::impl___mips16_ledf2, "__mips16_ledf2"},
+ {RTLIB::OLE_F32, RTLIB::impl___mips16_lesf2, "__mips16_lesf2"},
+ {RTLIB::OLT_F64, RTLIB::impl___mips16_ltdf2, "__mips16_ltdf2"},
+ {RTLIB::OLT_F32, RTLIB::impl___mips16_ltsf2, "__mips16_ltsf2"},
+ {RTLIB::MUL_F64, RTLIB::impl___mips16_muldf3, "__mips16_muldf3"},
+ {RTLIB::MUL_F32, RTLIB::impl___mips16_mulsf3, "__mips16_mulsf3"},
+ {RTLIB::UNE_F64, RTLIB::impl___mips16_nedf2, "__mips16_nedf2"},
+ {RTLIB::UNE_F32, RTLIB::impl___mips16_nesf2, "__mips16_nesf2"},
+ {RTLIB::UNKNOWN_LIBCALL, RTLIB::impl___mips16_ret_dc,
"__mips16_ret_dc"}, // No associated libcall.
- {RTLIB::UNKNOWN_LIBCALL, RTLIB::__mips16_ret_df,
+ {RTLIB::UNKNOWN_LIBCALL, RTLIB::impl___mips16_ret_df,
"__mips16_ret_df"}, // No associated libcall.
- {RTLIB::UNKNOWN_LIBCALL, RTLIB::__mips16_ret_sc,
+ {RTLIB::UNKNOWN_LIBCALL, RTLIB::impl___mips16_ret_sc,
"__mips16_ret_sc"}, // No associated libcall.
- {RTLIB::UNKNOWN_LIBCALL, RTLIB::__mips16_ret_sf,
+ {RTLIB::UNKNOWN_LIBCALL, RTLIB::impl___mips16_ret_sf,
"__mips16_ret_sf"}, // No associated libcall.
- {RTLIB::SUB_F64, RTLIB::__mips16_subdf3, "__mips16_subdf3"},
- {RTLIB::SUB_F32, RTLIB::__mips16_subsf3, "__mips16_subsf3"},
- {RTLIB::FPROUND_F64_F32, RTLIB::__mips16_truncdfsf2, "__mips16_truncdfsf2"},
- {RTLIB::UO_F64, RTLIB::__mips16_unorddf2, "__mips16_unorddf2"},
- {RTLIB::UO_F32, RTLIB::__mips16_unordsf2, "__mips16_unordsf2"}};
+ {RTLIB::SUB_F64, RTLIB::impl___mips16_subdf3, "__mips16_subdf3"},
+ {RTLIB::SUB_F32, RTLIB::impl___mips16_subsf3, "__mips16_subsf3"},
+ {RTLIB::FPROUND_F64_F32, RTLIB::impl___mips16_truncdfsf2,
+ "__mips16_truncdfsf2"},
+ {RTLIB::UO_F64, RTLIB::impl___mips16_unorddf2, "__mips16_unorddf2"},
+ {RTLIB::UO_F32, RTLIB::impl___mips16_unordsf2, "__mips16_unordsf2"}};
static const Mips16IntrinsicHelperType Mips16IntrinsicHelper[] = {
{"__fixunsdfsi", "__mips16_call_stub_2" },
diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.td b/llvm/lib/Target/Mips/Mips16InstrInfo.td
index fb2a83dc90ea..ab473c133b8e 100644
--- a/llvm/lib/Target/Mips/Mips16InstrInfo.td
+++ b/llvm/lib/Target/Mips/Mips16InstrInfo.td
@@ -374,8 +374,8 @@ class FRR16_JALRC_RA_only_ins<bits<1> nd_, bits<1> l_,
class FRR16_JALRC_ins<bits<1> nd, bits<1> l, bits<1> ra,
string asmstr, InstrItinClass itin>:
- FRR16_JALRC<nd, l, ra, (outs), (ins CPU16Regs:$rs),
- !strconcat(asmstr, "\t$rs"), [], itin> ;
+ FRR16_JALRC<nd, l, ra, (outs), (ins CPU16Regs:$rx),
+ !strconcat(asmstr, "\t$rx"), [], itin> ;
class FRR_SF16_ins
<bits<5> _funct, bits<3> _subfunc,
@@ -776,7 +776,6 @@ def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIM16Alu> {
}
def JrcRx16: FRR16_JALRC_ins<1, 1, 0, "jrc", IIM16Alu> {
- let rx = 0b000;
let isBranch = 1;
let isIndirectBranch = 1;
let isTerminator=1;
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index ae91c97e2a80..9d8b9f86daf7 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -967,8 +967,7 @@ void MipsAsmPrinter::EmitFPCallStub(
// freed) and since we're at the global level we can use the default
// constructed subtarget.
std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
- TM.getTargetTriple().str(), TM.getTargetCPU(),
- TM.getTargetFeatureString()));
+ TM.getTargetTriple(), TM.getTargetCPU(), TM.getTargetFeatureString()));
//
// .global xxxx
diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/llvm/lib/Target/Mips/MipsInstrInfo.cpp
index 8a59532ba578..bffdffa4af6a 100644
--- a/llvm/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsInstrInfo.cpp
@@ -40,7 +40,7 @@ using namespace llvm;
void MipsInstrInfo::anchor() {}
MipsInstrInfo::MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBr)
- : MipsGenInstrInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP),
+ : MipsGenInstrInfo(STI, Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP),
Subtarget(STI), UncondBrOpc(UncondBr) {}
const MipsInstrInfo *MipsInstrInfo::create(MipsSubtarget &STI) {
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index ee1ca4538554..f9bdc0993533 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -290,7 +290,8 @@ void NVPTXInstPrinter::printAtomicCode(const MCInst *MI, int OpNum,
O << ".acq_rel";
return;
case NVPTX::Ordering::SequentiallyConsistent:
- O << ".seq_cst";
+ report_fatal_error(
+ "NVPTX AtomicCode Printer does not support \"seq_cst\" ordering.");
return;
case NVPTX::Ordering::Volatile:
O << ".volatile";
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index 8a445f82e700..31c117a8c0fe 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -80,9 +80,9 @@ class FeaturePTX<int version>:
// + Compare within the family by comparing FullSMVersion, given both belongs to
// the same family.
// + Detect 'a' variants by checking FullSMVersion & 1.
-foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
- 60, 61, 62, 70, 72, 75, 80, 86, 87,
- 89, 90, 100, 101, 103, 120, 121] in {
+foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53, 60,
+ 61, 62, 70, 72, 75, 80, 86, 87, 88, 89,
+ 90, 100, 101, 103, 110, 120, 121] in {
// Base SM version (e.g. FullSMVersion for sm_100 is 1000)
def SM#sm : FeatureSM<""#sm, !mul(sm, 10)>;
@@ -127,6 +127,7 @@ def : Proc<"sm_75", [SM75, PTX63]>;
def : Proc<"sm_80", [SM80, PTX70]>;
def : Proc<"sm_86", [SM86, PTX71]>;
def : Proc<"sm_87", [SM87, PTX74]>;
+def : Proc<"sm_88", [SM88, PTX90]>;
def : Proc<"sm_89", [SM89, PTX78]>;
def : Proc<"sm_90", [SM90, PTX78]>;
def : Proc<"sm_90a", [SM90a, PTX80]>;
@@ -139,6 +140,9 @@ def : Proc<"sm_101f", [SM101f, PTX88]>;
def : Proc<"sm_103", [SM103, PTX88]>;
def : Proc<"sm_103a", [SM103a, PTX88]>;
def : Proc<"sm_103f", [SM103f, PTX88]>;
+def : Proc<"sm_110", [SM110, PTX90]>;
+def : Proc<"sm_110a", [SM110a, PTX90]>;
+def : Proc<"sm_110f", [SM110f, PTX90]>;
def : Proc<"sm_120", [SM120, PTX87]>;
def : Proc<"sm_120a", [SM120a, PTX87]>;
def : Proc<"sm_120f", [SM120f, PTX88]>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 7391c2d488b5..14ca867023e2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -432,7 +432,7 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
// .maxclusterrank directive requires SM_90 or higher, make sure that we
// filter it out for lower SM versions, as it causes a hard ptxas crash.
const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
- const auto *STI = static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
+ const NVPTXSubtarget *STI = &NTM.getSubtarget<NVPTXSubtarget>(F);
if (STI->getSmVersion() >= 90) {
const auto ClusterDim = getClusterDim(F);
@@ -669,7 +669,7 @@ void NVPTXAsmPrinter::emitStartOfAsmFile(Module &M) {
// rest of NVPTX isn't friendly to change subtargets per function and
// so the default TargetMachine will have all of the options.
const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
- const auto* STI = static_cast<const NVPTXSubtarget*>(NTM.getSubtargetImpl());
+ const NVPTXSubtarget *STI = NTM.getSubtargetImpl();
SmallString<128> Str1;
raw_svector_ostream OS1(Str1);
@@ -680,8 +680,7 @@ void NVPTXAsmPrinter::emitStartOfAsmFile(Module &M) {
bool NVPTXAsmPrinter::doInitialization(Module &M) {
const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
- const NVPTXSubtarget &STI =
- *static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
+ const NVPTXSubtarget &STI = *NTM.getSubtargetImpl();
if (M.alias_size() && (STI.getPTXVersion() < 63 || STI.getSmVersion() < 30))
report_fatal_error(".alias requires PTX version >= 6.3 and sm_30");
@@ -716,8 +715,7 @@ void NVPTXAsmPrinter::emitGlobals(const Module &M) {
assert(GVVisiting.size() == 0 && "Did not fully process a global variable");
const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
- const NVPTXSubtarget &STI =
- *static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
+ const NVPTXSubtarget &STI = *NTM.getSubtargetImpl();
// Print out module-level global variables in proper order
for (const GlobalVariable *GV : Globals)
@@ -1178,8 +1176,7 @@ void NVPTXAsmPrinter::emitDemotedVars(const Function *F, raw_ostream &O) {
ArrayRef<const GlobalVariable *> GVars = It->second;
const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
- const NVPTXSubtarget &STI =
- *static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
+ const NVPTXSubtarget &STI = *NTM.getSubtargetImpl();
for (const GlobalVariable *GV : GVars) {
O << "\t// demoted variable\n\t";
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 3300ed9a5a81..c70f48af33cf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -170,6 +170,10 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
}
break;
}
+ case NVPTXISD::ATOMIC_CMP_SWAP_B128:
+ case NVPTXISD::ATOMIC_SWAP_B128:
+ selectAtomicSwap128(N);
+ return;
case ISD::FADD:
case ISD::FMUL:
case ISD::FSUB:
@@ -1097,11 +1101,6 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
if (PlainLoad && PlainLoad->isIndexed())
return false;
- const EVT LoadedEVT = LD->getMemoryVT();
- if (!LoadedEVT.isSimple())
- return false;
- const MVT LoadedVT = LoadedEVT.getSimpleVT();
-
// Address Space Setting
const auto CodeAddrSpace = getAddrSpace(LD);
if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
@@ -1111,7 +1110,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
SDValue Chain = N->getOperand(0);
const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
- const unsigned FromTypeWidth = LoadedVT.getSizeInBits();
+ const unsigned FromTypeWidth = LD->getMemoryVT().getSizeInBits();
// Vector Setting
const unsigned FromType =
@@ -1165,9 +1164,6 @@ static unsigned getStoreVectorNumElts(SDNode *N) {
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
MemSDNode *LD = cast<MemSDNode>(N);
- const EVT MemEVT = LD->getMemoryVT();
- if (!MemEVT.isSimple())
- return false;
// Address Space Setting
const auto CodeAddrSpace = getAddrSpace(LD);
@@ -1237,10 +1233,6 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
- const EVT LoadedEVT = LD->getMemoryVT();
- if (!LoadedEVT.isSimple())
- return false;
-
SDLoc DL(LD);
unsigned ExtensionType;
@@ -1357,10 +1349,6 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (PlainStore && PlainStore->isIndexed())
return false;
- const EVT StoreVT = ST->getMemoryVT();
- if (!StoreVT.isSimple())
- return false;
-
// Address Space Setting
const auto CodeAddrSpace = getAddrSpace(ST);
@@ -1369,7 +1357,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
// Vector Setting
- const unsigned ToTypeWidth = StoreVT.getSimpleVT().getSizeInBits();
+ const unsigned ToTypeWidth = ST->getMemoryVT().getSizeInBits();
// Create the machine instruction DAG
SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
@@ -1406,8 +1394,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
MemSDNode *ST = cast<MemSDNode>(N);
- const EVT StoreVT = ST->getMemoryVT();
- assert(StoreVT.isSimple() && "Store value is not simple");
+ const unsigned TotalWidth = ST->getMemoryVT().getSizeInBits();
// Address Space Setting
const auto CodeAddrSpace = getAddrSpace(ST);
@@ -1420,10 +1407,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
SDValue Chain = ST->getChain();
const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
- // Type Setting: toType + toTypeWidth
- // - for integer type, always use 'u'
- const unsigned TotalWidth = StoreVT.getSimpleVT().getSizeInBits();
-
const unsigned NumElts = getStoreVectorNumElts(ST);
SmallVector<SDValue, 16> Ops;
@@ -2337,3 +2320,30 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
}
}
}
+
+/// Select an ATOMIC_SWAP_B128 / ATOMIC_CMP_SWAP_B128 node into the matching
+/// machine node (ATOM_EXCH_B128 / ATOM_CAS_B128) and replace \p N with it.
+///
+/// The machine node's operand list is: base and offset obtained from
+/// selectADDR on operand 1, every operand of \p N from index 2 onward, the
+/// memory order, atomic scope and address space as i32 immediates, and
+/// finally the chain (operand 0 of \p N). The memory operand of \p N is
+/// carried over to the new node.
+void NVPTXDAGToDAGISel::selectAtomicSwap128(SDNode *N) {
+  MemSDNode *MemN = cast<MemSDNode>(N);
+  SDLoc DL(N);
+
+  const SDValue Chain = N->getOperand(0);
+  const auto [AddrBase, AddrOffset] = selectADDR(N->getOperand(1), CurDAG);
+
+  SmallVector<SDValue, 5> Ops;
+  Ops.push_back(AddrBase);
+  Ops.push_back(AddrOffset);
+  // Forward the remaining value operands unchanged.
+  for (unsigned I = 2, E = N->getNumOperands(); I < E; ++I)
+    Ops.push_back(N->getOperand(I));
+  Ops.push_back(getI32Imm(getMemOrder(MemN), DL));
+  Ops.push_back(getI32Imm(getAtomicScope(MemN), DL));
+  Ops.push_back(getI32Imm(getAddrSpace(MemN), DL));
+  Ops.push_back(Chain);
+
+  assert(N->getOpcode() == NVPTXISD::ATOMIC_CMP_SWAP_B128 ||
+         N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128);
+  unsigned Opcode = NVPTX::ATOM_CAS_B128;
+  if (N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128)
+    Opcode = NVPTX::ATOM_EXCH_B128;
+
+  auto *AtomNode = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+  CurDAG->setNodeMemRefs(AtomNode, MemN->getMemOperand());
+
+  ReplaceNode(N, AtomNode);
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index e2ad55bc1796..8dcd5362c451 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -90,6 +90,7 @@ private:
bool IsIm2Col = false);
void SelectTcgen05Ld(SDNode *N, bool hasOffset = false);
void SelectTcgen05St(SDNode *N, bool hasOffset = false);
+ void selectAtomicSwap128(SDNode *N);
inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index bb4bb1195f78..d3fb657851fe 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -198,6 +198,12 @@ static bool IsPTXVectorType(MVT VT) {
static std::optional<std::pair<unsigned int, MVT>>
getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI,
unsigned AddressSpace) {
+ const bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace);
+
+ if (CanLowerTo256Bit && VectorEVT.isScalarInteger() &&
+ VectorEVT.getSizeInBits() == 256)
+ return {{4, MVT::i64}};
+
if (!VectorEVT.isSimple())
return std::nullopt;
const MVT VectorVT = VectorEVT.getSimpleVT();
@@ -214,8 +220,6 @@ getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI,
// The size of the PTX virtual register that holds a packed type.
unsigned PackRegSize;
- bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace);
-
// We only handle "native" vector sizes for now, e.g. <4 x double> is not
// legal. We can (and should) split that into 2 stores of <2 x double> here
// but I'm leaving that as a TODO for now.
@@ -539,6 +543,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
case ISD::FMINNUM_IEEE:
case ISD::FMAXIMUM:
case ISD::FMINIMUM:
+ case ISD::FMAXIMUMNUM:
+ case ISD::FMINIMUMNUM:
IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
break;
case ISD::FEXP2:
@@ -702,57 +708,66 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// intrinsics.
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
- // Turn FP extload into load/fpextend
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
- // Turn FP truncstore into trunc + store.
- // FIXME: vector types should also be expanded
- setTruncStoreAction(MVT::f32, MVT::f16, Expand);
- setTruncStoreAction(MVT::f64, MVT::f16, Expand);
- setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
- setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
- setTruncStoreAction(MVT::f64, MVT::f32, Expand);
- setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
- setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
+ // FP extload/truncstore is not legal in PTX. We need to expand all these.
+ for (auto FloatVTs :
+ {MVT::fp_valuetypes(), MVT::fp_fixedlen_vector_valuetypes()}) {
+ for (MVT ValVT : FloatVTs) {
+ for (MVT MemVT : FloatVTs) {
+ setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand);
+ setTruncStoreAction(ValVT, MemVT, Expand);
+ }
+ }
+ }
- // PTX does not support load / store predicate registers
- setOperationAction(ISD::LOAD, MVT::i1, Custom);
- setOperationAction(ISD::STORE, MVT::i1, Custom);
+ // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
+ // how they'll be lowered in ISel anyway, and by doing this a little earlier
+ // we allow for more DAG combine opportunities.
+ for (auto IntVTs :
+ {MVT::integer_valuetypes(), MVT::integer_fixedlen_vector_valuetypes()})
+ for (MVT ValVT : IntVTs)
+ for (MVT MemVT : IntVTs)
+ if (isTypeLegal(ValVT))
+ setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Custom);
+ // PTX does not support load / store predicate registers
+ setOperationAction({ISD::LOAD, ISD::STORE}, MVT::i1, Custom);
for (MVT VT : MVT::integer_valuetypes()) {
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MVT::i1,
+ Promote);
setTruncStoreAction(VT, MVT::i1, Expand);
}
+ // Disable generation of extload/truncstore for v2i16/v2i8. The generic
+ // expansion for these nodes when they are unaligned is incorrect if the
+ // type is a vector.
+ //
+ // TODO: Fix the generic expansion for these nodes found in
+ // TargetLowering::expandUnalignedLoad/Store.
+ setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
+ MVT::v2i8, Expand);
+ setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
+
+ // Register custom handling for illegal type loads/stores. We'll try to custom
+ // lower almost all illegal types and logic in the lowering will discard cases
+ // we can't handle.
+ setOperationAction({ISD::LOAD, ISD::STORE}, {MVT::i128, MVT::f128}, Custom);
+ for (MVT VT : MVT::fixedlen_vector_valuetypes())
+ if (!isTypeLegal(VT) && VT.getStoreSizeInBits() <= 256)
+ setOperationAction({ISD::STORE, ISD::LOAD}, VT, Custom);
+
+ // Custom legalization for LDU intrinsics.
+ // TODO: The logic to lower these is not very robust and we should rewrite it.
+ // Perhaps LDU should not be represented as an intrinsic at all.
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+ for (MVT VT : MVT::fixedlen_vector_valuetypes())
+ if (IsPTXVectorType(VT))
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
+
setCondCodeAction({ISD::SETNE, ISD::SETEQ, ISD::SETUGE, ISD::SETULE,
ISD::SETUGT, ISD::SETULT, ISD::SETGT, ISD::SETLT,
ISD::SETGE, ISD::SETLE},
MVT::i1, Expand);
- // expand extload of vector of integers.
- setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
- MVT::v2i8, Expand);
- setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
-
// This is legal in NVPTX
setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
@@ -767,24 +782,12 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// DEBUGTRAP can be lowered to PTX brkpt
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
- // Register custom handling for vector loads/stores
- for (MVT VT : MVT::fixedlen_vector_valuetypes())
- if (IsPTXVectorType(VT))
- setOperationAction({ISD::LOAD, ISD::STORE, ISD::INTRINSIC_W_CHAIN}, VT,
- Custom);
-
- setOperationAction({ISD::LOAD, ISD::STORE, ISD::INTRINSIC_W_CHAIN},
- {MVT::i128, MVT::f128}, Custom);
-
// Support varargs.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::Other, Custom);
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
- // Custom handling for i8 intrinsics
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
-
setOperationAction({ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX},
{MVT::i16, MVT::i32, MVT::i64}, Legal);
@@ -988,7 +991,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
if (getOperationAction(ISD::FABS, MVT::bf16) == Promote)
AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32);
- for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
+ for (const auto &Op :
+ {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM}) {
setOperationAction(Op, MVT::f32, Legal);
setOperationAction(Op, MVT::f64, Legal);
setFP16OperationAction(Op, MVT::f16, Legal, Promote);
@@ -1039,7 +1043,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, {MVT::i32, MVT::i64}, Expand);
- // No FPOW or FREM in PTX.
+
+ // atom.b128 is legal in PTX but since we don't represent i128 as a legal
+ // type, we need to custom lower it.
+ setOperationAction({ISD::ATOMIC_CMP_SWAP, ISD::ATOMIC_SWAP}, MVT::i128,
+ Custom);
// Now deduce the information based on the above mentioned
// actions
@@ -1047,7 +1055,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// PTX support for 16-bit CAS is emulated. Only use 32+
setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
- setMaxAtomicSizeInBitsSupported(64);
+ setMaxAtomicSizeInBitsSupported(STI.hasAtomSwap128() ? 128 : 64);
setMaxDivRemBitWidthSupported(64);
// Custom lowering for tcgen05.ld vector operands
@@ -1080,6 +1088,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
case NVPTXISD::FIRST_NUMBER:
break;
+ MAKE_CASE(NVPTXISD::ATOMIC_CMP_SWAP_B128)
+ MAKE_CASE(NVPTXISD::ATOMIC_SWAP_B128)
MAKE_CASE(NVPTXISD::RET_GLUE)
MAKE_CASE(NVPTXISD::DeclareArrayParam)
MAKE_CASE(NVPTXISD::DeclareScalarParam)
@@ -3088,29 +3098,112 @@ SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachinePointerInfo(SV));
}
-static void replaceLoadVector(SDNode *N, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &Results,
- const NVPTXSubtarget &STI);
+/// replaceLoadVector - Convert vector loads into multi-output scalar loads.
+static std::optional<std::pair<SDValue, SDValue>>
+replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI) {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ const EVT ResVT = LD->getValueType(0);
+ const EVT MemVT = LD->getMemoryVT();
-SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
- if (Op.getValueType() == MVT::i1)
- return LowerLOADi1(Op, DAG);
+ // If we're doing sign/zero extension as part of the load, avoid lowering to
+ // a LoadV node. TODO: consider relaxing this restriction.
+ if (ResVT != MemVT)
+ return std::nullopt;
- EVT VT = Op.getValueType();
+ const auto NumEltsAndEltVT =
+ getVectorLoweringShape(ResVT, STI, LD->getAddressSpace());
+ if (!NumEltsAndEltVT)
+ return std::nullopt;
+ const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
+
+ Align Alignment = LD->getAlign();
+ const auto &TD = DAG.getDataLayout();
+ Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext()));
+ if (Alignment < PrefAlign) {
+ // This load is not sufficiently aligned, so bail out and let this vector
+ // load be scalarized. Note that we may still be able to emit smaller
+ // vector loads. For example, if we are loading a <4 x float> with an
+ // alignment of 8, this check will fail but the legalizer will try again
+ // with 2 x <2 x float>, which will succeed with an alignment of 8.
+ return std::nullopt;
+ }
+
+ // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
+ // Therefore, we must ensure the type is legal. For i1 and i8, we set the
+ // loaded type to i16 and propagate the "real" type as the memory type.
+ const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT;
+
+ unsigned Opcode;
+ switch (NumElts) {
+ default:
+ return std::nullopt;
+ case 2:
+ Opcode = NVPTXISD::LoadV2;
+ break;
+ case 4:
+ Opcode = NVPTXISD::LoadV4;
+ break;
+ case 8:
+ Opcode = NVPTXISD::LoadV8;
+ break;
+ }
+ auto ListVTs = SmallVector<EVT, 9>(NumElts, LoadEltVT);
+ ListVTs.push_back(MVT::Other);
+ SDVTList LdResVTs = DAG.getVTList(ListVTs);
- if (NVPTX::isPackedVectorTy(VT)) {
- // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to
- // handle unaligned loads and have to handle it here.
- LoadSDNode *Load = cast<LoadSDNode>(Op);
- EVT MemVT = Load->getMemoryVT();
- if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
- MemVT, *Load->getMemOperand())) {
- SDValue Ops[2];
- std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
- return DAG.getMergeValues(Ops, SDLoc(Op));
+ SDLoc DL(LD);
+
+ // Copy regular operands
+ SmallVector<SDValue, 8> OtherOps(LD->ops());
+
+ // The select routine does not have access to the LoadSDNode instance, so
+ // pass along the extension information
+ OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
+
+ SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, MemVT,
+ LD->getMemOperand());
+
+ SmallVector<SDValue> ScalarRes;
+ if (EltVT.isVector()) {
+ assert(EVT(EltVT.getVectorElementType()) == ResVT.getVectorElementType());
+ assert(NumElts * EltVT.getVectorNumElements() ==
+ ResVT.getVectorNumElements());
+ // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
+ // into individual elements.
+ for (const unsigned I : llvm::seq(NumElts)) {
+ SDValue SubVector = NewLD.getValue(I);
+ DAG.ExtractVectorElements(SubVector, ScalarRes);
+ }
+ } else {
+ for (const unsigned I : llvm::seq(NumElts)) {
+ SDValue Res = NewLD.getValue(I);
+ if (LoadEltVT != EltVT)
+ Res = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Res);
+ ScalarRes.push_back(Res);
}
}
+ SDValue LoadChain = NewLD.getValue(NumElts);
+
+ const MVT BuildVecVT =
+ MVT::getVectorVT(EltVT.getScalarType(), ScalarRes.size());
+ SDValue BuildVec = DAG.getBuildVector(BuildVecVT, DL, ScalarRes);
+ SDValue LoadValue = DAG.getBitcast(ResVT, BuildVec);
+
+ return {{LoadValue, LoadChain}};
+}
+
+static void replaceLoadVector(SDNode *N, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &Results,
+ const NVPTXSubtarget &STI) {
+ if (auto Res = replaceLoadVector(N, DAG, STI))
+ Results.append({Res->first, Res->second});
+}
+
+static SDValue lowerLoadVector(SDNode *N, SelectionDAG &DAG,
+ const NVPTXSubtarget &STI) {
+ if (auto Res = replaceLoadVector(N, DAG, STI))
+ return DAG.getMergeValues({Res->first, Res->second}, SDLoc(N));
return SDValue();
}
@@ -3118,13 +3211,10 @@ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
// =>
// v1 = ld i8* addr (-> i16)
// v = trunc i16 to i1
-SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
- SDNode *Node = Op.getNode();
- LoadSDNode *LD = cast<LoadSDNode>(Node);
- SDLoc dl(Node);
+static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG) {
+ SDLoc dl(LD);
assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
- assert(Node->getValueType(0) == MVT::i1 &&
- "Custom lowering for i1 load only");
+ assert(LD->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only");
SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
LD->getBasePtr(), LD->getPointerInfo(),
MVT::i8, LD->getAlign(),
@@ -3133,35 +3223,31 @@ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
// The legalizer (the caller) is expecting two values from the legalized
// load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
// in LegalizeDAG.cpp which also uses MergeValues.
- SDValue Ops[] = { result, LD->getChain() };
- return DAG.getMergeValues(Ops, dl);
+ return DAG.getMergeValues({result, LD->getChain()}, dl);
}
-SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
- StoreSDNode *Store = cast<StoreSDNode>(Op);
- EVT VT = Store->getMemoryVT();
-
- if (VT == MVT::i1)
- return LowerSTOREi1(Op, DAG);
+SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
- // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to
- // handle unaligned stores and have to handle it here.
- if (NVPTX::isPackedVectorTy(VT) &&
- !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
- VT, *Store->getMemOperand()))
- return expandUnalignedStore(Store, DAG);
+ if (Op.getValueType() == MVT::i1)
+ return lowerLOADi1(LD, DAG);
- // v2f16/v2bf16/v2i16 don't need special handling.
- if (NVPTX::isPackedVectorTy(VT) && VT.is32BitVector())
- return SDValue();
+ // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
+ // how they'll be lowered in ISel anyway, and by doing this a little earlier
+ // we allow for more DAG combine opportunities.
+ if (LD->getExtensionType() == ISD::EXTLOAD) {
+ assert(LD->getValueType(0).isInteger() && LD->getMemoryVT().isInteger() &&
+ "Unexpected fpext-load");
+ return DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Op), Op.getValueType(),
+ LD->getChain(), LD->getBasePtr(), LD->getMemoryVT(),
+ LD->getMemOperand());
+ }
- // Lower store of any other vector type, including v2f32 as we want to break
- // it apart since this is not a widely-supported type.
- return LowerSTOREVector(Op, DAG);
+ llvm_unreachable("Unexpected custom lowering for load");
}
-SDValue
-NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
+static SDValue lowerSTOREVector(SDValue Op, SelectionDAG &DAG,
+ const NVPTXSubtarget &STI) {
MemSDNode *N = cast<MemSDNode>(Op.getNode());
SDValue Val = N->getOperand(1);
SDLoc DL(N);
@@ -3253,6 +3339,18 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
return NewSt;
}
+SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+ StoreSDNode *Store = cast<StoreSDNode>(Op);
+ EVT VT = Store->getMemoryVT();
+
+ if (VT == MVT::i1)
+ return LowerSTOREi1(Op, DAG);
+
+ // Lower store of any other vector type, including v2f32 as we want to break
+ // it apart since this is not a widely-supported type.
+ return lowerSTOREVector(Op, DAG, STI);
+}
+
// st i1 v, addr
// =>
// v1 = zxt v to i16
@@ -4010,14 +4108,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
case Intrinsic::nvvm_ldu_global_i:
case Intrinsic::nvvm_ldu_global_f:
case Intrinsic::nvvm_ldu_global_p: {
- auto &DL = I.getDataLayout();
Info.opc = ISD::INTRINSIC_W_CHAIN;
- if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
- Info.memVT = getValueType(DL, I.getType());
- else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
- Info.memVT = getPointerTy(DL);
- else
- Info.memVT = getValueType(DL, I.getType());
+ Info.memVT = getValueType(I.getDataLayout(), I.getType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
@@ -5152,11 +5244,34 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
ST->getMemoryVT(), ST->getMemOperand());
}
-static SDValue PerformStoreCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const NVPTXSubtarget &STI) {
+
+ if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::STORE) {
+ // Here is our chance to custom lower a store with a non-simple type.
+ // Unfortunately, we can't do this in the legalizer because there is no
+ // way to setOperationAction for an non-simple type.
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ if (!ST->getValue().getValueType().isSimple())
+ return lowerSTOREVector(SDValue(ST, 0), DCI.DAG, STI);
+ }
+
return combinePackingMovIntoStore(N, DCI, 1, 2);
}
+static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const NVPTXSubtarget &STI) {
+ if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::LOAD) {
+ // Here is our chance to custom lower a load with a non-simple type.
+ // Unfortunately, we can't do this in the legalizer because there is no
+ // way to setOperationAction for an non-simple type.
+ if (!N->getValueType(0).isSimple())
+ return lowerLoadVector(N, DCI.DAG, STI);
+ }
+
+ return combineUnpackingMovIntoLoad(N, DCI);
+}
+
/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
@@ -5884,7 +5999,7 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::LOAD:
case NVPTXISD::LoadV2:
case NVPTXISD::LoadV4:
- return combineUnpackingMovIntoLoad(N, DCI);
+ return combineLOAD(N, DCI, STI);
case ISD::MUL:
return PerformMULCombine(N, DCI, OptLevel);
case NVPTXISD::PRMT:
@@ -5901,7 +6016,7 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::STORE:
case NVPTXISD::StoreV2:
case NVPTXISD::StoreV4:
- return PerformStoreCombine(N, DCI);
+ return combineSTORE(N, DCI, STI);
case ISD::VSELECT:
return PerformVSELECTCombine(N, DCI);
}
@@ -5930,103 +6045,6 @@ static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG,
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1}));
}
-/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
-static void replaceLoadVector(SDNode *N, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &Results,
- const NVPTXSubtarget &STI) {
- LoadSDNode *LD = cast<LoadSDNode>(N);
- const EVT ResVT = LD->getValueType(0);
- const EVT MemVT = LD->getMemoryVT();
-
- // If we're doing sign/zero extension as part of the load, avoid lowering to
- // a LoadV node. TODO: consider relaxing this restriction.
- if (ResVT != MemVT)
- return;
-
- const auto NumEltsAndEltVT =
- getVectorLoweringShape(ResVT, STI, LD->getAddressSpace());
- if (!NumEltsAndEltVT)
- return;
- const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
-
- Align Alignment = LD->getAlign();
- const auto &TD = DAG.getDataLayout();
- Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext()));
- if (Alignment < PrefAlign) {
- // This load is not sufficiently aligned, so bail out and let this vector
- // load be scalarized. Note that we may still be able to emit smaller
- // vector loads. For example, if we are loading a <4 x float> with an
- // alignment of 8, this check will fail but the legalizer will try again
- // with 2 x <2 x float>, which will succeed with an alignment of 8.
- return;
- }
-
- // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
- // Therefore, we must ensure the type is legal. For i1 and i8, we set the
- // loaded type to i16 and propagate the "real" type as the memory type.
- const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT;
-
- unsigned Opcode;
- switch (NumElts) {
- default:
- return;
- case 2:
- Opcode = NVPTXISD::LoadV2;
- break;
- case 4:
- Opcode = NVPTXISD::LoadV4;
- break;
- case 8:
- Opcode = NVPTXISD::LoadV8;
- break;
- }
- auto ListVTs = SmallVector<EVT, 9>(NumElts, LoadEltVT);
- ListVTs.push_back(MVT::Other);
- SDVTList LdResVTs = DAG.getVTList(ListVTs);
-
- SDLoc DL(LD);
-
- // Copy regular operands
- SmallVector<SDValue, 8> OtherOps(LD->ops());
-
- // The select routine does not have access to the LoadSDNode instance, so
- // pass along the extension information
- OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
-
- SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
- LD->getMemoryVT(),
- LD->getMemOperand());
-
- SmallVector<SDValue> ScalarRes;
- if (EltVT.isVector()) {
- assert(EVT(EltVT.getVectorElementType()) == ResVT.getVectorElementType());
- assert(NumElts * EltVT.getVectorNumElements() ==
- ResVT.getVectorNumElements());
- // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
- // into individual elements.
- for (const unsigned I : llvm::seq(NumElts)) {
- SDValue SubVector = NewLD.getValue(I);
- DAG.ExtractVectorElements(SubVector, ScalarRes);
- }
- } else {
- for (const unsigned I : llvm::seq(NumElts)) {
- SDValue Res = NewLD.getValue(I);
- if (LoadEltVT != EltVT)
- Res = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Res);
- ScalarRes.push_back(Res);
- }
- }
-
- SDValue LoadChain = NewLD.getValue(NumElts);
-
- const MVT BuildVecVT =
- MVT::getVectorVT(EltVT.getScalarType(), ScalarRes.size());
- SDValue BuildVec = DAG.getBuildVector(BuildVecVT, DL, ScalarRes);
- SDValue LoadValue = DAG.getBitcast(ResVT, BuildVec);
-
- Results.append({LoadValue, LoadChain});
-}
-
// Lower vector return type of tcgen05.ld intrinsics
static void ReplaceTcgen05Ld(SDNode *N, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results,
@@ -6262,6 +6280,49 @@ static void replaceProxyReg(SDNode *N, SelectionDAG &DAG,
Results.push_back(Res);
}
+static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG,
+ const NVPTXSubtarget &STI,
+ SmallVectorImpl<SDValue> &Results) {
+ assert(N->getValueType(0) == MVT::i128 &&
+ "Custom lowering for atomic128 only supports i128");
+
+ AtomicSDNode *AN = cast<AtomicSDNode>(N);
+ SDLoc dl(N);
+
+ if (!STI.hasAtomSwap128()) {
+ DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
+ DAG.getMachineFunction().getFunction(),
+ "Support for b128 atomics introduced in PTX ISA version 8.3 and "
+ "requires target sm_90.",
+ dl.getDebugLoc()));
+
+ Results.push_back(DAG.getUNDEF(MVT::i128));
+ Results.push_back(AN->getOperand(0)); // Chain
+ return;
+ }
+
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(AN->getOperand(0)); // Chain
+ Ops.push_back(AN->getOperand(1)); // Ptr
+ for (const auto &Op : AN->ops().drop_front(2)) {
+ // Low part
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
+ DAG.getIntPtrConstant(0, dl)));
+ // High part
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
+ DAG.getIntPtrConstant(1, dl)));
+ }
+ unsigned Opcode = N->getOpcode() == ISD::ATOMIC_SWAP
+ ? NVPTXISD::ATOMIC_SWAP_B128
+ : NVPTXISD::ATOMIC_CMP_SWAP_B128;
+ SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
+ SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, MVT::i128,
+ AN->getMemOperand());
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i128,
+ {Result.getValue(0), Result.getValue(1)}));
+ Results.push_back(Result.getValue(2));
+}
+
void NVPTXTargetLowering::ReplaceNodeResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
switch (N->getOpcode()) {
@@ -6282,6 +6343,10 @@ void NVPTXTargetLowering::ReplaceNodeResults(
case NVPTXISD::ProxyReg:
replaceProxyReg(N, DAG, *this, Results);
return;
+ case ISD::ATOMIC_CMP_SWAP:
+ case ISD::ATOMIC_SWAP:
+ replaceAtomicSwap128(N, DAG, STI, Results);
+ return;
}
}
@@ -6306,16 +6371,19 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
}
assert(Ty->isIntegerTy() && "Ty should be integer at this point");
- auto ITy = cast<llvm::IntegerType>(Ty);
+ const unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
switch (AI->getOperation()) {
default:
return AtomicExpansionKind::CmpXChg;
+ case AtomicRMWInst::BinOp::Xchg:
+ if (BitWidth == 128)
+ return AtomicExpansionKind::None;
+ LLVM_FALLTHROUGH;
case AtomicRMWInst::BinOp::And:
case AtomicRMWInst::BinOp::Or:
case AtomicRMWInst::BinOp::Xor:
- case AtomicRMWInst::BinOp::Xchg:
- switch (ITy->getBitWidth()) {
+ switch (BitWidth) {
case 8:
case 16:
return AtomicExpansionKind::CmpXChg;
@@ -6325,6 +6393,8 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
if (STI.hasAtomBitwise64())
return AtomicExpansionKind::None;
return AtomicExpansionKind::CmpXChg;
+ case 128:
+ return AtomicExpansionKind::CmpXChg;
default:
llvm_unreachable("unsupported width encountered");
}
@@ -6334,7 +6404,7 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
case AtomicRMWInst::BinOp::Min:
case AtomicRMWInst::BinOp::UMax:
case AtomicRMWInst::BinOp::UMin:
- switch (ITy->getBitWidth()) {
+ switch (BitWidth) {
case 8:
case 16:
return AtomicExpansionKind::CmpXChg;
@@ -6344,17 +6414,20 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
if (STI.hasAtomMinMax64())
return AtomicExpansionKind::None;
return AtomicExpansionKind::CmpXChg;
+ case 128:
+ return AtomicExpansionKind::CmpXChg;
default:
llvm_unreachable("unsupported width encountered");
}
case AtomicRMWInst::BinOp::UIncWrap:
case AtomicRMWInst::BinOp::UDecWrap:
- switch (ITy->getBitWidth()) {
+ switch (BitWidth) {
case 32:
return AtomicExpansionKind::None;
case 8:
case 16:
case 64:
+ case 128:
return AtomicExpansionKind::CmpXChg;
default:
llvm_unreachable("unsupported width encountered");
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 27f099e22097..03b3edc902e5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -81,7 +81,17 @@ enum NodeType : unsigned {
CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z,
FIRST_MEMORY_OPCODE,
- LoadV2 = FIRST_MEMORY_OPCODE,
+
+ /// These nodes are used to lower atomic instructions with i128 type. They are
+ /// similar to the generic nodes, but the input and output values are split
+ /// into two 64-bit values.
+ /// ValLo, ValHi, OUTCHAIN = ATOMIC_CMP_SWAP_B128(INCHAIN, ptr, cmpLo, cmpHi,
+ /// swapLo, swapHi)
+ /// ValLo, ValHi, OUTCHAIN = ATOMIC_SWAP_B128(INCHAIN, ptr, amtLo, amtHi)
+ ATOMIC_CMP_SWAP_B128 = FIRST_MEMORY_OPCODE,
+ ATOMIC_SWAP_B128,
+
+ LoadV2,
LoadV4,
LoadV8,
LDUV2, // LDU.v2
@@ -309,11 +319,8 @@ private:
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const;
-
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 34fe467c9456..6840c7ae8faf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -12,6 +12,7 @@
#include "NVPTXInstrInfo.h"
#include "NVPTX.h"
+#include "NVPTXSubtarget.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -24,7 +25,8 @@ using namespace llvm;
// Pin the vtable to this file.
void NVPTXInstrInfo::anchor() {}
-NVPTXInstrInfo::NVPTXInstrInfo() : RegInfo() {}
+NVPTXInstrInfo::NVPTXInstrInfo(const NVPTXSubtarget &STI)
+ : NVPTXGenInstrInfo(STI), RegInfo() {}
void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
@@ -190,4 +192,4 @@ unsigned NVPTXInstrInfo::insertBranch(MachineBasicBlock &MBB,
BuildMI(&MBB, DL, get(NVPTX::CBranch)).add(Cond[0]).addMBB(TBB);
BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(FBB);
return 2;
-} \ No newline at end of file
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
index 4e9dc9d3b468..23889531431e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -21,12 +21,13 @@
#include "NVPTXGenInstrInfo.inc"
namespace llvm {
+class NVPTXSubtarget;
class NVPTXInstrInfo : public NVPTXGenInstrInfo {
const NVPTXRegisterInfo RegInfo;
virtual void anchor();
public:
- explicit NVPTXInstrInfo();
+ explicit NVPTXInstrInfo(const NVPTXSubtarget &STI);
const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; }
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 7b135098bd4c..4e38e026e6bd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -104,6 +104,7 @@ def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
+def hasAtomSwap128 : Predicate<"Subtarget->hasAtomSwap128()">;
def hasClusters : Predicate<"Subtarget->hasClusters()">;
def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">;
def noPTXASUnreachableBug : Predicate<"!Subtarget->hasPTXASUnreachableBug()">;
@@ -294,7 +295,7 @@ multiclass ADD_SUB_INT_CARRY<string op_str, SDNode op_node, bit commutative> {
//
// Also defines ftz (flush subnormal inputs and results to sign-preserving
// zero) variants for fp32 functions.
-multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> {
+multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDPatternOperator OpNode> {
defvar nan_str = !if(NaN, ".NaN", "");
if !not(NaN) then {
def _f64_rr :
@@ -898,10 +899,8 @@ let Predicates = [hasOptEnabled] in {
defm MAD_LO_S32 : MADInst<"lo.s32", mul, I32RT, I32RT>;
defm MAD_LO_S64 : MADInst<"lo.s64", mul, I64RT, I64RT>;
- defm MAD_WIDE_U16 : MADInst<"wide.u16", umul_wide, I32RT, I16RT>;
- defm MAD_WIDE_S16 : MADInst<"wide.s16", smul_wide, I32RT, I16RT>;
- defm MAD_WIDE_U32 : MADInst<"wide.u32", umul_wide, I64RT, I32RT>;
- defm MAD_WIDE_S32 : MADInst<"wide.s32", smul_wide, I64RT, I32RT>;
+ // Generating mad.wide causes a regression:
+ // https://github.com/llvm/llvm-project/pull/150477#issuecomment-3191367837
}
//-----------------------------------
@@ -912,8 +911,15 @@ defm FADD : F3_fma_component<"add", fadd>;
defm FSUB : F3_fma_component<"sub", fsub>;
defm FMUL : F3_fma_component<"mul", fmul>;
-defm MIN : FMINIMUMMAXIMUM<"min", /* NaN */ false, fminnum>;
-defm MAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>;
+def fminnum_or_fminimumnum : PatFrags<(ops node:$a, node:$b),
+ [(fminnum node:$a, node:$b),
+ (fminimumnum node:$a, node:$b)]>;
+def fmaxnum_or_fmaximumnum : PatFrags<(ops node:$a, node:$b),
+ [(fmaxnum node:$a, node:$b),
+ (fmaximumnum node:$a, node:$b)]>;
+
+defm MIN : FMINIMUMMAXIMUM<"min", /* NaN */ false, fminnum_or_fminimumnum>;
+defm MAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum_or_fmaximumnum>;
defm MIN_NAN : FMINIMUMMAXIMUM<"min", /* NaN */ true, fminimum>;
defm MAX_NAN : FMINIMUMMAXIMUM<"max", /* NaN */ true, fmaximum>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 4ab30a5b5f5e..c544911bdf1e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1990,19 +1990,23 @@ multiclass F_ATOMIC_3<RegTyInfo t, string op_str, SDPatternOperator op, SDNode a
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
def _rr : BasicFlagsNVPTXInst<(outs t.RC:$dst),
- (ins ADDR:$addr, t.RC:$b, t.RC:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+ (ins ADDR:$addr, t.RC:$b, t.RC:$c),
+ (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
asm_str>;
def _ir : BasicFlagsNVPTXInst<(outs t.RC:$dst),
- (ins ADDR:$addr, t.Imm:$b, t.RC:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+ (ins ADDR:$addr, t.Imm:$b, t.RC:$c),
+ (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
asm_str>;
def _ri : BasicFlagsNVPTXInst<(outs t.RC:$dst),
- (ins ADDR:$addr, t.RC:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+ (ins ADDR:$addr, t.RC:$b, t.Imm:$c),
+ (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
asm_str>;
def _ii : BasicFlagsNVPTXInst<(outs t.RC:$dst),
- (ins ADDR:$addr, t.Imm:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+ (ins ADDR:$addr, t.Imm:$b, t.Imm:$c),
+ (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
asm_str>;
}
@@ -2200,6 +2204,37 @@ defm INT_PTX_SATOM_MIN : ATOM2_minmax_impl<"min">;
defm INT_PTX_SATOM_OR : ATOM2_bitwise_impl<"or">;
defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">;
+// atom.*.b128
+
+let mayLoad = true, mayStore = true, hasSideEffects = true,
+ Predicates = [hasAtomSwap128] in {
+ def ATOM_CAS_B128 :
+ NVPTXInst<
+ (outs B64:$dst0, B64:$dst1),
+ (ins ADDR:$addr, B64:$cmp0, B64:$cmp1, B64:$swap0, B64:$swap1,
+ AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+ "{{\n\t"
+ ".reg .b128 cmp, swap, dst;\n\t"
+ "mov.b128 cmp, {$cmp0, $cmp1};\n\t"
+ "mov.b128 swap, {$swap0, $swap1};\n\t"
+ "atom${sem:sem}${scope:scope}${addsp:addsp}.cas.b128 dst, [$addr], cmp, swap;\n\t"
+ "mov.b128 {$dst0, $dst1}, dst;\n\t"
+ "}}">;
+
+ def ATOM_EXCH_B128 :
+ NVPTXInst<
+ (outs B64:$dst0, B64:$dst1),
+ (ins ADDR:$addr, B64:$amt0, B64:$amt1,
+ AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+ "{{\n\t"
+ ".reg .b128 amt, dst;\n\t"
+ "mov.b128 amt, {$amt0, $amt1};\n\t"
+ "atom${sem:sem}${scope:scope}${addsp:addsp}.exch.b128 dst, [$addr], amt;\n\t"
+ "mov.b128 {$dst0, $dst1}, dst;\n\t"
+ "}}">;
+}
+
+
//-----------------------------------
// Support for ldu on sm_20 or later
//-----------------------------------
@@ -4358,10 +4393,12 @@ let hasSideEffects = 1 in {
def SREG_CLOCK : PTX_READ_SREG_R32<"clock", int_nvvm_read_ptx_sreg_clock>;
def SREG_CLOCK64 : PTX_READ_SREG_R64<"clock64", int_nvvm_read_ptx_sreg_clock64>;
def SREG_GLOBALTIMER : PTX_READ_SREG_R64<"globaltimer", int_nvvm_read_ptx_sreg_globaltimer>;
+ def SREG_GLOBALTIMER_LO : PTX_READ_SREG_R32<"globaltimer_lo", int_nvvm_read_ptx_sreg_globaltimer_lo>;
}
def: Pat <(i64 (readcyclecounter)), (SREG_CLOCK64)>;
def: Pat <(i64 (readsteadycounter)), (SREG_GLOBALTIMER)>;
+def: Pat <(i32 (readsteadycounter)), (SREG_GLOBALTIMER_LO)>;
def INT_PTX_SREG_PM0 : PTX_READ_SREG_R32<"pm0", int_nvvm_read_ptx_sreg_pm0>;
def INT_PTX_SREG_PM1 : PTX_READ_SREG_R32<"pm1", int_nvvm_read_ptx_sreg_pm1>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index a84ceaba991c..c5489670bd24 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -62,7 +62,7 @@ NVPTXSubtarget::NVPTXSubtarget(const Triple &TT, const std::string &CPU,
const NVPTXTargetMachine &TM)
: NVPTXGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), PTXVersion(0),
FullSmVersion(200), SmVersion(getSmVersion()),
- TLInfo(TM, initializeSubtargetDependencies(CPU, FS)) {
+ InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) {
TSInfo = std::make_unique<NVPTXSelectionDAGInfo>();
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index acf025b70ce3..0a77a633cb25 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -82,6 +82,7 @@ public:
bool hasAtomBitwise64() const { return SmVersion >= 32; }
bool hasAtomMinMax64() const { return SmVersion >= 32; }
bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; }
+ bool hasAtomSwap128() const { return SmVersion >= 90 && PTXVersion >= 83; }
bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; }
bool hasLDG() const { return SmVersion >= 32; }
bool hasHWROT32() const { return SmVersion >= 32; }
@@ -105,6 +106,7 @@ public:
// Tcgen05 instructions in Blackwell family
bool hasTcgen05Instructions() const {
bool HasTcgen05 = false;
+ unsigned MinPTXVersion = 86;
switch (FullSmVersion) {
default:
break;
@@ -112,9 +114,13 @@ public:
case 1013: // sm_101a
HasTcgen05 = true;
break;
+ case 1033: // sm_103a
+ HasTcgen05 = true;
+ MinPTXVersion = 88;
+ break;
}
- return HasTcgen05 && PTXVersion >= 86;
+ return HasTcgen05 && PTXVersion >= MinPTXVersion;
}
// f32x2 instructions in Blackwell family
bool hasF32x2Instructions() const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 0603994606d7..833f014a4c87 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -126,12 +126,12 @@ static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
// (addrspace:3).
if (!is64Bit)
Ret += "-p:32:32-p6:32:32-p7:32:32";
- else if (UseShortPointers) {
+ else if (UseShortPointers)
Ret += "-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32";
- } else
+ else
Ret += "-p6:32:32";
- Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";
+ Ret += "-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64";
return Ret;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 274b04fdd30b..8e97b422218f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -55,15 +55,6 @@ void clearAnnotationCache(const Module *Mod) {
AC.Cache.erase(Mod);
}
-static void readIntVecFromMDNode(const MDNode *MetadataNode,
- std::vector<unsigned> &Vec) {
- for (unsigned i = 0, e = MetadataNode->getNumOperands(); i != e; ++i) {
- ConstantInt *Val =
- mdconst::extract<ConstantInt>(MetadataNode->getOperand(i));
- Vec.push_back(Val->getZExtValue());
- }
-}
-
static void cacheAnnotationFromMD(const MDNode *MetadataNode,
key_val_pair_t &retval) {
auto &AC = getAnnotationCache();
@@ -83,19 +74,8 @@ static void cacheAnnotationFromMD(const MDNode *MetadataNode,
if (ConstantInt *Val = mdconst::dyn_extract<ConstantInt>(
MetadataNode->getOperand(i + 1))) {
retval[Key].push_back(Val->getZExtValue());
- } else if (MDNode *VecMd =
- dyn_cast<MDNode>(MetadataNode->getOperand(i + 1))) {
- // note: only "grid_constant" annotations support vector MDNodes.
- // assert: there can only exist one unique key value pair of
- // the form (string key, MDNode node). Operands of such a node
- // shall always be unsigned ints.
- auto [It, Inserted] = retval.try_emplace(Key);
- if (Inserted) {
- readIntVecFromMDNode(VecMd, It->second);
- continue;
- }
} else {
- llvm_unreachable("Value operand not a constant int or an mdnode");
+ llvm_unreachable("Value operand not a constant int");
}
}
}
@@ -179,16 +159,13 @@ static bool globalHasNVVMAnnotation(const Value &V, const std::string &Prop) {
}
static bool argHasNVVMAnnotation(const Value &Val,
- const std::string &Annotation,
- const bool StartArgIndexAtOne = false) {
+ const std::string &Annotation) {
if (const Argument *Arg = dyn_cast<Argument>(&Val)) {
const Function *Func = Arg->getParent();
std::vector<unsigned> Annot;
if (findAllNVVMAnnotation(Func, Annotation, Annot)) {
- const unsigned BaseOffset = StartArgIndexAtOne ? 1 : 0;
- if (is_contained(Annot, BaseOffset + Arg->getArgNo())) {
+ if (is_contained(Annot, Arg->getArgNo()))
return true;
- }
}
}
return false;
@@ -250,8 +227,7 @@ bool isParamGridConstant(const Argument &Arg) {
}
// "grid_constant" counts argument indices starting from 1
- if (argHasNVVMAnnotation(Arg, "grid_constant",
- /*StartArgIndexAtOne*/ true))
+ if (Arg.hasAttribute("nvvm.grid_constant"))
return true;
return false;
diff --git a/llvm/lib/Target/PowerPC/CMakeLists.txt b/llvm/lib/Target/PowerPC/CMakeLists.txt
index 1e39f01fd7aa..2182039e0eef 100644
--- a/llvm/lib/Target/PowerPC/CMakeLists.txt
+++ b/llvm/lib/Target/PowerPC/CMakeLists.txt
@@ -49,7 +49,7 @@ add_llvm_target(PowerPCCodeGen
PPCTargetTransformInfo.cpp
PPCTOCRegDeps.cpp
PPCTLSDynamicCall.cpp
- PPCVSXCopy.cpp
+ PPCVSXWACCCopy.cpp
PPCReduceCRLogicals.cpp
PPCVSXFMAMutate.cpp
PPCVSXSwapRemoval.cpp
diff --git a/llvm/lib/Target/PowerPC/PPC.h b/llvm/lib/Target/PowerPC/PPC.h
index 124dac458431..a7cd5cde16b4 100644
--- a/llvm/lib/Target/PowerPC/PPC.h
+++ b/llvm/lib/Target/PowerPC/PPC.h
@@ -39,7 +39,7 @@ class ModulePass;
FunctionPass *createPPCLoopInstrFormPrepPass(PPCTargetMachine &TM);
FunctionPass *createPPCTOCRegDepsPass();
FunctionPass *createPPCEarlyReturnPass();
- FunctionPass *createPPCVSXCopyPass();
+ FunctionPass *createPPCVSXWACCCopyPass();
FunctionPass *createPPCVSXFMAMutatePass();
FunctionPass *createPPCVSXSwapRemovalPass();
FunctionPass *createPPCReduceCRLogicalsPass();
@@ -64,7 +64,7 @@ class ModulePass;
void initializePPCLoopInstrFormPrepPass(PassRegistry&);
void initializePPCTOCRegDepsPass(PassRegistry&);
void initializePPCEarlyReturnPass(PassRegistry&);
- void initializePPCVSXCopyPass(PassRegistry&);
+ void initializePPCVSXWACCCopyPass(PassRegistry &);
void initializePPCVSXFMAMutatePass(PassRegistry&);
void initializePPCVSXSwapRemovalPass(PassRegistry&);
void initializePPCReduceCRLogicalsPass(PassRegistry&);
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 2ab2c147be0e..023fd147535e 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -920,10 +920,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
case TargetOpcode::PATCHABLE_FUNCTION_ENTER: {
assert(!Subtarget->isAIXABI() &&
"AIX does not support patchable function entry!");
- // PATCHABLE_FUNCTION_ENTER on little endian is for XRAY support which is
- // handled in PPCLinuxAsmPrinter.
- if (MAI->isLittleEndian())
- return;
const Function &F = MF->getFunction();
unsigned Num = 0;
(void)F.getFnAttribute("patchable-function-entry")
@@ -1789,7 +1785,13 @@ void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) {
// Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when number
// of instructions change.
// XRAY is only supported on PPC Linux little endian.
- if (!MAI->isLittleEndian())
+ const Function &F = MF->getFunction();
+ unsigned Num = 0;
+ (void)F.getFnAttribute("patchable-function-entry")
+ .getValueAsString()
+ .getAsInteger(10, Num);
+
+ if (!MAI->isLittleEndian() || Num)
break;
MCSymbol *BeginOfSled = OutContext.createTempSymbol();
MCSymbol *EndOfSled = OutContext.createTempSymbol();
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 7022e9e9dae9..fa104e4f69d7 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1693,6 +1693,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::XXPERM:
return "PPCISD::XXPERM";
case PPCISD::VECSHL: return "PPCISD::VECSHL";
+ case PPCISD::VSRQ:
+ return "PPCISD::VSRQ";
case PPCISD::CMPB: return "PPCISD::CMPB";
case PPCISD::Hi: return "PPCISD::Hi";
case PPCISD::Lo: return "PPCISD::Lo";
@@ -2696,7 +2698,7 @@ bool llvm::isIntS34Immediate(SDNode *N, int64_t &Imm) {
if (!isa<ConstantSDNode>(N))
return false;
- Imm = (int64_t)cast<ConstantSDNode>(N)->getSExtValue();
+ Imm = cast<ConstantSDNode>(N)->getSExtValue();
return isInt<34>(Imm);
}
bool llvm::isIntS34Immediate(SDValue Op, int64_t &Imm) {
@@ -11274,6 +11276,24 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getMergeValues(RetOps, dl);
}
+ case Intrinsic::ppc_mma_build_dmr: {
+ SmallVector<SDValue, 8> Pairs;
+ SmallVector<SDValue, 8> Chains;
+ for (int i = 1; i < 9; i += 2) {
+ SDValue Hi = Op.getOperand(i);
+ SDValue Lo = Op.getOperand(i + 1);
+ if (Hi->getOpcode() == ISD::LOAD)
+ Chains.push_back(Hi.getValue(1));
+ if (Lo->getOpcode() == ISD::LOAD)
+ Chains.push_back(Lo.getValue(1));
+ Pairs.push_back(
+ DAG.getNode(PPCISD::PAIR_BUILD, dl, MVT::v256i1, {Hi, Lo}));
+ }
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+ SDValue Value = DMFInsert1024(Pairs, SDLoc(Op), DAG);
+ return DAG.getMergeValues({Value, TF}, dl);
+ }
+
case Intrinsic::ppc_mma_dmxxextfdmr512: {
assert(Subtarget.isISAFuture() && "dmxxextfdmr512 requires ISA Future");
auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
@@ -11610,6 +11630,10 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op.getOperand(0)),
0);
}
+ case Intrinsic::ppc_mma_disassemble_dmr: {
+ return DAG.getStore(DAG.getEntryNode(), DL, Op.getOperand(ArgStart + 2),
+ Op.getOperand(ArgStart + 1), MachinePointerInfo());
+ }
default:
break;
}
@@ -12099,6 +12123,24 @@ SDValue PPCTargetLowering::LowerDMFVectorLoad(SDValue Op,
return DAG.getMergeValues({DmrPValue, TF}, dl);
}
+SDValue PPCTargetLowering::DMFInsert1024(const SmallVectorImpl<SDValue> &Pairs,
+ const SDLoc &dl,
+ SelectionDAG &DAG) const {
+ SDValue Lo(DAG.getMachineNode(PPC::DMXXINSTDMR512, dl, MVT::v512i1, Pairs[0],
+ Pairs[1]),
+ 0);
+ SDValue LoSub = DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32);
+ SDValue Hi(DAG.getMachineNode(PPC::DMXXINSTDMR512_HI, dl, MVT::v512i1,
+ Pairs[2], Pairs[3]),
+ 0);
+ SDValue HiSub = DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32);
+ SDValue RC = DAG.getTargetConstant(PPC::DMRRCRegClassID, dl, MVT::i32);
+
+ return SDValue(DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v1024i1,
+ {RC, Lo, LoSub, Hi, HiSub}),
+ 0);
+}
+
SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 559d58309692..669430550f4e 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -498,6 +498,9 @@ namespace llvm {
/// SETBCR - The ISA 3.1 (P10) SETBCR instruction.
SETBCR,
+ /// VSRQ - The ISA 3.1 (P10) Vector Shift right quadword instruction
+ VSRQ,
+
// NOTE: The nodes below may require PC-Rel specific patterns if the
// address could be PC-Relative. When adding new nodes below, consider
// whether or not the address can be PC-Relative and add the corresponding
@@ -1345,6 +1348,8 @@ namespace llvm {
SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDMFVectorLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDMFVectorStore(SDValue Op, SelectionDAG &DAG) const;
+ SDValue DMFInsert1024(const SmallVectorImpl<SDValue> &Pairs,
+ const SDLoc &dl, SelectionDAG &DAG) const;
SDValue LowerCallResult(SDValue Chain, SDValue InGlue,
CallingConv::ID CallConv, bool isVarArg,
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index fd2084398c85..269d30318bca 100644
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -1095,8 +1095,7 @@ let hasSideEffects = 0 in {
defm RLDIMI : MDForm_1r<30, 3, (outs g8rc:$RA),
(ins g8rc:$RAi, g8rc:$RS, u6imm:$SH, u6imm:$MBE),
"rldimi", "$RA, $RS, $SH, $MBE", IIC_IntRotateDI,
- []>, isPPC64, RegConstraint<"$RAi = $RA">,
- NoEncode<"$RAi">;
+ []>, isPPC64, RegConstraint<"$RAi = $RA">;
// Rotate instructions.
defm RLDCL : MDSForm_1r<30, 8,
@@ -1156,7 +1155,7 @@ defm RLWIMI8 : MForm_2r<20, (outs g8rc:$RA),
(ins g8rc:$RAi, g8rc:$RS, u5imm:$SH, u5imm:$MB,
u5imm:$ME), "rlwimi", "$RA, $RS, $SH, $MB, $ME",
IIC_IntRotate, []>, PPC970_DGroup_Cracked,
- RegConstraint<"$RAi = $RA">, NoEncode<"$RAi">;
+ RegConstraint<"$RAi = $RA">;
let isSelect = 1 in
def ISEL8 : AForm_4<31, 15,
@@ -1313,21 +1312,18 @@ let Interpretation64Bit = 1, isCodeGenOnly = 1 in
def LHAU8 : DForm_1<43, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
(ins (memri $D, $RA):$addr),
"lhau $RST, $addr", IIC_LdStLHAU,
- []>, RegConstraint<"$addr.reg = $ea_result">,
- NoEncode<"$ea_result">;
+ []>, RegConstraint<"$addr.reg = $ea_result">;
// NO LWAU!
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
def LHAUX8 : XForm_1_memOp<31, 375, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
(ins (memrr $RA, $RB):$addr),
"lhaux $RST, $addr", IIC_LdStLHAUX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">;
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">;
def LWAUX : XForm_1_memOp<31, 373, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
(ins (memrr $RA, $RB):$addr),
"lwaux $RST, $addr", IIC_LdStLHAUX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">, isPPC64;
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">, isPPC64;
}
}
@@ -1366,34 +1362,28 @@ let mayLoad = 1, hasSideEffects = 0 in {
def LBZU8 : DForm_1<35, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
(ins (memri $D, $RA):$addr),
"lbzu $RST, $addr", IIC_LdStLoadUpd,
- []>, RegConstraint<"$addr.reg = $ea_result">,
- NoEncode<"$ea_result">;
+ []>, RegConstraint<"$addr.reg = $ea_result">;
def LHZU8 : DForm_1<41, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
(ins (memri $D, $RA):$addr),
"lhzu $RST, $addr", IIC_LdStLoadUpd,
- []>, RegConstraint<"$addr.reg = $ea_result">,
- NoEncode<"$ea_result">;
+ []>, RegConstraint<"$addr.reg = $ea_result">;
def LWZU8 : DForm_1<33, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
(ins (memri $D, $RA):$addr),
"lwzu $RST, $addr", IIC_LdStLoadUpd,
- []>, RegConstraint<"$addr.reg = $ea_result">,
- NoEncode<"$ea_result">;
+ []>, RegConstraint<"$addr.reg = $ea_result">;
def LBZUX8 : XForm_1_memOp<31, 119, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
(ins (memrr $RA, $RB):$addr),
"lbzux $RST, $addr", IIC_LdStLoadUpdX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">;
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">;
def LHZUX8 : XForm_1_memOp<31, 311, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
(ins (memrr $RA, $RB):$addr),
"lhzux $RST, $addr", IIC_LdStLoadUpdX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">;
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">;
def LWZUX8 : XForm_1_memOp<31, 55, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
(ins (memrr $RA, $RB):$addr),
"lwzux $RST, $addr", IIC_LdStLoadUpdX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">;
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">;
}
}
} // Interpretation64Bit
@@ -1445,14 +1435,12 @@ let mayLoad = 1, hasSideEffects = 0 in {
def LDU : DSForm_1<58, 1, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
(ins (memrix $D, $RA):$addr),
"ldu $RST, $addr", IIC_LdStLDU,
- []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64,
- NoEncode<"$ea_result">;
+ []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64;
def LDUX : XForm_1_memOp<31, 53, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
(ins (memrr $RA, $RB):$addr),
"ldux $RST, $addr", IIC_LdStLDUX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">, isPPC64;
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">, isPPC64;
}
let mayLoad = 1, hasNoSchedulingInfo = 1 in {
@@ -1718,45 +1706,41 @@ let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
def STBU8 : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$RST, (memri $D, $RA):$addr),
"stbu $RST, $addr", IIC_LdStSTU, []>,
- RegConstraint<"$addr.reg = $ea_res">, NoEncode<"$ea_res">;
+ RegConstraint<"$addr.reg = $ea_res">;
def STHU8 : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$RST, (memri $D, $RA):$addr),
"sthu $RST, $addr", IIC_LdStSTU, []>,
- RegConstraint<"$addr.reg = $ea_res">, NoEncode<"$ea_res">;
+ RegConstraint<"$addr.reg = $ea_res">;
def STWU8 : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$RST, (memri $D, $RA):$addr),
"stwu $RST, $addr", IIC_LdStSTU, []>,
- RegConstraint<"$addr.reg = $ea_res">, NoEncode<"$ea_res">;
+ RegConstraint<"$addr.reg = $ea_res">;
def STBUX8: XForm_8_memOp<31, 247, (outs ptr_rc_nor0:$ea_res),
(ins g8rc:$RST, (memrr $RA, $RB):$addr),
"stbux $RST, $addr", IIC_LdStSTUX, []>,
RegConstraint<"$addr.ptrreg = $ea_res">,
- NoEncode<"$ea_res">,
PPC970_DGroup_Cracked;
def STHUX8: XForm_8_memOp<31, 439, (outs ptr_rc_nor0:$ea_res),
(ins g8rc:$RST, (memrr $RA, $RB):$addr),
"sthux $RST, $addr", IIC_LdStSTUX, []>,
RegConstraint<"$addr.ptrreg = $ea_res">,
- NoEncode<"$ea_res">,
PPC970_DGroup_Cracked;
def STWUX8: XForm_8_memOp<31, 183, (outs ptr_rc_nor0:$ea_res),
(ins g8rc:$RST, (memrr $RA, $RB):$addr),
"stwux $RST, $addr", IIC_LdStSTUX, []>,
RegConstraint<"$addr.ptrreg = $ea_res">,
- NoEncode<"$ea_res">,
PPC970_DGroup_Cracked;
} // Interpretation64Bit
def STDU : DSForm_1<62, 1, (outs ptr_rc_nor0:$ea_res),
(ins g8rc:$RST, (memrix $D, $RA):$addr),
"stdu $RST, $addr", IIC_LdStSTU, []>,
- RegConstraint<"$addr.reg = $ea_res">, NoEncode<"$ea_res">,
+ RegConstraint<"$addr.reg = $ea_res">,
isPPC64;
def STDUX : XForm_8_memOp<31, 181, (outs ptr_rc_nor0:$ea_res),
(ins g8rc:$RST, (memrr $RA, $RB):$addr),
"stdux $RST, $addr", IIC_LdStSTUX, []>,
RegConstraint<"$addr.ptrreg = $ea_res">,
- NoEncode<"$ea_res">,
PPC970_DGroup_Cracked, isPPC64;
}
@@ -2000,7 +1984,7 @@ def : Pat<(int_ppc_darnraw), (DARN 2)>;
class X_RA5_RB5<bits<6> opcode, bits<10> xo, string opc, RegisterOperand ty,
InstrItinClass itin, list<dag> pattern>
- : X_L1_RS5_RS5<opcode, xo, (outs), (ins ty:$RA, ty:$RB, u1imm:$L),
+ : X_L1_RS5_RS5<opcode, xo, (outs), (ins ty:$RA, ty:$RB),
!strconcat(opc, " $RA, $RB"), itin, pattern>{
let L = 1;
}
diff --git a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
index 79fe12e8e4b4..97d5e2896323 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -261,6 +261,13 @@ def immEQOneV : PatLeaf<(build_vector), [{
return C->isOne();
return false;
}]>;
+
+def VSRVSRO : PatFrag<(ops node:$input, node:$shift),
+ (int_ppc_altivec_vsr
+ (int_ppc_altivec_vsro node:$input, node:$shift),
+ node:$shift),
+ [{ return N->getOperand(1).hasOneUse(); }]>;
+
//===----------------------------------------------------------------------===//
// Helpers for defining instructions that directly correspond to intrinsics.
@@ -1471,13 +1478,13 @@ def VINSERTB : VXForm_1<781, (outs vrrc:$VD),
"vinsertb $VD, $VB, $VA", IIC_VecGeneral,
[(set v16i8:$VD, (PPCvecinsert v16i8:$VDi, v16i8:$VB,
imm32SExt16:$VA))]>,
- RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+ RegConstraint<"$VDi = $VD">;
def VINSERTH : VXForm_1<845, (outs vrrc:$VD),
(ins vrrc:$VDi, u4imm:$VA, vrrc:$VB),
"vinserth $VD, $VB, $VA", IIC_VecGeneral,
[(set v8i16:$VD, (PPCvecinsert v8i16:$VDi, v8i16:$VB,
imm32SExt16:$VA))]>,
- RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+ RegConstraint<"$VDi = $VD">;
def VINSERTW : VX1_VT5_UIM5_VB5<909, "vinsertw", []>;
def VINSERTD : VX1_VT5_UIM5_VB5<973, "vinsertd", []>;
@@ -1569,7 +1576,7 @@ def VRLWMI : VXForm_1<133, (outs vrrc:$VD), (ins vrrc:$VA, vrrc:$VB, vrrc:$VDi),
[(set v4i32:$VD,
(int_ppc_altivec_vrlwmi v4i32:$VA, v4i32:$VB,
v4i32:$VDi))]>,
- RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+ RegConstraint<"$VDi = $VD">;
def VRLDNM : VX1_VT5_VA5_VB5<453, "vrldnm",
[(set v2i64:$VD,
(int_ppc_altivec_vrldnm v2i64:$VA,
@@ -1579,7 +1586,7 @@ def VRLDMI : VXForm_1<197, (outs vrrc:$VD), (ins vrrc:$VA, vrrc:$VB, vrrc:$VDi),
[(set v2i64:$VD,
(int_ppc_altivec_vrldmi v2i64:$VA, v2i64:$VB,
v2i64:$VDi))]>,
- RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+ RegConstraint<"$VDi = $VD">;
// Vector Shift Left/Right
def VSLV : VX1_VT5_VA5_VB5<1860, "vslv",
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
index b4b475b470a5..fba1c6609dba 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
@@ -18,7 +18,7 @@ class I<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin>
bit PPC64 = 0; // Default value, override with isPPC64
let Namespace = "PPC";
- let Inst{0-5} = opcode;
+ let Inst{0...5} = opcode;
let OutOperandList = OOL;
let InOperandList = IOL;
let AsmString = asmstr;
@@ -34,7 +34,7 @@ class I<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin>
let TSFlags{0} = PPC970_First;
let TSFlags{1} = PPC970_Single;
let TSFlags{2} = PPC970_Cracked;
- let TSFlags{5-3} = PPC970_Unit;
+ let TSFlags{5...3} = PPC970_Unit;
// Indicate that this instruction is of type X-Form Load or Store
bits<1> XFormMemOp = 0;
@@ -99,8 +99,8 @@ class I2<bits<6> opcode1, bits<6> opcode2, dag OOL, dag IOL, string asmstr,
bit PPC64 = 0; // Default value, override with isPPC64
let Namespace = "PPC";
- let Inst{0-5} = opcode1;
- let Inst{32-37} = opcode2;
+ let Inst{0...5} = opcode1;
+ let Inst{32...37} = opcode2;
let OutOperandList = OOL;
let InOperandList = IOL;
let AsmString = asmstr;
@@ -116,7 +116,7 @@ class I2<bits<6> opcode1, bits<6> opcode2, dag OOL, dag IOL, string asmstr,
let TSFlags{0} = PPC970_First;
let TSFlags{1} = PPC970_Single;
let TSFlags{2} = PPC970_Cracked;
- let TSFlags{5-3} = PPC970_Unit;
+ let TSFlags{5...3} = PPC970_Unit;
// Fields used for relation models.
string BaseName = "";
@@ -135,7 +135,7 @@ class IForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
bits<24> LI;
- let Inst{6-29} = LI;
+ let Inst{6...29} = LI;
let Inst{30} = aa;
let Inst{31} = lk;
}
@@ -148,12 +148,12 @@ class BForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr>
bits<14> BD;
bits<5> BI;
- let BI{0-1} = BIBO{5-6};
- let BI{2-4} = CR{0-2};
+ let BI{0...1} = BIBO{5...6};
+ let BI{2...4} = CR{0...2};
- let Inst{6-10} = BIBO{4-0};
- let Inst{11-15} = BI;
- let Inst{16-29} = BD;
+ let Inst{6...10} = BIBO{4...0};
+ let Inst{11...15} = BI;
+ let Inst{16...29} = BD;
let Inst{30} = aa;
let Inst{31} = lk;
}
@@ -161,8 +161,8 @@ class BForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr>
class BForm_1<bits<6> opcode, bits<5> bo, bit aa, bit lk, dag OOL, dag IOL,
string asmstr>
: BForm<opcode, aa, lk, OOL, IOL, asmstr> {
- let BIBO{4-0} = bo;
- let BIBO{6-5} = 0;
+ let BIBO{4...0} = bo;
+ let BIBO{6...5} = 0;
let CR = 0;
}
@@ -171,9 +171,9 @@ class BForm_2<bits<6> opcode, bits<5> bo, bits<5> bi, bit aa, bit lk,
: I<opcode, OOL, IOL, asmstr, IIC_BrB> {
bits<14> BD;
- let Inst{6-10} = bo;
- let Inst{11-15} = bi;
- let Inst{16-29} = BD;
+ let Inst{6...10} = bo;
+ let Inst{11...15} = bi;
+ let Inst{16...29} = BD;
let Inst{30} = aa;
let Inst{31} = lk;
}
@@ -185,9 +185,9 @@ class BForm_3<bits<6> opcode, bit aa, bit lk,
bits<5> BI;
bits<14> BD;
- let Inst{6-10} = BO;
- let Inst{11-15} = BI;
- let Inst{16-29} = BD;
+ let Inst{6...10} = BO;
+ let Inst{11...15} = BI;
+ let Inst{16...29} = BD;
let Inst{30} = aa;
let Inst{31} = lk;
}
@@ -200,10 +200,10 @@ class BForm_3_at<bits<6> opcode, bit aa, bit lk,
bits<5> BI;
bits<14> BD;
- let Inst{6-8} = BO{4-2};
- let Inst{9-10} = at;
- let Inst{11-15} = BI;
- let Inst{16-29} = BD;
+ let Inst{6...8} = BO{4...2};
+ let Inst{9...10} = at;
+ let Inst{11...15} = BI;
+ let Inst{16...29} = BD;
let Inst{30} = aa;
let Inst{31} = lk;
}
@@ -215,9 +215,9 @@ BForm_4<bits<6> opcode, bits<5> bo, bit aa, bit lk,
bits<5> BI;
bits<14> BD;
- let Inst{6-10} = bo;
- let Inst{11-15} = BI;
- let Inst{16-29} = BD;
+ let Inst{6...10} = bo;
+ let Inst{11...15} = BI;
+ let Inst{16...29} = BD;
let Inst{30} = aa;
let Inst{31} = lk;
}
@@ -231,7 +231,7 @@ class SCForm<bits<6> opcode, bits<1> xo1, bits<1> xo2,
let Pattern = pattern;
- let Inst{20-26} = LEV;
+ let Inst{20...26} = LEV;
let Inst{30} = xo1;
let Inst{31} = xo2;
}
@@ -246,9 +246,9 @@ class DForm_base<bits<6> opcode, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = RST;
- let Inst{11-15} = RA;
- let Inst{16-31} = D;
+ let Inst{6...10} = RST;
+ let Inst{11...15} = RA;
+ let Inst{16...31} = D;
}
class DForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
@@ -273,9 +273,9 @@ class DForm_2_r0<bits<6> opcode, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = RST;
- let Inst{11-15} = 0;
- let Inst{16-31} = D;
+ let Inst{6...10} = RST;
+ let Inst{11...15} = 0;
+ let Inst{16...31} = D;
}
class DForm_4<bits<6> opcode, dag OOL, dag IOL, string asmstr,
@@ -287,9 +287,9 @@ class DForm_4<bits<6> opcode, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = RST;
- let Inst{11-15} = RA;
- let Inst{16-31} = D;
+ let Inst{6...10} = RST;
+ let Inst{11...15} = RA;
+ let Inst{16...31} = D;
}
class DForm_4_zero<bits<6> opcode, dag OOL, dag IOL, string asmstr,
@@ -321,13 +321,13 @@ class IForm_and_DForm_1<bits<6> opcode1, bit aa, bit lk, bits<6> opcode2,
let Pattern = pattern;
bits<24> LI;
- let Inst{6-29} = LI;
+ let Inst{6...29} = LI;
let Inst{30} = aa;
let Inst{31} = lk;
- let Inst{38-42} = RST;
- let Inst{43-47} = RA;
- let Inst{48-63} = D;
+ let Inst{38...42} = RST;
+ let Inst{43...47} = RA;
+ let Inst{48...63} = D;
}
// This is used to emit BL8+NOP.
@@ -349,11 +349,11 @@ class DForm_5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
bits<5> RA;
bits<16> D;
- let Inst{6-8} = BF;
+ let Inst{6...8} = BF;
let Inst{9} = 0;
let Inst{10} = L;
- let Inst{11-15} = RA;
- let Inst{16-31} = D;
+ let Inst{11...15} = RA;
+ let Inst{16...31} = D;
}
class DForm_5_ext<bits<6> opcode, dag OOL, dag IOL, string asmstr,
@@ -383,10 +383,10 @@ class DSForm_1<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = RST;
- let Inst{11-15} = RA;
- let Inst{16-29} = D;
- let Inst{30-31} = xo;
+ let Inst{6...10} = RST;
+ let Inst{11...15} = RA;
+ let Inst{16...29} = D;
+ let Inst{30...31} = xo;
}
// ISA V3.0B 1.6.6 DX-Form
@@ -398,10 +398,10 @@ class DXForm<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = RT;
- let Inst{11-15} = D{5-1}; // d1
- let Inst{16-25} = D{15-6}; // d0
- let Inst{26-30} = xo;
+ let Inst{6...10} = RT;
+ let Inst{11...15} = D{5...1}; // d1
+ let Inst{16...25} = D{15...6}; // d0
+ let Inst{26...30} = xo;
let Inst{31} = D{0}; // d2
}
@@ -415,11 +415,11 @@ class DQ_RD6_RS5_DQ12<bits<6> opcode, bits<3> xo, dag OOL, dag IOL,
let Pattern = pattern;
- let Inst{6-10} = XT{4-0};
- let Inst{11-15} = RA;
- let Inst{16-27} = DQ;
+ let Inst{6...10} = XT{4...0};
+ let Inst{11...15} = RA;
+ let Inst{16...27} = DQ;
let Inst{28} = XT{5};
- let Inst{29-31} = xo;
+ let Inst{29...31} = xo;
}
class DQForm_RTp5_RA17_MEM<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
@@ -431,10 +431,10 @@ class DQForm_RTp5_RA17_MEM<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
bits<12> DQ;
let Pattern = pattern;
- let Inst{6-10} = RTp{4-0};
- let Inst{11-15} = RA;
- let Inst{16-27} = DQ;
- let Inst{28-31} = xo;
+ let Inst{6...10} = RTp{4...0};
+ let Inst{11...15} = RA;
+ let Inst{16...27} = DQ;
+ let Inst{28...31} = xo;
}
// 1.7.6 X-Form
@@ -449,10 +449,10 @@ class XForm_base_r3xo<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asms
bit RC = 0; // set by isRecordForm
- let Inst{6-10} = RST;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-30} = xo;
+ let Inst{6...10} = RST;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...30} = xo;
let Inst{31} = RC;
}
@@ -475,7 +475,7 @@ class XForm_tlbilx<bits<10> xo, dag OOL, dag IOL, string asmstr,
class XForm_attn<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin>
: I<opcode, OOL, IOL, asmstr, itin> {
- let Inst{21-30} = xo;
+ let Inst{21...30} = xo;
}
// This is the same as XForm_base_r3xo, but the first two operands are swapped
@@ -490,10 +490,10 @@ class XForm_base_r3xo_swapped
bit RC = 0; // set by isRecordForm
- let Inst{6-10} = RST;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-30} = xo;
+ let Inst{6...10} = RST;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...30} = xo;
let Inst{31} = RC;
}
@@ -528,10 +528,10 @@ class XForm_tlbws<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = RST;
- let Inst{11-15} = RA;
+ let Inst{6...10} = RST;
+ let Inst{11...15} = RA;
let Inst{20} = WS;
- let Inst{21-30} = xo;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -570,12 +570,12 @@ class XForm_16<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
bits<5> RA;
bits<5> RB;
- let Inst{6-8} = BF;
+ let Inst{6...8} = BF;
let Inst{9} = 0;
let Inst{10} = L;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-30} = xo;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -587,10 +587,10 @@ class XForm_icbt<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
bits<5> RB;
let Inst{6} = 0;
- let Inst{7-10} = CT;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-30} = xo;
+ let Inst{7...10} = CT;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -600,9 +600,9 @@ class XForm_sr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
bits<5> RS;
bits<4> SR;
- let Inst{6-10} = RS;
- let Inst{12-15} = SR;
- let Inst{21-30} = xo;
+ let Inst{6...10} = RS;
+ let Inst{12...15} = SR;
+ let Inst{21...30} = xo;
}
class XForm_mbar<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
@@ -610,8 +610,8 @@ class XForm_mbar<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
: I<opcode, OOL, IOL, asmstr, itin> {
bits<5> MO;
- let Inst{6-10} = MO;
- let Inst{21-30} = xo;
+ let Inst{6...10} = MO;
+ let Inst{21...30} = xo;
}
class XForm_srin<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
@@ -620,9 +620,9 @@ class XForm_srin<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
bits<5> RS;
bits<5> RB;
- let Inst{6-10} = RS;
- let Inst{16-20} = RB;
- let Inst{21-30} = xo;
+ let Inst{6...10} = RS;
+ let Inst{16...20} = RB;
+ let Inst{21...30} = xo;
}
class XForm_mtmsr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
@@ -631,9 +631,9 @@ class XForm_mtmsr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
bits<5> RS;
bits<1> L;
- let Inst{6-10} = RS;
+ let Inst{6...10} = RS;
let Inst{15} = L;
- let Inst{21-30} = xo;
+ let Inst{21...30} = xo;
}
class XForm_16_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
@@ -649,11 +649,11 @@ class XForm_17<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
bits<5> RA;
bits<5> RB;
- let Inst{6-8} = BF;
- let Inst{9-10} = 0;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-30} = xo;
+ let Inst{6...8} = BF;
+ let Inst{9...10} = 0;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -673,10 +673,10 @@ class XForm_18<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = FRT;
- let Inst{11-15} = FRA;
- let Inst{16-20} = FRB;
- let Inst{21-30} = xo;
+ let Inst{6...10} = FRT;
+ let Inst{11...15} = FRA;
+ let Inst{16...20} = FRB;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -696,11 +696,11 @@ class XForm_20<bits<6> opcode, bits<6> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = FRT;
- let Inst{11-15} = FRA;
- let Inst{16-20} = FRB;
- let Inst{21-24} = tttt;
- let Inst{25-30} = xo;
+ let Inst{6...10} = FRT;
+ let Inst{11...15} = FRA;
+ let Inst{16...20} = FRB;
+ let Inst{21...24} = tttt;
+ let Inst{25...30} = xo;
let Inst{31} = 0;
}
@@ -708,10 +708,10 @@ class XForm_24<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: I<opcode, OOL, IOL, asmstr, itin> {
let Pattern = pattern;
- let Inst{6-10} = 31;
- let Inst{11-15} = 0;
- let Inst{16-20} = 0;
- let Inst{21-30} = xo;
+ let Inst{6...10} = 31;
+ let Inst{11...15} = 0;
+ let Inst{16...20} = 0;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -721,11 +721,11 @@ class XForm_24_sync<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
bits<2> L;
let Pattern = pattern;
- let Inst{6-8} = 0;
- let Inst{9-10} = L;
- let Inst{11-15} = 0;
- let Inst{16-20} = 0;
- let Inst{21-30} = xo;
+ let Inst{6...8} = 0;
+ let Inst{9...10} = L;
+ let Inst{11...15} = 0;
+ let Inst{16...20} = 0;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -736,12 +736,12 @@ class XForm_IMM2_IMM2<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
bits<2> PL;
let Pattern = pattern;
- let Inst{6-8} = 0;
- let Inst{9-10} = L;
- let Inst{11-13} = 0;
- let Inst{14-15} = PL;
- let Inst{16-20} = 0;
- let Inst{21-30} = xo;
+ let Inst{6...8} = 0;
+ let Inst{9...10} = L;
+ let Inst{11...13} = 0;
+ let Inst{14...15} = PL;
+ let Inst{16...20} = 0;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -752,12 +752,12 @@ class XForm_IMM3_IMM2<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
bits<2> SC;
let Pattern = pattern;
- let Inst{6-7} = 0;
- let Inst{8-10} = L;
- let Inst{11-13} = 0;
- let Inst{14-15} = SC;
- let Inst{16-20} = 0;
- let Inst{21-30} = xo;
+ let Inst{6...7} = 0;
+ let Inst{8...10} = L;
+ let Inst{11...13} = 0;
+ let Inst{14...15} = SC;
+ let Inst{16...20} = 0;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -803,9 +803,9 @@ class XForm_42<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
bit RC = 0; // set by isRecordForm
- let Inst{6-10} = RST;
- let Inst{11-20} = 0;
- let Inst{21-30} = xo;
+ let Inst{6...10} = RST;
+ let Inst{11...20} = 0;
+ let Inst{21...30} = xo;
let Inst{31} = RC;
}
class XForm_43<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
@@ -816,9 +816,9 @@ class XForm_43<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
bit RC = 0; // set by isRecordForm
- let Inst{6-10} = FM;
- let Inst{11-20} = 0;
- let Inst{21-30} = xo;
+ let Inst{6...10} = FM;
+ let Inst{11...20} = 0;
+ let Inst{21...30} = xo;
let Inst{31} = RC;
}
@@ -828,11 +828,11 @@ class XForm_44<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
bits<5> RT;
bits<3> BFA;
- let Inst{6-10} = RT;
- let Inst{11-13} = BFA;
- let Inst{14-15} = 0;
- let Inst{16-20} = 0;
- let Inst{21-30} = xo;
+ let Inst{6...10} = RT;
+ let Inst{11...13} = BFA;
+ let Inst{14...15} = 0;
+ let Inst{16...20} = 0;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -842,11 +842,11 @@ class XForm_45<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
bits<5> RT;
bits<2> L;
- let Inst{6-10} = RT;
- let Inst{11-13} = 0;
- let Inst{14-15} = L;
- let Inst{16-20} = 0;
- let Inst{21-30} = xo;
+ let Inst{6...10} = RT;
+ let Inst{11...13} = 0;
+ let Inst{14...15} = L;
+ let Inst{16...20} = 0;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -856,11 +856,11 @@ class X_FRT5_XO2_XO3_XO10<bits<6> opcode, bits<2> xo1, bits<3> xo2, bits<10> xo,
: XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
let Pattern = pattern;
- let Inst{6-10} = RST;
- let Inst{11-12} = xo1;
- let Inst{13-15} = xo2;
- let Inst{16-20} = 0;
- let Inst{21-30} = xo;
+ let Inst{6...10} = RST;
+ let Inst{11...12} = xo1;
+ let Inst{13...15} = xo2;
+ let Inst{16...20} = 0;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -871,11 +871,11 @@ class X_FRT5_XO2_XO3_FRB5_XO10<bits<6> opcode, bits<2> xo1, bits<3> xo2,
let Pattern = pattern;
bits<5> FRB;
- let Inst{6-10} = RST;
- let Inst{11-12} = xo1;
- let Inst{13-15} = xo2;
- let Inst{16-20} = FRB;
- let Inst{21-30} = xo;
+ let Inst{6...10} = RST;
+ let Inst{11...12} = xo1;
+ let Inst{13...15} = xo2;
+ let Inst{16...20} = FRB;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -886,12 +886,12 @@ class X_FRT5_XO2_XO3_DRM3_XO10<bits<6> opcode, bits<2> xo1, bits<3> xo2,
let Pattern = pattern;
bits<3> DRM;
- let Inst{6-10} = RST;
- let Inst{11-12} = xo1;
- let Inst{13-15} = xo2;
- let Inst{16-17} = 0;
- let Inst{18-20} = DRM;
- let Inst{21-30} = xo;
+ let Inst{6...10} = RST;
+ let Inst{11...12} = xo1;
+ let Inst{13...15} = xo2;
+ let Inst{16...17} = 0;
+ let Inst{18...20} = DRM;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -902,12 +902,12 @@ class X_FRT5_XO2_XO3_RM2_X10<bits<6> opcode, bits<2> xo1, bits<3> xo2,
let Pattern = pattern;
bits<2> RM;
- let Inst{6-10} = RST;
- let Inst{11-12} = xo1;
- let Inst{13-15} = xo2;
- let Inst{16-18} = 0;
- let Inst{19-20} = RM;
- let Inst{21-30} = xo;
+ let Inst{6...10} = RST;
+ let Inst{11...12} = xo1;
+ let Inst{13...15} = xo2;
+ let Inst{16...18} = 0;
+ let Inst{19...20} = RM;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -934,10 +934,10 @@ class XForm_htm0<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
bit RC = 1;
- let Inst{6-9} = 0;
+ let Inst{6...9} = 0;
let Inst{10} = R;
- let Inst{11-20} = 0;
- let Inst{21-30} = xo;
+ let Inst{11...20} = 0;
+ let Inst{21...30} = xo;
let Inst{31} = RC;
}
@@ -949,8 +949,8 @@ class XForm_htm1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
bit RC = 1;
let Inst{6} = A;
- let Inst{7-20} = 0;
- let Inst{21-30} = xo;
+ let Inst{7...20} = 0;
+ let Inst{21...30} = xo;
let Inst{31} = RC;
}
@@ -961,10 +961,10 @@ class XForm_htm2<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
bit RC = 0; // set by isRecordForm
- let Inst{7-9} = 0;
+ let Inst{7...9} = 0;
let Inst{10} = L;
- let Inst{11-20} = 0;
- let Inst{21-30} = xo;
+ let Inst{11...20} = 0;
+ let Inst{21...30} = xo;
let Inst{31} = RC;
}
@@ -975,9 +975,9 @@ class XForm_htm3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
bit RC = 0;
- let Inst{6-8} = BF;
- let Inst{9-20} = 0;
- let Inst{21-30} = xo;
+ let Inst{6...8} = BF;
+ let Inst{9...20} = 0;
+ let Inst{21...30} = xo;
let Inst{31} = RC;
}
@@ -992,12 +992,12 @@ class X_BF3_L1_RS5_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
let Pattern = pattern;
- let Inst{6-8} = BF;
+ let Inst{6...8} = BF;
let Inst{9} = 0;
let Inst{10} = L;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-30} = xo;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -1011,11 +1011,11 @@ class X_BF3_RS5_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
let Pattern = pattern;
- let Inst{6-8} = BF;
- let Inst{9-10} = 0;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-30} = xo;
+ let Inst{6...8} = BF;
+ let Inst{9...10} = 0;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -1035,10 +1035,10 @@ class X_BF3_DCMX7_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
let Pattern = pattern;
- let Inst{6-8} = BF;
- let Inst{9-15} = DCMX;
- let Inst{16-20} = VB;
- let Inst{21-30} = xo;
+ let Inst{6...8} = BF;
+ let Inst{9...15} = DCMX;
+ let Inst{16...20} = VB;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -1050,10 +1050,10 @@ class X_RD6_IMM8<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
let Pattern = pattern;
- let Inst{6-10} = XT{4-0};
- let Inst{11-12} = 0;
- let Inst{13-20} = IMM8;
- let Inst{21-30} = xo;
+ let Inst{6...10} = XT{4...0};
+ let Inst{11...12} = 0;
+ let Inst{13...20} = IMM8;
+ let Inst{21...30} = xo;
let Inst{31} = XT{5};
}
@@ -1092,10 +1092,10 @@ class XX1Form<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = XT{4-0};
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-30} = xo;
+ let Inst{6...10} = XT{4...0};
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...30} = xo;
let Inst{31} = XT{5};
}
@@ -1117,10 +1117,10 @@ class XX2Form<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = XT{4-0};
- let Inst{11-15} = 0;
- let Inst{16-20} = XB{4-0};
- let Inst{21-29} = xo;
+ let Inst{6...10} = XT{4...0};
+ let Inst{11...15} = 0;
+ let Inst{16...20} = XB{4...0};
+ let Inst{21...29} = xo;
let Inst{30} = XB{5};
let Inst{31} = XT{5};
}
@@ -1133,10 +1133,10 @@ class XX2Form_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-8} = CR;
- let Inst{9-15} = 0;
- let Inst{16-20} = XB{4-0};
- let Inst{21-29} = xo;
+ let Inst{6...8} = CR;
+ let Inst{9...15} = 0;
+ let Inst{16...20} = XB{4...0};
+ let Inst{21...29} = xo;
let Inst{30} = XB{5};
let Inst{31} = 0;
}
@@ -1150,11 +1150,11 @@ class XX2Form_2<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = XT{4-0};
- let Inst{11-13} = 0;
- let Inst{14-15} = D;
- let Inst{16-20} = XB{4-0};
- let Inst{21-29} = xo;
+ let Inst{6...10} = XT{4...0};
+ let Inst{11...13} = 0;
+ let Inst{14...15} = D;
+ let Inst{16...20} = XB{4...0};
+ let Inst{21...29} = xo;
let Inst{30} = XB{5};
let Inst{31} = XT{5};
}
@@ -1168,10 +1168,10 @@ class XX2_RD6_UIM5_RS6<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
let Pattern = pattern;
- let Inst{6-10} = XT{4-0};
- let Inst{11-15} = UIM5;
- let Inst{16-20} = XB{4-0};
- let Inst{21-29} = xo;
+ let Inst{6...10} = XT{4...0};
+ let Inst{11...15} = UIM5;
+ let Inst{16...20} = XB{4...0};
+ let Inst{21...29} = xo;
let Inst{30} = XB{5};
let Inst{31} = XT{5};
}
@@ -1185,10 +1185,10 @@ class XX2_RD5_XO5_RS6<bits<6> opcode, bits<5> xo2, bits<9> xo, dag OOL, dag IOL,
let Pattern = pattern;
- let Inst{6-10} = RT;
- let Inst{11-15} = xo2;
- let Inst{16-20} = XB{4-0};
- let Inst{21-29} = xo;
+ let Inst{6...10} = RT;
+ let Inst{11...15} = xo2;
+ let Inst{16...20} = XB{4...0};
+ let Inst{21...29} = xo;
let Inst{30} = XB{5};
let Inst{31} = 0;
}
@@ -1202,10 +1202,10 @@ class XX2_RD6_XO5_RS6<bits<6> opcode, bits<5> xo2, bits<9> xo, dag OOL, dag IOL,
let Pattern = pattern;
- let Inst{6-10} = XT{4-0};
- let Inst{11-15} = xo2;
- let Inst{16-20} = XB{4-0};
- let Inst{21-29} = xo;
+ let Inst{6...10} = XT{4...0};
+ let Inst{11...15} = xo2;
+ let Inst{16...20} = XB{4...0};
+ let Inst{21...29} = xo;
let Inst{30} = XB{5};
let Inst{31} = XT{5};
}
@@ -1219,10 +1219,10 @@ class XX2_BF3_DCMX7_RS6<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
let Pattern = pattern;
- let Inst{6-8} = BF;
- let Inst{9-15} = DCMX;
- let Inst{16-20} = XB{4-0};
- let Inst{21-29} = xo;
+ let Inst{6...8} = BF;
+ let Inst{9...15} = DCMX;
+ let Inst{16...20} = XB{4...0};
+ let Inst{21...29} = xo;
let Inst{30} = XB{5};
let Inst{31} = 0;
}
@@ -1237,12 +1237,12 @@ class XX2_RD6_DCMX7_RS6<bits<6> opcode, bits<4> xo1, bits<3> xo2,
let Pattern = pattern;
- let Inst{6-10} = XT{4-0};
- let Inst{11-15} = DCMX{4-0};
- let Inst{16-20} = XB{4-0};
- let Inst{21-24} = xo1;
+ let Inst{6...10} = XT{4...0};
+ let Inst{11...15} = DCMX{4...0};
+ let Inst{16...20} = XB{4...0};
+ let Inst{21...24} = xo1;
let Inst{25} = DCMX{6};
- let Inst{26-28} = xo2;
+ let Inst{26...28} = xo2;
let Inst{29} = DCMX{5};
let Inst{30} = XB{5};
let Inst{31} = XT{5};
@@ -1257,10 +1257,10 @@ class XForm_XD6_RA5_RB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
let Pattern = pattern;
- let Inst{6-10} = D{4-0}; // D
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-30} = xo;
+ let Inst{6...10} = D{4...0}; // D
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...30} = xo;
let Inst{31} = D{5}; // DX
}
@@ -1273,11 +1273,11 @@ class XForm_BF3_UIM6_FRB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
let Pattern = pattern;
- let Inst{6-8} = BF;
+ let Inst{6...8} = BF;
let Inst{9} = 0;
- let Inst{10-15} = UIM;
- let Inst{16-20} = FRB;
- let Inst{21-30} = xo;
+ let Inst{10...15} = UIM;
+ let Inst{16...20} = FRB;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -1292,11 +1292,11 @@ class XForm_SP2_FRTB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asms
bit RC = 0; // set by isRecordForm
- let Inst{6 - 10} = FRT;
- let Inst{11 - 12} = SP;
- let Inst{13 - 15} = 0;
- let Inst{16 - 20} = FRB;
- let Inst{21 - 30} = xo;
+ let Inst{6...10} = FRT;
+ let Inst{11...12} = SP;
+ let Inst{13...15} = 0;
+ let Inst{16...20} = FRB;
+ let Inst{21...30} = xo;
let Inst{31} = RC;
}
@@ -1311,11 +1311,11 @@ class XForm_S1_FRTB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
bit RC = 0; // set by isRecordForm
- let Inst{6 - 10} = FRT;
+ let Inst{6...10} = FRT;
let Inst{11} = S;
- let Inst{12 - 15} = 0;
- let Inst{16 - 20} = FRB;
- let Inst{21 - 30} = xo;
+ let Inst{12...15} = 0;
+ let Inst{16...20} = FRB;
+ let Inst{21...30} = xo;
let Inst{31} = RC;
}
@@ -1328,10 +1328,10 @@ class XX3Form<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = XT{4-0};
- let Inst{11-15} = XA{4-0};
- let Inst{16-20} = XB{4-0};
- let Inst{21-28} = xo;
+ let Inst{6...10} = XT{4...0};
+ let Inst{11...15} = XA{4...0};
+ let Inst{16...20} = XB{4...0};
+ let Inst{21...28} = xo;
let Inst{29} = XA{5};
let Inst{30} = XB{5};
let Inst{31} = XT{5};
@@ -1353,11 +1353,11 @@ class XX3Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-8} = CR;
- let Inst{9-10} = 0;
- let Inst{11-15} = XA{4-0};
- let Inst{16-20} = XB{4-0};
- let Inst{21-28} = xo;
+ let Inst{6...8} = CR;
+ let Inst{9...10} = 0;
+ let Inst{11...15} = XA{4...0};
+ let Inst{16...20} = XB{4...0};
+ let Inst{21...28} = xo;
let Inst{29} = XA{5};
let Inst{30} = XB{5};
let Inst{31} = 0;
@@ -1373,12 +1373,12 @@ class XX3Form_2<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = XT{4-0};
- let Inst{11-15} = XA{4-0};
- let Inst{16-20} = XB{4-0};
+ let Inst{6...10} = XT{4...0};
+ let Inst{11...15} = XA{4...0};
+ let Inst{16...20} = XB{4...0};
let Inst{21} = 0;
- let Inst{22-23} = D;
- let Inst{24-28} = xo;
+ let Inst{22...23} = D;
+ let Inst{24...28} = xo;
let Inst{29} = XA{5};
let Inst{30} = XB{5};
let Inst{31} = XT{5};
@@ -1395,11 +1395,11 @@ class XX3Form_Rc<bits<6> opcode, bits<7> xo, dag OOL, dag IOL, string asmstr,
bit RC = 0; // set by isRecordForm
- let Inst{6-10} = XT{4-0};
- let Inst{11-15} = XA{4-0};
- let Inst{16-20} = XB{4-0};
+ let Inst{6...10} = XT{4...0};
+ let Inst{11...15} = XA{4...0};
+ let Inst{16...20} = XB{4...0};
let Inst{21} = RC;
- let Inst{22-28} = xo;
+ let Inst{22...28} = xo;
let Inst{29} = XA{5};
let Inst{30} = XB{5};
let Inst{31} = XT{5};
@@ -1415,11 +1415,11 @@ class XX4Form<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = XT{4-0};
- let Inst{11-15} = XA{4-0};
- let Inst{16-20} = XB{4-0};
- let Inst{21-25} = XC{4-0};
- let Inst{26-27} = xo;
+ let Inst{6...10} = XT{4...0};
+ let Inst{11...15} = XA{4...0};
+ let Inst{16...20} = XB{4...0};
+ let Inst{21...25} = XC{4...0};
+ let Inst{26...27} = xo;
let Inst{28} = XC{5};
let Inst{29} = XA{5};
let Inst{30} = XB{5};
@@ -1435,10 +1435,10 @@ class DCB_Form<bits<10> xo, bits<5> immfield, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = immfield;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-30} = xo;
+ let Inst{6...10} = immfield;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -1451,10 +1451,10 @@ class DCB_Form_hint<bits<10> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = TH;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-30} = xo;
+ let Inst{6...10} = TH;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -1469,11 +1469,11 @@ class DSS_Form<bits<1> T, bits<10> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
let Inst{6} = T;
- let Inst{7-8} = 0;
- let Inst{9-10} = STRM;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-30} = xo;
+ let Inst{7...8} = 0;
+ let Inst{9...10} = STRM;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -1487,10 +1487,10 @@ class XLForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = CRD;
- let Inst{11-15} = CRA;
- let Inst{16-20} = CRB;
- let Inst{21-30} = xo;
+ let Inst{6...10} = CRD;
+ let Inst{11...15} = CRA;
+ let Inst{16...20} = CRB;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -1527,10 +1527,10 @@ class XLForm_1_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = CRD;
- let Inst{11-15} = CRD;
- let Inst{16-20} = CRD;
- let Inst{21-30} = xo;
+ let Inst{6...10} = CRD;
+ let Inst{11...15} = CRD;
+ let Inst{16...20} = CRD;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -1543,11 +1543,11 @@ class XLForm_2<bits<6> opcode, bits<10> xo, bit lk, dag OOL, dag IOL, string asm
let Pattern = pattern;
- let Inst{6-10} = BO;
- let Inst{11-15} = BI;
- let Inst{16-18} = 0;
- let Inst{19-20} = BH;
- let Inst{21-30} = xo;
+ let Inst{6...10} = BO;
+ let Inst{11...15} = BI;
+ let Inst{16...18} = 0;
+ let Inst{19...20} = BH;
+ let Inst{21...30} = xo;
let Inst{31} = lk;
}
@@ -1557,9 +1557,9 @@ class XLForm_2_br<bits<6> opcode, bits<10> xo, bit lk,
bits<7> BIBO; // 2 bits of BI and 5 bits of BO.
bits<3> CR;
- let BO = BIBO{4-0};
- let BI{0-1} = BIBO{5-6};
- let BI{2-4} = CR{0-2};
+ let BO = BIBO{4...0};
+ let BI{0...1} = BIBO{5...6};
+ let BI{2...4} = CR{0...2};
let BH = 0;
}
@@ -1584,12 +1584,12 @@ class XLForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
bits<3> BF;
bits<3> BFA;
- let Inst{6-8} = BF;
- let Inst{9-10} = 0;
- let Inst{11-13} = BFA;
- let Inst{14-15} = 0;
- let Inst{16-20} = 0;
- let Inst{21-30} = xo;
+ let Inst{6...8} = BF;
+ let Inst{9...10} = 0;
+ let Inst{11...13} = BFA;
+ let Inst{14...15} = 0;
+ let Inst{16...20} = 0;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -1602,13 +1602,13 @@ class XLForm_4<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
bit RC = 0;
- let Inst{6-8} = BF;
- let Inst{9-10} = 0;
- let Inst{11-14} = 0;
+ let Inst{6...8} = BF;
+ let Inst{9...10} = 0;
+ let Inst{11...14} = 0;
let Inst{15} = W;
- let Inst{16-19} = U;
+ let Inst{16...19} = U;
let Inst{20} = 0;
- let Inst{21-30} = xo;
+ let Inst{21...30} = xo;
let Inst{31} = RC;
}
@@ -1619,9 +1619,9 @@ class XLForm_S<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-19} = 0;
+ let Inst{6...19} = 0;
let Inst{20} = S;
- let Inst{21-30} = xo;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -1640,17 +1640,17 @@ class XLForm_2_and_DSForm_1<bits<6> opcode1, bits<10> xo1, bit lk,
let Pattern = pattern;
- let Inst{6-10} = BO;
- let Inst{11-15} = BI;
- let Inst{16-18} = 0;
- let Inst{19-20} = BH;
- let Inst{21-30} = xo1;
+ let Inst{6...10} = BO;
+ let Inst{11...15} = BI;
+ let Inst{16...18} = 0;
+ let Inst{19...20} = BH;
+ let Inst{21...30} = xo1;
let Inst{31} = lk;
- let Inst{38-42} = RST;
- let Inst{43-47} = RA;
- let Inst{48-61} = D;
- let Inst{62-63} = xo2;
+ let Inst{38...42} = RST;
+ let Inst{43...47} = RA;
+ let Inst{48...61} = D;
+ let Inst{62...63} = xo2;
}
class XLForm_2_ext_and_DSForm_1<bits<6> opcode1, bits<10> xo1,
@@ -1677,16 +1677,16 @@ class XLForm_2_ext_and_DForm_1<bits<6> opcode1, bits<10> xo1, bits<5> bo,
let Pattern = pattern;
- let Inst{6-10} = bo;
- let Inst{11-15} = bi;
- let Inst{16-18} = 0;
- let Inst{19-20} = 0; // Unused (BH)
- let Inst{21-30} = xo1;
+ let Inst{6...10} = bo;
+ let Inst{11...15} = bi;
+ let Inst{16...18} = 0;
+ let Inst{19...20} = 0; // Unused (BH)
+ let Inst{21...30} = xo1;
let Inst{31} = lk;
- let Inst{38-42} = RST;
- let Inst{43-47} = RA;
- let Inst{48-63} = D;
+ let Inst{38...42} = RST;
+ let Inst{43...47} = RA;
+ let Inst{48...63} = D;
}
// 1.7.8 XFX-Form
@@ -1696,7 +1696,7 @@ class XFXForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
bits<5> RST;
bits<10> SPR;
- let Inst{6-10} = RST;
+ let Inst{6...10} = RST;
let Inst{11} = SPR{4};
let Inst{12} = SPR{3};
let Inst{13} = SPR{2};
@@ -1707,7 +1707,7 @@ class XFXForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
let Inst{18} = SPR{7};
let Inst{19} = SPR{6};
let Inst{20} = SPR{5};
- let Inst{21-30} = xo;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -1722,9 +1722,9 @@ class XFXForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
: I<opcode, OOL, IOL, asmstr, itin> {
bits<5> RT;
- let Inst{6-10} = RT;
- let Inst{11-20} = 0;
- let Inst{21-30} = xo;
+ let Inst{6...10} = RT;
+ let Inst{11...20} = 0;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -1735,9 +1735,9 @@ class XFXForm_3p<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
bits<10> imm;
let Pattern = pattern;
- let Inst{6-10} = RT;
- let Inst{11-20} = imm;
- let Inst{21-30} = xo;
+ let Inst{6...10} = RT;
+ let Inst{11...20} = imm;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -1747,11 +1747,11 @@ class XFXForm_5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
bits<8> FXM;
bits<5> RST;
- let Inst{6-10} = RST;
+ let Inst{6...10} = RST;
let Inst{11} = 0;
- let Inst{12-19} = FXM;
+ let Inst{12...19} = FXM;
let Inst{20} = 0;
- let Inst{21-30} = xo;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -1761,11 +1761,11 @@ class XFXForm_5a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
bits<5> RST;
bits<8> FXM;
- let Inst{6-10} = RST;
+ let Inst{6...10} = RST;
let Inst{11} = 1;
- let Inst{12-19} = FXM;
+ let Inst{12...19} = FXM;
let Inst{20} = 0;
- let Inst{21-30} = xo;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -1782,10 +1782,10 @@ class XFLForm<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
let Inst{6} = 0;
- let Inst{7-14} = FM;
+ let Inst{7...14} = FM;
let Inst{15} = 0;
- let Inst{16-20} = RT;
- let Inst{21-30} = xo;
+ let Inst{16...20} = RT;
+ let Inst{21...30} = xo;
let Inst{31} = RC;
}
@@ -1801,10 +1801,10 @@ class XFLForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
let Inst{6} = L;
- let Inst{7-14} = FLM;
+ let Inst{7...14} = FLM;
let Inst{15} = W;
- let Inst{16-20} = FRB;
- let Inst{21-30} = xo;
+ let Inst{16...20} = FRB;
+ let Inst{21...30} = xo;
let Inst{31} = RC;
}
@@ -1819,10 +1819,10 @@ class XSForm_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
bit RC = 0; // set by isRecordForm
let Pattern = pattern;
- let Inst{6-10} = RS;
- let Inst{11-15} = RA;
- let Inst{16-20} = SH{4,3,2,1,0};
- let Inst{21-29} = xo;
+ let Inst{6...10} = RS;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = SH{4,3,2,1,0};
+ let Inst{21...29} = xo;
let Inst{30} = SH{5};
let Inst{31} = RC;
}
@@ -1839,11 +1839,11 @@ class XOForm_1<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL, string asms
bit RC = 0; // set by isRecordForm
- let Inst{6-10} = RT;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
+ let Inst{6...10} = RT;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
let Inst{21} = oe;
- let Inst{22-30} = xo;
+ let Inst{22...30} = xo;
let Inst{31} = RC;
}
@@ -1866,11 +1866,11 @@ class AForm_1<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
bit RC = 0; // set by isRecordForm
- let Inst{6-10} = FRT;
- let Inst{11-15} = FRA;
- let Inst{16-20} = FRB;
- let Inst{21-25} = FRC;
- let Inst{26-30} = xo;
+ let Inst{6...10} = FRT;
+ let Inst{11...15} = FRA;
+ let Inst{16...20} = FRB;
+ let Inst{21...25} = FRC;
+ let Inst{26...30} = xo;
let Inst{31} = RC;
}
@@ -1896,11 +1896,11 @@ class AForm_4<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = RT;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-25} = COND;
- let Inst{26-30} = xo;
+ let Inst{6...10} = RT;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...25} = COND;
+ let Inst{26...30} = xo;
let Inst{31} = 0;
}
@@ -1918,11 +1918,11 @@ class MForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
bit RC = 0; // set by isRecordForm
- let Inst{6-10} = RS;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-25} = MB;
- let Inst{26-30} = ME;
+ let Inst{6...10} = RS;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...25} = MB;
+ let Inst{26...30} = ME;
let Inst{31} = RC;
}
@@ -1939,11 +1939,11 @@ class MForm_2<bits<6> opcode, dag OOL, dag IOL, string asmstr,
bit RC = 0; // set by isRecordForm
- let Inst{6-10} = RS;
- let Inst{11-15} = RA;
- let Inst{16-20} = SH;
- let Inst{21-25} = MB;
- let Inst{26-30} = ME;
+ let Inst{6...10} = RS;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = SH;
+ let Inst{21...25} = MB;
+ let Inst{26...30} = ME;
let Inst{31} = RC;
}
@@ -1960,11 +1960,11 @@ class MDForm_1<bits<6> opcode, bits<3> xo, dag OOL, dag IOL, string asmstr,
bit RC = 0; // set by isRecordForm
- let Inst{6-10} = RS;
- let Inst{11-15} = RA;
- let Inst{16-20} = SH{4,3,2,1,0};
- let Inst{21-26} = MBE{4,3,2,1,0,5};
- let Inst{27-29} = xo;
+ let Inst{6...10} = RS;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = SH{4,3,2,1,0};
+ let Inst{21...26} = MBE{4,3,2,1,0,5};
+ let Inst{27...29} = xo;
let Inst{30} = SH{5};
let Inst{31} = RC;
}
@@ -1981,11 +1981,11 @@ class MDSForm_1<bits<6> opcode, bits<4> xo, dag OOL, dag IOL, string asmstr,
bit RC = 0; // set by isRecordForm
- let Inst{6-10} = RS;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-26} = MBE{4,3,2,1,0,5};
- let Inst{27-30} = xo;
+ let Inst{6...10} = RS;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...26} = MBE{4,3,2,1,0,5};
+ let Inst{27...30} = xo;
let Inst{31} = RC;
}
@@ -2003,11 +2003,11 @@ class VAForm_1<bits<6> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = RT;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-25} = RC;
- let Inst{26-31} = xo;
+ let Inst{6...10} = RT;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...25} = RC;
+ let Inst{26...31} = xo;
}
// VAForm_1a - DABC ordering.
@@ -2021,11 +2021,11 @@ class VAForm_1a<bits<6> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = RT;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-25} = RC;
- let Inst{26-31} = xo;
+ let Inst{6...10} = RT;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...25} = RC;
+ let Inst{26...31} = xo;
}
class VAForm_2<bits<6> xo, dag OOL, dag IOL, string asmstr,
@@ -2038,12 +2038,12 @@ class VAForm_2<bits<6> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = RT;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
+ let Inst{6...10} = RT;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
let Inst{21} = 0;
- let Inst{22-25} = SH;
- let Inst{26-31} = xo;
+ let Inst{22...25} = SH;
+ let Inst{26...31} = xo;
}
// E-2 VX-Form
@@ -2056,10 +2056,10 @@ class VXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = VD;
- let Inst{11-15} = VA;
- let Inst{16-20} = VB;
- let Inst{21-31} = xo;
+ let Inst{6...10} = VD;
+ let Inst{11...15} = VA;
+ let Inst{16...20} = VB;
+ let Inst{21...31} = xo;
}
class VXForm_setzero<bits<11> xo, dag OOL, dag IOL, string asmstr,
@@ -2078,10 +2078,10 @@ class VXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = VD;
- let Inst{11-15} = 0;
- let Inst{16-20} = VB;
- let Inst{21-31} = xo;
+ let Inst{6...10} = VD;
+ let Inst{11...15} = 0;
+ let Inst{16...20} = VB;
+ let Inst{21...31} = xo;
}
class VXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
@@ -2092,10 +2092,10 @@ class VXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = VD;
- let Inst{11-15} = IMM;
- let Inst{16-20} = 0;
- let Inst{21-31} = xo;
+ let Inst{6...10} = VD;
+ let Inst{11...15} = IMM;
+ let Inst{16...20} = 0;
+ let Inst{21...31} = xo;
}
/// VXForm_4 - VX instructions with "VD,0,0" register fields, like mfvscr.
@@ -2106,10 +2106,10 @@ class VXForm_4<bits<11> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = VD;
- let Inst{11-15} = 0;
- let Inst{16-20} = 0;
- let Inst{21-31} = xo;
+ let Inst{6...10} = VD;
+ let Inst{11...15} = 0;
+ let Inst{16...20} = 0;
+ let Inst{21...31} = xo;
}
/// VXForm_5 - VX instructions with "0,0,VB" register fields, like mtvscr.
@@ -2120,10 +2120,10 @@ class VXForm_5<bits<11> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = 0;
- let Inst{11-15} = 0;
- let Inst{16-20} = VB;
- let Inst{21-31} = xo;
+ let Inst{6...10} = 0;
+ let Inst{11...15} = 0;
+ let Inst{16...20} = VB;
+ let Inst{21...31} = xo;
}
// e.g. [PO VRT EO VRB XO]
@@ -2135,10 +2135,10 @@ class VXForm_RD5_XO5_RS5<bits<11> xo, bits<5> eo, dag OOL, dag IOL,
let Pattern = pattern;
- let Inst{6-10} = VD;
- let Inst{11-15} = eo;
- let Inst{16-20} = VB;
- let Inst{21-31} = xo;
+ let Inst{6...10} = VD;
+ let Inst{11...15} = eo;
+ let Inst{16...20} = VB;
+ let Inst{21...31} = xo;
}
/// VXForm_CR - VX crypto instructions with "VRT, VRA, ST, SIX"
@@ -2152,11 +2152,11 @@ class VXForm_CR<bits<11> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = VD;
- let Inst{11-15} = VA;
+ let Inst{6...10} = VD;
+ let Inst{11...15} = VA;
let Inst{16} = ST;
- let Inst{17-20} = SIX;
- let Inst{21-31} = xo;
+ let Inst{17...20} = SIX;
+ let Inst{21...31} = xo;
}
/// VXForm_BX - VX crypto instructions with "VRT, VRA, 0 - like vsbox"
@@ -2168,10 +2168,10 @@ class VXForm_BX<bits<11> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = VD;
- let Inst{11-15} = VA;
- let Inst{16-20} = 0;
- let Inst{21-31} = xo;
+ let Inst{6...10} = VD;
+ let Inst{11...15} = VA;
+ let Inst{16...20} = 0;
+ let Inst{21...31} = xo;
}
// E-4 VXR-Form
@@ -2185,11 +2185,11 @@ class VXRForm_1<bits<10> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = VD;
- let Inst{11-15} = VA;
- let Inst{16-20} = VB;
+ let Inst{6...10} = VD;
+ let Inst{11...15} = VA;
+ let Inst{16...20} = VB;
let Inst{21} = RC;
- let Inst{22-31} = xo;
+ let Inst{22...31} = xo;
}
// VX-Form: [PO VRT EO VRB 1 PS XO]
@@ -2203,12 +2203,12 @@ class VX_RD5_EO5_RS5_PS1_XO9<bits<5> eo, bits<9> xo,
let Pattern = pattern;
- let Inst{6-10} = VD;
- let Inst{11-15} = eo;
- let Inst{16-20} = VB;
+ let Inst{6...10} = VD;
+ let Inst{11...15} = eo;
+ let Inst{16...20} = VB;
let Inst{21} = 1;
let Inst{22} = PS;
- let Inst{23-31} = xo;
+ let Inst{23...31} = xo;
}
// VX-Form: [PO VRT VRA VRB 1 PS XO] or [PO VRT VRA VRB 1 / XO]
@@ -2222,12 +2222,12 @@ class VX_RD5_RSp5_PS1_XO9<bits<9> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = VD;
- let Inst{11-15} = VA;
- let Inst{16-20} = VB;
+ let Inst{6...10} = VD;
+ let Inst{11...15} = VA;
+ let Inst{16...20} = VB;
let Inst{21} = 1;
let Inst{22} = PS;
- let Inst{23-31} = xo;
+ let Inst{23...31} = xo;
}
class Z22Form_BF3_FRA5_DCM6<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
@@ -2240,11 +2240,11 @@ class Z22Form_BF3_FRA5_DCM6<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
let Pattern = pattern;
- let Inst{6-8} = BF;
- let Inst{9-10} = 0;
- let Inst{11-15} = FRA;
- let Inst{16-21} = DCM;
- let Inst{22-30} = xo;
+ let Inst{6...8} = BF;
+ let Inst{9...10} = 0;
+ let Inst{11...15} = FRA;
+ let Inst{16...21} = DCM;
+ let Inst{22...30} = xo;
let Inst{31} = 0;
}
@@ -2260,10 +2260,10 @@ class Z22Form_FRTA5_SH6<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
bit RC = 0; // set by isRecordForm
- let Inst{6 - 10} = FRT;
- let Inst{11 - 15} = FRA;
- let Inst{16 - 21} = SH;
- let Inst{22 - 30} = xo;
+ let Inst{6...10} = FRT;
+ let Inst{11...15} = FRA;
+ let Inst{16...21} = SH;
+ let Inst{22...30} = xo;
let Inst{31} = RC;
}
@@ -2279,12 +2279,12 @@ class Z23Form_8<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
bit RC = 0; // set by isRecordForm
- let Inst{6-10} = VRT;
- let Inst{11-14} = 0;
+ let Inst{6...10} = VRT;
+ let Inst{11...14} = 0;
let Inst{15} = R;
- let Inst{16-20} = VRB;
- let Inst{21-22} = idx;
- let Inst{23-30} = xo;
+ let Inst{16...20} = VRB;
+ let Inst{21...22} = idx;
+ let Inst{23...30} = xo;
let Inst{31} = RC;
}
@@ -2298,11 +2298,11 @@ class Z23Form_RTAB5_CY2<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
let Pattern = pattern;
- let Inst{6-10} = RT;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-22} = CY;
- let Inst{23-30} = xo;
+ let Inst{6...10} = RT;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...22} = CY;
+ let Inst{23...30} = xo;
let Inst{31} = 0;
}
@@ -2318,11 +2318,11 @@ class Z23Form_FRTAB5_RMC2<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
bit RC = 0; // set by isRecordForm
- let Inst{6 - 10} = FRT;
- let Inst{11 - 15} = FRA;
- let Inst{16 - 20} = FRB;
- let Inst{21 - 22} = RMC;
- let Inst{23 - 30} = xo;
+ let Inst{6...10} = FRT;
+ let Inst{11...15} = FRA;
+ let Inst{16...20} = FRB;
+ let Inst{21...22} = RMC;
+ let Inst{23...30} = xo;
let Inst{31} = RC;
}
@@ -2345,12 +2345,12 @@ class Z23Form_FRTB5_R1_RMC2<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
bit RC = 0; // set by isRecordForm
- let Inst{6 - 10} = FRT;
- let Inst{11 - 14} = 0;
+ let Inst{6...10} = FRT;
+ let Inst{11...14} = 0;
let Inst{15} = R;
- let Inst{16 - 20} = FRB;
- let Inst{21 - 22} = RMC;
- let Inst{23 - 30} = xo;
+ let Inst{16...20} = FRB;
+ let Inst{21...22} = RMC;
+ let Inst{23...30} = xo;
let Inst{31} = RC;
}
@@ -2362,7 +2362,7 @@ class PPCEmitTimePseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
let isCodeGenOnly = 1;
let PPC64 = 0;
let Pattern = pattern;
- let Inst{31-0} = 0;
+ let Inst{31...0} = 0;
let hasNoSchedulingInfo = 1;
}
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
index 80fac18d5737..a12dfae2a0d7 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
@@ -13,7 +13,7 @@
class XOForm_RTAB5_L1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
string asmstr, list<dag> pattern>
- : I<opcode, OOL, IOL, asmstr, NoItinerary> {
+ : I<opcode, OOL, IOL, asmstr, NoItinerary> {
bits<5> RT;
bits<5> RA;
bits<5> RB;
@@ -21,64 +21,174 @@ class XOForm_RTAB5_L1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
let Pattern = pattern;
- bit RC = 0; // set by isRecordForm
+ bit RC = 0; // set by isRecordForm
- let Inst{6-10} = RT;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21} = L;
- let Inst{22-30} = xo;
- let Inst{31} = RC;
+ let Inst{6...10} = RT;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21} = L;
+ let Inst{22...30} = xo;
+ let Inst{31} = RC;
}
multiclass XOForm_RTAB5_L1r<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
- string asmbase, string asmstr,
- list<dag> pattern> {
+ string asmbase, string asmstr, list<dag> pattern> {
let BaseName = asmbase in {
def NAME : XOForm_RTAB5_L1<opcode, xo, OOL, IOL,
!strconcat(asmbase, !strconcat(" ", asmstr)),
- pattern>, RecFormRel;
- let Defs = [CR0] in
- def _rec : XOForm_RTAB5_L1<opcode, xo, OOL, IOL,
- !strconcat(asmbase, !strconcat(". ", asmstr)),
- []>, isRecordForm, RecFormRel;
+ pattern>,
+ RecFormRel;
+ let Defs = [CR0] in def _rec
+ : XOForm_RTAB5_L1<opcode, xo, OOL, IOL,
+ !strconcat(asmbase, !strconcat(". ", asmstr)), []>,
+ isRecordForm, RecFormRel;
}
}
+class VXForm_VRTB5<bits<11> xo, bits<5> R, dag OOL, dag IOL, string asmstr,
+ list<dag> pattern> : I<4, OOL, IOL, asmstr, NoItinerary> {
+ bits<5> VRT;
+ bits<5> VRB;
+
+ let Pattern = pattern;
+
+ let Inst{6...10} = VRT;
+ let Inst{11...15} = R;
+ let Inst{16...20} = VRB;
+ let Inst{21...31} = xo;
+}
+
+class VXForm_VRTB5_UIM2<bits<11> xo, bits<3> R, dag OOL, dag IOL, string asmstr,
+ list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, NoItinerary> {
+ bits<5> VRT;
+ bits<5> VRB;
+ bits<2> UIM;
+
+ let Pattern = pattern;
+
+ let Inst{6...10} = VRT;
+ let Inst{11...13} = R;
+ let Inst{14...15} = UIM;
+ let Inst{16...20} = VRB;
+ let Inst{21...31} = xo;
+}
+
+class VXForm_VRTB5_UIM1<bits<11> xo, bits<4> R, dag OOL, dag IOL, string asmstr,
+ list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, NoItinerary> {
+ bits<5> VRT;
+ bits<5> VRB;
+ bits<1> UIM;
+
+ let Pattern = pattern;
+
+ let Inst{6...10} = VRT;
+ let Inst{11...14} = R;
+ let Inst{15} = UIM;
+ let Inst{16...20} = VRB;
+ let Inst{21...31} = xo;
+}
+
+class VXForm_VRTB5_UIM3<bits<11> xo, bits<2> R, dag OOL, dag IOL, string asmstr,
+ list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, NoItinerary> {
+ bits<5> VRT;
+ bits<5> VRB;
+ bits<3> UIM;
+
+ let Pattern = pattern;
+
+ let Inst{6...10} = VRT;
+ let Inst{11...12} = R;
+ let Inst{13...15} = UIM;
+ let Inst{16...20} = VRB;
+ let Inst{21...31} = xo;
+}
+
+class VXForm_VRTAB5<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ list<dag> pattern> : I<4, OOL, IOL, asmstr, NoItinerary> {
+ bits<5> VRT;
+ bits<5> VRA;
+ bits<5> VRB;
+
+ let Pattern = pattern;
+
+ let Inst{6...10} = VRT;
+ let Inst{11...15} = VRA;
+ let Inst{16...20} = VRB;
+ let Inst{21...31} = xo;
+}
+
let Predicates = [IsISAFuture] in {
defm SUBFUS : XOForm_RTAB5_L1r<31, 72, (outs g8rc:$RT),
- (ins g8rc:$RA, g8rc:$RB, u1imm:$L),
- "subfus", "$RT, $L, $RA, $RB", []>;
+ (ins g8rc:$RA, g8rc:$RB, u1imm:$L), "subfus",
+ "$RT, $L, $RA, $RB", []>;
}
let Predicates = [HasVSX, IsISAFuture] in {
let mayLoad = 1 in {
- def LXVRL
- : XX1Form_memOp<31, 525, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB),
- "lxvrl $XT, $RA, $RB", IIC_LdStLoad, []>;
- def LXVRLL
- : XX1Form_memOp<31, 557, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB),
- "lxvrll $XT, $RA, $RB", IIC_LdStLoad, []>;
- def LXVPRL
- : XForm_XTp5_XAB5<31, 589, (outs vsrprc:$XTp), (ins memr:$RA, g8rc:$RB),
- "lxvprl $XTp, $RA, $RB", IIC_LdStLFD, []>;
- def LXVPRLL
- : XForm_XTp5_XAB5<31, 621, (outs vsrprc:$XTp), (ins memr:$RA, g8rc:$RB),
- "lxvprll $XTp, $RA, $RB", IIC_LdStLFD, []>;
+ def LXVRL : XX1Form_memOp<31, 525, (outs vsrc:$XT),
+ (ins (memr $RA):$addr, g8rc:$RB),
+ "lxvrl $XT, $addr, $RB", IIC_LdStLoad, []>;
+ def LXVRLL : XX1Form_memOp<31, 557, (outs vsrc:$XT),
+ (ins (memr $RA):$addr, g8rc:$RB),
+ "lxvrll $XT, $addr, $RB", IIC_LdStLoad, []>;
+ def LXVPRL : XForm_XTp5_XAB5<31, 589, (outs vsrprc:$XTp),
+ (ins (memr $RA):$addr, g8rc:$RB),
+ "lxvprl $XTp, $addr, $RB", IIC_LdStLFD, []>;
+ def LXVPRLL : XForm_XTp5_XAB5<31, 621, (outs vsrprc:$XTp),
+ (ins (memr $RA):$addr, g8rc:$RB),
+ "lxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>;
}
let mayStore = 1 in {
- def STXVRL
- : XX1Form_memOp<31, 653, (outs), (ins vsrc:$XT, memr:$RA, g8rc:$RB),
- "stxvrl $XT, $RA, $RB", IIC_LdStLoad, []>;
- def STXVRLL
- : XX1Form_memOp<31, 685, (outs), (ins vsrc:$XT, memr:$RA, g8rc:$RB),
- "stxvrll $XT, $RA, $RB", IIC_LdStLoad, []>;
+ def STXVRL : XX1Form_memOp<31, 653, (outs),
+ (ins vsrc:$XT, (memr $RA):$addr, g8rc:$RB),
+ "stxvrl $XT, $addr, $RB", IIC_LdStLoad, []>;
+ def STXVRLL : XX1Form_memOp<31, 685, (outs),
+ (ins vsrc:$XT, (memr $RA):$addr, g8rc:$RB),
+ "stxvrll $XT, $addr, $RB", IIC_LdStLoad, []>;
def STXVPRL : XForm_XTp5_XAB5<31, 717, (outs),
- (ins vsrprc:$XTp, memr:$RA, g8rc:$RB),
- "stxvprl $XTp, $RA, $RB", IIC_LdStLFD, []>;
- def STXVPRLL : XForm_XTp5_XAB5<31, 749, (outs),
- (ins vsrprc:$XTp, memr:$RA, g8rc:$RB),
- "stxvprll $XTp, $RA, $RB", IIC_LdStLFD, []>;
+ (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB),
+ "stxvprl $XTp, $addr, $RB", IIC_LdStLFD, []>;
+ def STXVPRLL
+ : XForm_XTp5_XAB5<31, 749, (outs),
+ (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB),
+ "stxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>;
}
+
+ def VUPKHSNTOB : VXForm_VRTB5<387, 0, (outs vrrc:$VRT), (ins vrrc:$VRB),
+ "vupkhsntob $VRT, $VRB", []>;
+ def VUPKLSNTOB : VXForm_VRTB5<387, 1, (outs vrrc:$VRT), (ins vrrc:$VRB),
+ "vupklsntob $VRT, $VRB", []>;
+ def VUPKINT4TOBF16
+ : VXForm_VRTB5_UIM2<387, 2, (outs vrrc:$VRT), (ins vrrc:$VRB, u2imm:$UIM),
+ "vupkint4tobf16 $VRT, $VRB, $UIM", []>;
+ def VUPKINT8TOBF16
+ : VXForm_VRTB5_UIM1<387, 1, (outs vrrc:$VRT), (ins vrrc:$VRB, u1imm:$UIM),
+ "vupkint8tobf16 $VRT, $VRB, $UIM", []>;
+ def VUPKINT8TOFP32
+ : VXForm_VRTB5_UIM2<387, 3, (outs vrrc:$VRT), (ins vrrc:$VRB, u2imm:$UIM),
+ "vupkint8tofp32 $VRT, $VRB, $UIM", []>;
+ def VUPKINT4TOFP32
+ : VXForm_VRTB5_UIM3<387, 2, (outs vrrc:$VRT), (ins vrrc:$VRB, u3imm:$UIM),
+ "vupkint4tofp32 $VRT, $VRB, $UIM", []>;
+
+ def VUCMPRHN : VXForm_VRTAB5<3, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB),
+ "vucmprhn $VRT, $VRA, $VRB", []>;
+ def VUCMPRLN : VXForm_VRTAB5<67, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB),
+ "vucmprln $VRT, $VRA, $VRB", []>;
+ def VUCMPRHB
+ : VXForm_VRTAB5<131, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB),
+ "vucmprhb $VRT, $VRA, $VRB", []>;
+ def VUCMPRLB
+ : VXForm_VRTAB5<195, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB),
+ "vucmprlb $VRT, $VRA, $VRB", []>;
+ def VUCMPRHH
+ : VXForm_VRTAB5<259, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB),
+ "vucmprhh $VRT, $VRA, $VRB", []>;
+ def VUCMPRLH
+ : VXForm_VRTAB5<323, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB),
+ "vucmprlh $VRT, $VRA, $VRB", []>;
}
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td b/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td
index ef8b27f9b8d3..884895793752 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td
@@ -8,12 +8,13 @@
//===----------------------------------------------------------------------===//
//
// This file describes the instructions introduced for the Future CPU for MMA.
+// Please reference "PPCInstrVSX.td" for file structure.
//
//===----------------------------------------------------------------------===//
class XX3Form_AT3_XABp5_P1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
string asmstr, list<dag> pattern>
- : I<opcode, OOL, IOL, asmstr, NoItinerary> {
+ : I<opcode, OOL, IOL, asmstr, NoItinerary> {
bits<3> AT;
bits<5> XAp;
bits<5> XBp;
@@ -21,13 +22,13 @@ class XX3Form_AT3_XABp5_P1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
let Pattern = pattern;
- let Inst{6-8} = AT{2-0};
- let Inst{9-10} = 0;
- let Inst{11-14} = XAp{3-0};
+ let Inst{6...8} = AT{2...0};
+ let Inst{9...10} = 0;
+ let Inst{11...14} = XAp{3...0};
let Inst{15} = P;
- let Inst{16-19} = XBp{3-0};
+ let Inst{16...19} = XBp{3...0};
let Inst{20} = 0;
- let Inst{21-28} = xo;
+ let Inst{21...28} = xo;
let Inst{29} = XAp{4};
let Inst{30} = XBp{4};
let Inst{31} = 0;
@@ -35,65 +36,64 @@ class XX3Form_AT3_XABp5_P1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
class XX2Form_AT3_XBp5_P2<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
string asmstr, list<dag> pattern>
- : I<opcode, OOL, IOL, asmstr, NoItinerary> {
+ : I<opcode, OOL, IOL, asmstr, NoItinerary> {
bits<3> AT;
bits<5> XBp;
bits<2> P;
let Pattern = pattern;
- let Inst{6-8} = AT{2-0};
- let Inst{9-14} = 0;
+ let Inst{6...8} = AT{2...0};
+ let Inst{9...14} = 0;
let Inst{15} = P{0};
- let Inst{16-19} = XBp{3-0};
+ let Inst{16...19} = XBp{3...0};
let Inst{20} = P{1};
- let Inst{21-29} = xo;
+ let Inst{21...29} = xo;
let Inst{30} = XBp{4};
let Inst{31} = 0;
}
class XForm_ATB3<bits<6> opcode, bits<5> o, bits<10> xo, dag OOL, dag IOL,
string asmstr, list<dag> pattern>
- : I <opcode, OOL, IOL, asmstr, NoItinerary> {
+ : I<opcode, OOL, IOL, asmstr, NoItinerary> {
bits<3> AT;
bits<3> AB;
let Pattern = pattern;
- let Inst{6-8} = AT{2-0};
- let Inst{9-10} = 0;
- let Inst{11-15} = o;
- let Inst{16-18} = AB{2-0};
- let Inst{19-20} = 0;
- let Inst{21-30} = xo;
+ let Inst{6...8} = AT{2...0};
+ let Inst{9...10} = 0;
+ let Inst{11...15} = o;
+ let Inst{16...18} = AB{2...0};
+ let Inst{19...20} = 0;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
class XX3Form_AT3_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
- string asmstr, InstrItinClass itin,
- list<dag> pattern>
- : I<opcode, OOL, IOL, asmstr, itin> {
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
bits<3> AT;
bits<5> XAp;
bits<6> XB;
let Pattern = pattern;
- let Inst{6-8} = AT;
- let Inst{9-10} = 0;
- let Inst{11-14} = XAp{3-0};
+ let Inst{6...8} = AT;
+ let Inst{9...10} = 0;
+ let Inst{11...14} = XAp{3...0};
let Inst{15} = 0;
- let Inst{16-20} = XB{4-0};
- let Inst{21-28} = xo;
- let Inst{29} = XAp{4};
- let Inst{30} = XB{5};
+ let Inst{16...20} = XB{4...0};
+ let Inst{21...28} = xo;
+ let Inst{29} = XAp{4};
+ let Inst{30} = XB{5};
let Inst{31} = 0;
}
class MMIRR_XX3Form_X8YP4_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
string asmstr, InstrItinClass itin,
list<dag> pattern>
- : PI<1, opcode, OOL, IOL, asmstr, itin> {
+ : PI<1, opcode, OOL, IOL, asmstr, itin> {
bits<3> AT;
bits<5> XAp;
bits<6> XB;
@@ -104,29 +104,29 @@ class MMIRR_XX3Form_X8YP4_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
let Pattern = pattern;
// The prefix.
- let Inst{6-7} = 3;
- let Inst{8-11} = 9;
- let Inst{12-15} = 0;
- let Inst{16-19} = PMSK;
- let Inst{20-27} = XMSK;
- let Inst{28-31} = YMSK;
+ let Inst{6...7} = 3;
+ let Inst{8...11} = 9;
+ let Inst{12...15} = 0;
+ let Inst{16...19} = PMSK;
+ let Inst{20...27} = XMSK;
+ let Inst{28...31} = YMSK;
// The instruction.
- let Inst{38-40} = AT;
- let Inst{41-42} = 0;
- let Inst{43-46} = XAp{3-0};
+ let Inst{38...40} = AT;
+ let Inst{41...42} = 0;
+ let Inst{43...46} = XAp{3...0};
let Inst{47} = 0;
- let Inst{48-52} = XB{4-0};
- let Inst{53-60} = xo;
+ let Inst{48...52} = XB{4...0};
+ let Inst{53...60} = xo;
let Inst{61} = XAp{4};
let Inst{62} = XB{5};
let Inst{63} = 0;
}
class MMIRR_XX3Form_X8Y4P2_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
- string asmstr, InstrItinClass itin,
- list<dag> pattern>
- : PI<1, opcode, OOL, IOL, asmstr, itin> {
+ string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : PI<1, opcode, OOL, IOL, asmstr, itin> {
bits<3> AT;
bits<5> XAp;
bits<6> XB;
@@ -137,21 +137,21 @@ class MMIRR_XX3Form_X8Y4P2_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
let Pattern = pattern;
// The prefix.
- let Inst{6-7} = 3;
- let Inst{8-11} = 9;
- let Inst{12-15} = 0;
- let Inst{16-17} = PMSK;
- let Inst{18-19} = 0;
- let Inst{20-27} = XMSK;
- let Inst{28-31} = YMSK;
+ let Inst{6...7} = 3;
+ let Inst{8...11} = 9;
+ let Inst{12...15} = 0;
+ let Inst{16...17} = PMSK;
+ let Inst{18...19} = 0;
+ let Inst{20...27} = XMSK;
+ let Inst{28...31} = YMSK;
// The instruction.
- let Inst{38-40} = AT;
- let Inst{41-42} = 0;
- let Inst{43-46} = XAp{3-0};
+ let Inst{38...40} = AT;
+ let Inst{41...42} = 0;
+ let Inst{43...46} = XAp{3...0};
let Inst{47} = 0;
- let Inst{48-52} = XB{4-0};
- let Inst{53-60} = xo;
+ let Inst{48...52} = XB{4...0};
+ let Inst{53...60} = xo;
let Inst{61} = XAp{4};
let Inst{62} = XB{5};
let Inst{63} = 0;
@@ -160,14 +160,15 @@ class MMIRR_XX3Form_X8Y4P2_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
multiclass DMR_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
string asmstr> {
let Predicates = [MMA, IsISAFuture] in {
- def NAME :
- XX3Form_AT3_XAp5B6<opcode, !or(xo, 0x01), (outs dmr:$AT), IOL,
- !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>,
- RegConstraint<"@earlyclobber $AT">;
- def PP :
- XX3Form_AT3_XAp5B6<opcode, xo, (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
- !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def NAME
+ : XX3Form_AT3_XAp5B6<opcode, !or(xo, 0x01), (outs dmr:$AT), IOL,
+ !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"@earlyclobber $AT">;
+ def PP
+ : XX3Form_AT3_XAp5B6<opcode, xo, (outs dmr:$AT),
+ !con((ins dmr:$ATi), IOL),
+ !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">;
}
}
@@ -175,202 +176,217 @@ multiclass DMR_UM_M448_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
string asmstr> {
defm NAME : DMR_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
- def PM#NAME :
- MMIRR_XX3Form_X8YP4_XAp5B6<
- opcode, !or(xo, 0x01), (outs dmr:$AT),
- !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK)),
- !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
- IIC_VecFP, []>,
- RegConstraint<"@earlyclobber $AT">;
- def PM#NAME#PP :
- MMIRR_XX3Form_X8YP4_XAp5B6<
- opcode, xo, (outs dmr:$AT),
- !con((ins dmr:$ATi),
- !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK))),
- !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
- IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def PM#NAME
+ : MMIRR_XX3Form_X8YP4_XAp5B6<
+ opcode, !or(xo, 0x01), (outs dmr:$AT),
+ !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK)),
+ !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"@earlyclobber $AT">;
+ def PM#NAME#PP
+ : MMIRR_XX3Form_X8YP4_XAp5B6<
+ opcode, xo, (outs dmr:$AT),
+ !con((ins dmr:$ATi),
+ !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK))),
+ !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">;
}
}
multiclass DMR_BF16_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
- string asmstr> {
+ string asmstr> {
let Predicates = [MMA, IsISAFuture] in {
- def NAME :
- XX3Form_AT3_XAp5B6<opcode, !or(xo, 0x11), (outs dmr:$AT), IOL,
- !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>,
- RegConstraint<"@earlyclobber $AT">;
- def PP :
- XX3Form_AT3_XAp5B6<opcode, xo, (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
- !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def NAME
+ : XX3Form_AT3_XAp5B6<opcode, !or(xo, 0x11), (outs dmr:$AT), IOL,
+ !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"@earlyclobber $AT">;
+ def PP
+ : XX3Form_AT3_XAp5B6<opcode, xo, (outs dmr:$AT),
+ !con((ins dmr:$ATi), IOL),
+ !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">;
}
}
-multiclass DMR_BF16_UM_M284_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
- string asmstr> {
+multiclass DMR_BF16_UM_M284_XOEO<bits<6> opcode, bits<8> xo, dag IOL,
+ string asmbase, string asmstr> {
defm NAME : DMR_BF16_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
- def PM#NAME :
- MMIRR_XX3Form_X8Y4P2_XAp5B6<
- opcode, !or(xo, 0x11), (outs dmr:$AT),
- !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)),
- !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
- IIC_VecFP, []>,
- RegConstraint<"@earlyclobber $AT">;
- def PM#NAME#PP :
- MMIRR_XX3Form_X8Y4P2_XAp5B6<
- opcode, xo, (outs dmr:$AT),
- !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
- !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
- IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def PM#NAME
+ : MMIRR_XX3Form_X8Y4P2_XAp5B6<
+ opcode, !or(xo, 0x11), (outs dmr:$AT),
+ !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)),
+ !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"@earlyclobber $AT">;
+ def PM#NAME#PP
+ : MMIRR_XX3Form_X8Y4P2_XAp5B6<
+ opcode, xo, (outs dmr:$AT),
+ !con((ins dmr:$ATi),
+ !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+ !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">;
}
}
-multiclass DMR_F16_UM_M284_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
- string asmstr> {
+multiclass DMR_F16_UM_M284_XOEO<bits<6> opcode, bits<8> xo, dag IOL,
+ string asmbase, string asmstr> {
defm NAME : DMR_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
- def PM#NAME :
- MMIRR_XX3Form_X8Y4P2_XAp5B6<
- opcode, !or(xo, 0x01), (outs dmr:$AT),
- !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)),
- !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
- IIC_VecFP, []>,
- RegConstraint<"@earlyclobber $AT">;
- def PM#NAME#PP :
- MMIRR_XX3Form_X8Y4P2_XAp5B6<
- opcode, xo, (outs dmr:$AT),
- !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
- !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
- IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def PM#NAME
+ : MMIRR_XX3Form_X8Y4P2_XAp5B6<
+ opcode, !or(xo, 0x01), (outs dmr:$AT),
+ !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)),
+ !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"@earlyclobber $AT">;
+ def PM#NAME#PP
+ : MMIRR_XX3Form_X8Y4P2_XAp5B6<
+ opcode, xo, (outs dmr:$AT),
+ !con((ins dmr:$ATi),
+ !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+ !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">;
}
}
multiclass DMR_NEG_UM_M284_XOXORf939a0<bits<6> opcode, bits<8> xo, dag IOL,
- string asmbase, string asmstr> {
+ string asmbase, string asmstr> {
defm NAME : DMR_BF16_UM_M284_XOEO<opcode, xo, IOL, asmbase, asmstr>;
let Predicates = [MMA, IsISAFuture] in {
- def PN : XX3Form_AT3_XAp5B6<
- opcode, !xor(xo, 0xF9), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
- !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
- def NP : XX3Form_AT3_XAp5B6<
- opcode, !xor(xo, 0x39), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
- !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
- def NN : XX3Form_AT3_XAp5B6<
- opcode, !xor(xo, 0xA0), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
- !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def PN
+ : XX3Form_AT3_XAp5B6<opcode, !xor(xo, 0xF9), (outs dmr:$AT),
+ !con((ins dmr:$ATi), IOL),
+ !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">;
+ def NP
+ : XX3Form_AT3_XAp5B6<opcode, !xor(xo, 0x39), (outs dmr:$AT),
+ !con((ins dmr:$ATi), IOL),
+ !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">;
+ def NN
+ : XX3Form_AT3_XAp5B6<opcode, !xor(xo, 0xA0), (outs dmr:$AT),
+ !con((ins dmr:$ATi), IOL),
+ !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">;
}
let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
- def PM#NAME#PN :
- MMIRR_XX3Form_X8Y4P2_XAp5B6<
- opcode, !xor(xo, 0xF9), (outs dmr:$AT),
- !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
- !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"),
- IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
- def PM#NAME#NP :
- MMIRR_XX3Form_X8Y4P2_XAp5B6<
- opcode, !xor(xo, 0x39), (outs dmr:$AT),
- !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
- !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"),
- IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
- def PM#NAME#NN :
- MMIRR_XX3Form_X8Y4P2_XAp5B6<
- opcode, !xor(xo, 0xA0), (outs dmr:$AT),
- !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
- !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"),
- IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def PM#NAME#PN
+ : MMIRR_XX3Form_X8Y4P2_XAp5B6<
+ opcode, !xor(xo, 0xF9), (outs dmr:$AT),
+ !con((ins dmr:$ATi),
+ !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+ !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">;
+ def PM#NAME#NP
+ : MMIRR_XX3Form_X8Y4P2_XAp5B6<
+ opcode, !xor(xo, 0x39), (outs dmr:$AT),
+ !con((ins dmr:$ATi),
+ !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+ !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">;
+ def PM#NAME#NN
+ : MMIRR_XX3Form_X8Y4P2_XAp5B6<
+ opcode, !xor(xo, 0xA0), (outs dmr:$AT),
+ !con((ins dmr:$ATi),
+ !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+ !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">;
}
}
multiclass DMR_NEG_UM_M284_XOXORd11188<bits<6> opcode, bits<8> xo, dag IOL,
- string asmbase, string asmstr> {
+ string asmbase, string asmstr> {
defm NAME : DMR_F16_UM_M284_XOEO<opcode, xo, IOL, asmbase, asmstr>;
let Predicates = [MMA, IsISAFuture] in {
- def PN : XX3Form_AT3_XAp5B6<
- opcode, !xor(xo, 0xD1), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
- !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
- def NP : XX3Form_AT3_XAp5B6<
- opcode, !xor(xo, 0x11), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
- !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
- def NN : XX3Form_AT3_XAp5B6<
- opcode, !xor(xo, 0x88), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
- !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def PN
+ : XX3Form_AT3_XAp5B6<opcode, !xor(xo, 0xD1), (outs dmr:$AT),
+ !con((ins dmr:$ATi), IOL),
+ !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">;
+ def NP
+ : XX3Form_AT3_XAp5B6<opcode, !xor(xo, 0x11), (outs dmr:$AT),
+ !con((ins dmr:$ATi), IOL),
+ !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">;
+ def NN
+ : XX3Form_AT3_XAp5B6<opcode, !xor(xo, 0x88), (outs dmr:$AT),
+ !con((ins dmr:$ATi), IOL),
+ !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">;
}
let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
- def PM#NAME#PN :
- MMIRR_XX3Form_X8Y4P2_XAp5B6<
- opcode, !xor(xo, 0xD1), (outs dmr:$AT),
- !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
- !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"),
- IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
- def PM#NAME#NP :
- MMIRR_XX3Form_X8Y4P2_XAp5B6<
- opcode, !xor(xo, 0x11), (outs dmr:$AT),
- !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
- !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"),
- IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
- def PM#NAME#NN :
- MMIRR_XX3Form_X8Y4P2_XAp5B6<
- opcode, !xor(xo, 0x88), (outs dmr:$AT),
- !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
- !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"),
- IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def PM#NAME#PN
+ : MMIRR_XX3Form_X8Y4P2_XAp5B6<
+ opcode, !xor(xo, 0xD1), (outs dmr:$AT),
+ !con((ins dmr:$ATi),
+ !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+ !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">;
+ def PM#NAME#NP
+ : MMIRR_XX3Form_X8Y4P2_XAp5B6<
+ opcode, !xor(xo, 0x11), (outs dmr:$AT),
+ !con((ins dmr:$ATi),
+ !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+ !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">;
+ def PM#NAME#NN
+ : MMIRR_XX3Form_X8Y4P2_XAp5B6<
+ opcode, !xor(xo, 0x88), (outs dmr:$AT),
+ !con((ins dmr:$ATi),
+ !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+ !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">;
}
}
class XForm_AT3_T1_AB3<bits<6> opcode, bits<5> o, bits<10> xo, dag OOL, dag IOL,
string asmstr, list<dag> pattern>
- : I <opcode, OOL, IOL, asmstr, NoItinerary> {
+ : I<opcode, OOL, IOL, asmstr, NoItinerary> {
bits<3> AT;
bits<3> AB;
bits<1> T;
let Pattern = pattern;
- let Inst{6-8} = AT{2-0};
+ let Inst{6...8} = AT{2...0};
let Inst{9} = 0;
let Inst{10} = T;
- let Inst{11-15} = o;
- let Inst{16-18} = AB{2-0};
- let Inst{19-20} = 0;
- let Inst{21-30} = xo;
+ let Inst{11...15} = o;
+ let Inst{16...18} = AB{2...0};
+ let Inst{19...20} = 0;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
class XForm_ATp2_SR5<bits<6> opcode, bits<5> o, bits<10> xo, dag OOL, dag IOL,
string asmstr, list<dag> pattern>
- : I <opcode, OOL, IOL, asmstr, NoItinerary> {
+ : I<opcode, OOL, IOL, asmstr, NoItinerary> {
bits<2> ATp;
bits<5> SR;
let Pattern = pattern;
- let Inst{6-7} = ATp{1-0};
- let Inst{8-10} = 0;
- let Inst{11-15} = o;
- let Inst{16-20} = SR{4-0};
- let Inst{21-30} = xo;
+ let Inst{6...7} = ATp{1...0};
+ let Inst{8...10} = 0;
+ let Inst{11...15} = o;
+ let Inst{16...20} = SR{4...0};
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
class XX2Form_AT3_XB6_ID2_E1_BL2<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
- string asmstr, list<dag> pattern>
- : I<opcode, OOL, IOL, asmstr, NoItinerary> {
+ string asmstr, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, NoItinerary> {
bits<3> AT;
bits<6> XB;
bits<2> ID;
@@ -379,41 +395,48 @@ class XX2Form_AT3_XB6_ID2_E1_BL2<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
let Pattern = pattern;
- let Inst{6-8} = AT{2-0};
- let Inst{9-10} = 0;
- let Inst{11-12} = ID{1-0};
+ let Inst{6...8} = AT{2...0};
+ let Inst{9...10} = 0;
+ let Inst{11...12} = ID{1...0};
let Inst{13} = E;
- let Inst{14-15} = BL{1-0};
- let Inst{16-20} = XB{4-0};
- let Inst{21-29} = xo;
+ let Inst{14...15} = BL{1...0};
+ let Inst{16...20} = XB{4...0};
+ let Inst{21...29} = xo;
let Inst{30} = XB{5};
let Inst{31} = 0;
}
-let Predicates = [IsISAFuture] in {
- def DMXXEXTFDMR512 : XX3Form_AT3_XABp5_P1<60, 226,
- (outs vsrprc:$XAp, vsrprc:$XBp),
- (ins wacc:$AT),
- "dmxxextfdmr512 $XAp, $XBp, $AT, 0", []> {
+//-------------------------- Instruction definitions -------------------------//
+// Predicate combinations available:
+// [MMA, IsISAFuture]
+// [MMA, PrefixInstrs, IsISAFuture]
+
+let Predicates = [MMA, IsISAFuture] in {
+ def DMXXEXTFDMR512
+ : XX3Form_AT3_XABp5_P1<60, 226, (outs vsrprc:$XAp, vsrprc:$XBp),
+ (ins wacc:$AT),
+ "dmxxextfdmr512 $XAp, $XBp, $AT, 0", []> {
let P = 0;
}
- def DMXXEXTFDMR512_HI : XX3Form_AT3_XABp5_P1<60, 226,
- (outs vsrprc:$XAp, vsrprc:$XBp),
- (ins wacc_hi:$AT),
- "dmxxextfdmr512 $XAp, $XBp, $AT, 1", []> {
+ def DMXXEXTFDMR512_HI
+ : XX3Form_AT3_XABp5_P1<60, 226, (outs vsrprc:$XAp, vsrprc:$XBp),
+ (ins wacc_hi:$AT),
+ "dmxxextfdmr512 $XAp, $XBp, $AT, 1", []> {
let P = 1;
}
- def DMXXINSTDMR512 : XX3Form_AT3_XABp5_P1<60, 234, (outs wacc:$AT),
- (ins vsrprc:$XAp, vsrprc:$XBp),
- "dmxxinstdmr512 $AT, $XAp, $XBp, 0", []> {
+ def DMXXINSTDMR512
+ : XX3Form_AT3_XABp5_P1<60, 234, (outs wacc:$AT),
+ (ins vsrprc:$XAp, vsrprc:$XBp),
+ "dmxxinstdmr512 $AT, $XAp, $XBp, 0", []> {
let P = 0;
}
- def DMXXINSTDMR512_HI : XX3Form_AT3_XABp5_P1<60, 234, (outs wacc_hi:$AT),
- (ins vsrprc:$XAp, vsrprc:$XBp),
- "dmxxinstdmr512 $AT, $XAp, $XBp, 1", []> {
+ def DMXXINSTDMR512_HI
+ : XX3Form_AT3_XABp5_P1<60, 234, (outs wacc_hi:$AT),
+ (ins vsrprc:$XAp, vsrprc:$XBp),
+ "dmxxinstdmr512 $AT, $XAp, $XBp, 1", []> {
let P = 1;
}
@@ -422,236 +445,220 @@ let Predicates = [IsISAFuture] in {
"dmxxextfdmr256 $XBp, $AT, $P", []>;
def DMXXINSTDMR256 : XX2Form_AT3_XBp5_P2<60, 485, (outs dmrrowp:$AT),
- (ins vsrprc:$XBp, u2imm:$P),
- "dmxxinstdmr256 $AT, $XBp, $P", []>;
+ (ins vsrprc:$XBp, u2imm:$P),
+ "dmxxinstdmr256 $AT, $XBp, $P", []>;
- def DMMR : XForm_ATB3<31, 6, 177, (outs dmr:$AT), (ins dmr:$AB),
- "dmmr $AT, $AB",
- [(set v1024i1:$AT, (int_ppc_mma_dmmr v1024i1:$AB))]>;
+ def DMMR
+ : XForm_ATB3<31, 6, 177, (outs dmr:$AT), (ins dmr:$AB), "dmmr $AT, $AB",
+ [(set v1024i1:$AT, (int_ppc_mma_dmmr v1024i1:$AB))]>;
def DMXOR : XForm_ATB3<31, 7, 177, (outs dmr:$AT), (ins dmr:$ATi, dmr:$AB),
"dmxor $AT, $AB",
- [(set v1024i1:$AT, (int_ppc_mma_dmxor v1024i1:$ATi, v1024i1:$AB))]>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
-
- def DMSETDMRZ : XForm_AT3<31, 2, 177, (outs dmr:$AT), (ins),
- "dmsetdmrz $AT", NoItinerary,
- [(set v1024i1:$AT, (int_ppc_mma_dmsetdmrz))]>;
-}
-
-// MMA+ accumulating/non-accumulating instructions.
-
-// DMXVI8GERX4, DMXVI8GERX4PP, PMDMXVI8GERX4, PMDMXVI8GERX4PP
-defm DMXVI8GERX4 : DMR_UM_M448_XOEO<59, 10, (ins vsrprc:$XAp, vsrc:$XB),
- "dmxvi8gerx4", "$AT, $XAp, $XB">;
-
-let Predicates = [MMA, IsISAFuture] in {
- def DMXVI8GERX4SPP :
- XX3Form_AT3_XAp5B6<59, 98, (outs dmr:$AT), (ins dmr:$ATi, vsrprc:$XAp, vsrc:$XB),
- "dmxvi8gerx4spp $AT, $XAp, $XB", IIC_VecGeneral, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
-}
+ [(set v1024i1:$AT, (int_ppc_mma_dmxor v1024i1:$ATi,
+ v1024i1:$AB))]>,
+ RegConstraint<"$ATi = $AT">;
+
+ def DMSETDMRZ
+ : XForm_AT3<31, 2, 177, (outs dmr:$AT), (ins), "dmsetdmrz $AT",
+ NoItinerary, [(set v1024i1:$AT, (int_ppc_mma_dmsetdmrz))]>;
+
+ // DMXVI8GERX4, DMXVI8GERX4PP, PMDMXVI8GERX4, PMDMXVI8GERX4PP
+ defm DMXVI8GERX4 : DMR_UM_M448_XOEO<59, 10, (ins vsrprc:$XAp, vsrc:$XB),
+ "dmxvi8gerx4", "$AT, $XAp, $XB">;
+
+ // DMXVBF16GERX2, DMXVBF16GERX2PP, DMXVBF16GERX2PN, dMXVBF16GERX2NP,
+ // DMXVBF16GERX2NN PMDMXVBF16GERX2, PMDMXVBF16GERX2PP, PMDMXVBF16GERX2PN,
+ // PMDMXVBF16GERX2NP, PMDMXVBF16GERX2NN
+ defm DMXVBF16GERX2
+ : DMR_NEG_UM_M284_XOXORf939a0<59, 74, (ins vsrprc:$XAp, vsrc:$XB),
+ "dmxvbf16gerx2", "$AT, $XAp, $XB">;
+
+ // DMXVF16GERX2, DMXVF16GERX2PP, DMXVF16GERX2PN, dMXVF16GERX2NP,
+ // DMXVF16GERX2NN PMDMXVF16GERX2, PMDMXVF16GERX2PP, PMDMXVF16GERX2PN,
+ // PMDMXVF16GERX2NP, PMDMXVF16GERX2NN
+ defm DMXVF16GERX2
+ : DMR_NEG_UM_M284_XOXORd11188<59, 66, (ins vsrprc:$XAp, vsrc:$XB),
+ "dmxvf16gerx2", "$AT, $XAp, $XB">;
+
+ // DMF cryptography [support] Instructions
+ def DMSHA2HASH
+ : XForm_AT3_T1_AB3<
+ 31, 14, 177, (outs dmr:$AT), (ins dmr:$ATi, dmr:$AB, u1imm:$T),
+ "dmsha2hash $AT, $AB, $T",
+ [(set v1024i1:$AT, (int_ppc_mma_dmsha2hash v1024i1:$ATi,
+ v1024i1:$AB, timm:$T))]>,
+ RegConstraint<"$ATi = $AT">;
+ def DMSHA3HASH
+ : XForm_ATp2_SR5<31, 15, 177, (outs dmrp:$ATp),
+ (ins dmrp:$ATpi, u5imm:$SR), "dmsha3hash $ATp, $SR",
+ [(set v2048i1:$ATp,
+ (int_ppc_mma_dmsha3hash v2048i1:$ATpi, timm:$SR))]>,
+ RegConstraint<"$ATpi = $ATp">;
+ def DMXXSHAPAD
+ : XX2Form_AT3_XB6_ID2_E1_BL2<60, 421, (outs dmr:$AT),
+ (ins dmr:$ATi, vsrc:$XB, u2imm:$ID, u1imm:$E,
+ u2imm:$BL),
+ "dmxxshapad $AT, $XB, $ID, $E, $BL", []>,
+ RegConstraint<"$ATi = $AT">;
+
+ // MMA+ accumulating/non-accumulating instructions.
+ def DMXVI8GERX4SPP
+ : XX3Form_AT3_XAp5B6<59, 98, (outs dmr:$AT),
+ (ins dmr:$ATi, vsrprc:$XAp, vsrc:$XB),
+ "dmxvi8gerx4spp $AT, $XAp, $XB", IIC_VecGeneral, []>,
+ RegConstraint<"$ATi = $AT">;
+
+} // End of [MMA, IsISAFuture]
let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
- def PMDMXVI8GERX4SPP :
- MMIRR_XX3Form_X8YP4_XAp5B6<59, 98, (outs dmr:$AT),
- (ins dmr:$ATi, vsrprc:$XAp,vsrc:$XB, u8imm:$XMSK,
- u4imm:$YMSK, u4imm:$PMSK),
- "pmdmxvi8gerx4spp $AT, $XAp, $XB, $XMSK, $YMSK, $PMSK",
- IIC_VecGeneral, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def PMDMXVI8GERX4SPP // DMXVI8GERX4SPP operands plus $XMSK/$YMSK/$PMSK.
+ : MMIRR_XX3Form_X8YP4_XAp5B6<
+ 59, 98, (outs dmr:$AT),
+ (ins dmr:$ATi, vsrprc:$XAp, vsrc:$XB, u8imm:$XMSK, u4imm:$YMSK,
+ u4imm:$PMSK),
+ "pmdmxvi8gerx4spp $AT, $XAp, $XB, $XMSK, $YMSK, $PMSK",
+ IIC_VecGeneral, []>,
+ RegConstraint<"$ATi = $AT">; // $ATi is tied to the $AT result.
} // End of [MMA, PrefixInstrs, IsISAFuture]
-// DMXVBF16GERX2, DMXVBF16GERX2PP, DMXVBF16GERX2PN, dMXVBF16GERX2NP, DMXVBF16GERX2NN
-// PMDMXVBF16GERX2, PMDMXVBF16GERX2PP, PMDMXVBF16GERX2PN, PMDMXVBF16GERX2NP, PMDMXVBF16GERX2NN
-defm DMXVBF16GERX2 : DMR_NEG_UM_M284_XOXORf939a0<59, 74, (ins vsrprc:$XAp, vsrc:$XB),
- "dmxvbf16gerx2", "$AT, $XAp, $XB">;
-
-// DMXVF16GERX2, DMXVF16GERX2PP, DMXVF16GERX2PN, dMXVF16GERX2NP, DMXVF16GERX2NN
-// PMDMXVF16GERX2, PMDMXVF16GERX2PP, PMDMXVF16GERX2PN, PMDMXVF16GERX2NP, PMDMXVF16GERX2NN
-defm DMXVF16GERX2 : DMR_NEG_UM_M284_XOXORd11188<59, 66, (ins vsrprc:$XAp, vsrc:$XB),
- "dmxvf16gerx2", "$AT, $XAp, $XB">;
-
-// DMF cryptography [support] Instructions
-let Predicates = [IsISAFuture] in {
- def DMSHA2HASH :
- XForm_AT3_T1_AB3<31, 14, 177, (outs dmr:$AT), (ins dmr:$ATi, dmr:$AB, u1imm:$T),
- "dmsha2hash $AT, $AB, $T",
- [(set v1024i1:$AT, (int_ppc_mma_dmsha2hash v1024i1:$ATi, v1024i1:$AB, timm:$T))]>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
-
- def DMSHA3HASH :
- XForm_ATp2_SR5<31, 15, 177, (outs dmrp:$ATp), (ins dmrp:$ATpi , u5imm:$SR),
- "dmsha3hash $ATp, $SR",
- [(set v2048i1:$ATp, (int_ppc_mma_dmsha3hash v2048i1:$ATpi, timm:$SR))]>,
- RegConstraint<"$ATpi = $ATp">, NoEncode<"$ATpi">;
-
- def DMXXSHAPAD :
- XX2Form_AT3_XB6_ID2_E1_BL2<60, 421, (outs dmr:$AT),
- (ins dmr:$ATi, vsrc:$XB, u2imm:$ID, u1imm:$E, u2imm:$BL),
- "dmxxshapad $AT, $XB, $ID, $E, $BL", []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
-}
+//---------------------------- Anonymous Patterns ----------------------------//
+// Predicate combinations available:
+// [MMA, IsISAFuture]
+// [MMA, PrefixInstrs, IsISAFuture]
-// MMA+ Intrinsics
let Predicates = [MMA, IsISAFuture] in {
+ // MMA+ intrinsics, each selected to the instruction of the same name.
def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4 v256i1:$XAp, v16i8:$XB)),
(DMXVI8GERX4 $XAp, RCCp.BToVSRC)>;
- def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+ def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4pp v1024i1:$ATi, v256i1:$XAp,
+ v16i8:$XB)),
(DMXVI8GERX4PP $ATi, $XAp, RCCp.BToVSRC)>;
-
- def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4spp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+ def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4spp v1024i1:$ATi, v256i1:$XAp,
+ v16i8:$XB)),
(DMXVI8GERX4SPP $ATi, $XAp, RCCp.BToVSRC)>;
-
def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2 v256i1:$XAp, v16i8:$XB)),
(DMXVBF16GERX2 $XAp, RCCp.BToVSRC)>;
-
- def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+ def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2pp v1024i1:$ATi, v256i1:$XAp,
+ v16i8:$XB)),
(DMXVBF16GERX2PP $ATi, $XAp, RCCp.BToVSRC)>;
-
- def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2pn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+ def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2pn v1024i1:$ATi, v256i1:$XAp,
+ v16i8:$XB)),
(DMXVBF16GERX2PN $ATi, $XAp, RCCp.BToVSRC)>;
-
- def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2np v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+ def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2np v1024i1:$ATi, v256i1:$XAp,
+ v16i8:$XB)),
(DMXVBF16GERX2NP $ATi, $XAp, RCCp.BToVSRC)>;
-
- def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2nn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+ def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2nn v1024i1:$ATi, v256i1:$XAp,
+ v16i8:$XB)),
(DMXVBF16GERX2NN $ATi, $XAp, RCCp.BToVSRC)>;
-
def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2 v256i1:$XAp, v16i8:$XB)),
(DMXVF16GERX2 $XAp, RCCp.BToVSRC)>;
-
- def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+ def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2pp v1024i1:$ATi, v256i1:$XAp,
+ v16i8:$XB)),
(DMXVF16GERX2PP $ATi, $XAp, RCCp.BToVSRC)>;
-
- def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2pn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+ def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2pn v1024i1:$ATi, v256i1:$XAp,
+ v16i8:$XB)),
(DMXVF16GERX2PN $ATi, $XAp, RCCp.BToVSRC)>;
-
- def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2np v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+ def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2np v1024i1:$ATi, v256i1:$XAp,
+ v16i8:$XB)),
(DMXVF16GERX2NP $ATi, $XAp, RCCp.BToVSRC)>;
-
- def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2nn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+ def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2nn v1024i1:$ATi, v256i1:$XAp,
+ v16i8:$XB)),
(DMXVF16GERX2NN $ATi, $XAp, RCCp.BToVSRC)>;
+
+ // Cryptography intrinsic: $ID, $E and $BL are forwarded as immediates.
+ def : Pat<(v1024i1 (int_ppc_mma_dmxxshapad v1024i1:$ATi, v16i8:$XB, timm:$ID,
+ timm:$E, timm:$BL)),
+ (DMXXSHAPAD $ATi, RCCp.BToVSRC, $ID, $E, $BL)>;
} // End of [MMA, IsISAFuture]
let Predicates = [MMA, PrefixInstrs, IsISAFuture] in { // pm* intrinsics.
- def : Pat<(v1024i1 (int_ppc_mma_pmdmxvi8gerx4 v256i1:$XAp, v16i8:$XB, Msk8Imm:$XMSK,
- Msk4Imm:$YMSK, Msk4Imm:$PMSK)),
- (PMDMXVI8GERX4 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
- Msk4Imm:$YMSK, Msk4Imm:$PMSK)>;
-
- def : Pat<(v1024i1 (int_ppc_mma_pmdmxvi8gerx4pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
- Msk8Imm:$XMSK, Msk4Imm:$YMSK,
- Msk4Imm:$PMSK)),
+ def : Pat<(v1024i1 (int_ppc_mma_pmdmxvi8gerx4 v256i1:$XAp, v16i8:$XB,
+ Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk4Imm:$PMSK)),
+ (PMDMXVI8GERX4 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk4Imm:$PMSK)>;
+
+ def : Pat<(v1024i1 (int_ppc_mma_pmdmxvi8gerx4pp v1024i1:$ATi, v256i1:$XAp,
+ v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk4Imm:$PMSK)),
(PMDMXVI8GERX4PP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
- Msk4Imm:$YMSK, Msk4Imm:$PMSK)>;
+ Msk4Imm:$YMSK, Msk4Imm:$PMSK)>;
- def : Pat<(v1024i1 (int_ppc_mma_pmdmxvi8gerx4spp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
- Msk8Imm:$XMSK, Msk4Imm:$YMSK,
- Msk4Imm:$PMSK)),
+ def : Pat<(v1024i1 (int_ppc_mma_pmdmxvi8gerx4spp v1024i1:$ATi, v256i1:$XAp,
+ v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk4Imm:$PMSK)),
(PMDMXVI8GERX4SPP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
- Msk4Imm:$YMSK, Msk4Imm:$PMSK)>;
+ Msk4Imm:$YMSK, Msk4Imm:$PMSK)>;
- def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2 v256i1:$XAp, v16i8:$XB, Msk8Imm:$XMSK,
- Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
- (PMDMXVBF16GERX2 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
- Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2 v256i1:$XAp, v16i8:$XB,
+ Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
+ (PMDMXVBF16GERX2 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk2Imm:$PMSK)>;
- def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
- Msk8Imm:$XMSK, Msk4Imm:$YMSK,
- Msk2Imm:$PMSK)),
+ def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2pp v1024i1:$ATi, v256i1:$XAp,
+ v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
(PMDMXVBF16GERX2PP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
- Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
- def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2pn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
- Msk8Imm:$XMSK, Msk4Imm:$YMSK,
- Msk2Imm:$PMSK)),
+ def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2pn v1024i1:$ATi, v256i1:$XAp,
+ v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
(PMDMXVBF16GERX2PN $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
- Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
- def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2np v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
- Msk8Imm:$XMSK, Msk4Imm:$YMSK,
- Msk2Imm:$PMSK)),
+ def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2np v1024i1:$ATi, v256i1:$XAp,
+ v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
(PMDMXVBF16GERX2NP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
- Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
- def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2nn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
- Msk8Imm:$XMSK, Msk4Imm:$YMSK,
- Msk2Imm:$PMSK)),
+ def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2nn v1024i1:$ATi, v256i1:$XAp,
+ v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
(PMDMXVBF16GERX2NN $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
- Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
- def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2 v256i1:$XAp, v16i8:$XB, Msk8Imm:$XMSK,
- Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
- (PMDMXVF16GERX2 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
- Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2 v256i1:$XAp, v16i8:$XB,
+ Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
+ (PMDMXVF16GERX2 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk2Imm:$PMSK)>;
- def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
- Msk8Imm:$XMSK, Msk4Imm:$YMSK,
- Msk2Imm:$PMSK)),
+ def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2pp v1024i1:$ATi, v256i1:$XAp,
+ v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
(PMDMXVF16GERX2PP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
- Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
- def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2pn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
- Msk8Imm:$XMSK, Msk4Imm:$YMSK,
- Msk2Imm:$PMSK)),
+ def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2pn v1024i1:$ATi, v256i1:$XAp,
+ v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
(PMDMXVF16GERX2PN $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
- Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
- def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2np v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
- Msk8Imm:$XMSK, Msk4Imm:$YMSK,
- Msk2Imm:$PMSK)),
+ def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2np v1024i1:$ATi, v256i1:$XAp,
+ v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
(PMDMXVF16GERX2NP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
- Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
- def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2nn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
- Msk8Imm:$XMSK, Msk4Imm:$YMSK,
- Msk2Imm:$PMSK)),
+ def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2nn v1024i1:$ATi, v256i1:$XAp,
+ v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
(PMDMXVF16GERX2NN $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
- Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
-}
-
-// Cryptography Intrinsic
-let Predicates = [IsISAFuture] in {
- def : Pat<(v1024i1 (int_ppc_mma_dmxxshapad v1024i1:$ATi, v16i8:$XB, timm:$ID,
- timm:$E, timm:$BL)), (DMXXSHAPAD $ATi, RCCp.BToVSRC, $ID, $E, $BL)>;
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
} // End of [MMA, PrefixInstrs, IsISAFuture]
-// MMA+ Instruction aliases
-let Predicates = [IsISAFuture] in {
- def : InstAlias<"dmsha256hash $AT, $AB",
- (DMSHA2HASH dmr:$AT, dmr:$AB, 0)>;
+//---------------------------- Instruction aliases ---------------------------//
- def : InstAlias<"dmsha512hash $AT, $AB",
- (DMSHA2HASH dmr:$AT, dmr:$AB, 1)>;
-
- def : InstAlias<"dmsha3dw $ATp",
- (DMSHA3HASH dmrp:$ATp, 0)>;
-
- def : InstAlias<"dmcryshash $ATp",
- (DMSHA3HASH dmrp:$ATp, 12)>;
-
- def : InstAlias<"dmxxsha3512pad $AT, $XB, $E",
- (DMXXSHAPAD dmr:$AT, vsrc:$XB, 0, u1imm:$E, 0)>;
-
- def : InstAlias<"dmxxsha3384pad $AT, $XB, $E",
- (DMXXSHAPAD dmr:$AT, vsrc:$XB, 0, u1imm:$E, 1)>;
-
- def : InstAlias<"dmxxsha3256pad $AT, $XB, $E",
- (DMXXSHAPAD dmr:$AT, vsrc:$XB, 0, u1imm:$E, 2)>;
-
- def : InstAlias<"dmxxsha3224pad $AT, $XB, $E",
- (DMXXSHAPAD dmr:$AT, vsrc:$XB, 0, u1imm:$E, 3)>;
-
- def : InstAlias<"dmxxshake256pad $AT, $XB, $E",
- (DMXXSHAPAD dmr:$AT, vsrc:$XB, 1, u1imm:$E, 0)>;
-
- def : InstAlias<"dmxxshake128pad $AT, $XB, $E",
- (DMXXSHAPAD dmr:$AT, vsrc:$XB, 1, u1imm:$E, 1)>;
-
- def : InstAlias<"dmxxsha384512pad $AT, $XB",
- (DMXXSHAPAD dmr:$AT, vsrc:$XB, 2, 0, 0)>;
-
- def : InstAlias<"dmxxsha224256pad $AT, $XB",
- (DMXXSHAPAD dmr:$AT, vsrc:$XB, 3, 0, 0)>;
+let Predicates = [MMA, IsISAFuture] in { // Mnemonics with fixed immediates.
+ def : InstAlias<"dmsha256hash $AT, $AB", (DMSHA2HASH dmr:$AT, dmr:$AB, 0)>;
+ def : InstAlias<"dmsha512hash $AT, $AB", (DMSHA2HASH dmr:$AT, dmr:$AB, 1)>;
+ def : InstAlias<"dmsha3dw $ATp", (DMSHA3HASH dmrp:$ATp, 0)>;
+ def : InstAlias<"dmcryshash $ATp", (DMSHA3HASH dmrp:$ATp, 12)>;
+ def : InstAlias<"dmxxsha3512pad $AT, $XB, $E", (DMXXSHAPAD dmr:$AT, vsrc:$XB,
+ 0, u1imm:$E, 0)>; // DMXXSHAPAD immediates are $ID, $E, $BL in that order.
+ def : InstAlias<"dmxxsha3384pad $AT, $XB, $E", (DMXXSHAPAD dmr:$AT, vsrc:$XB,
+ 0, u1imm:$E, 1)>;
+ def : InstAlias<"dmxxsha3256pad $AT, $XB, $E", (DMXXSHAPAD dmr:$AT, vsrc:$XB,
+ 0, u1imm:$E, 2)>;
+ def : InstAlias<"dmxxsha3224pad $AT, $XB, $E", (DMXXSHAPAD dmr:$AT, vsrc:$XB,
+ 0, u1imm:$E, 3)>;
+ def : InstAlias<"dmxxshake256pad $AT, $XB, $E", (DMXXSHAPAD dmr:$AT, vsrc:$XB,
+ 1, u1imm:$E, 0)>;
+ def : InstAlias<"dmxxshake128pad $AT, $XB, $E", (DMXXSHAPAD dmr:$AT, vsrc:$XB,
+ 1, u1imm:$E, 1)>;
+ def : InstAlias<"dmxxsha384512pad $AT, $XB", (DMXXSHAPAD dmr:$AT, vsrc:$XB, 2,
+ 0, 0)>; // These two forms take no $E operand; it is fixed to 0.
+ def : InstAlias<"dmxxsha224256pad $AT, $XB", (DMXXSHAPAD dmr:$AT, vsrc:$XB, 3,
+ 0, 0)>;
} // End of [MMA, IsISAFuture]
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 7c1550e99bae..db066bc4b7bd 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -30,6 +30,7 @@
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/StackMaps.h"
@@ -87,8 +88,8 @@ static cl::opt<bool> EnableFMARegPressureReduction(
// Pin the vtable to this file.
void PPCInstrInfo::anchor() {}
-PPCInstrInfo::PPCInstrInfo(PPCSubtarget &STI)
- : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP,
+PPCInstrInfo::PPCInstrInfo(const PPCSubtarget &STI) // STI is kept by reference.
+ : PPCGenInstrInfo(STI, PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP,
/* CatchRetOpcode */ -1,
STI.isPPC64() ? PPC::BLR8 : PPC::BLR), // BLR8 when the subtarget is PPC64.
Subtarget(STI), RI(STI.getTargetMachine()) {}
@@ -1863,6 +1864,48 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addReg(SrcRegSub1)
.addReg(SrcRegSub1, getKillRegState(KillSrc));
return;
+ } else if ((PPC::WACCRCRegClass.contains(DestReg) ||
+ PPC::WACC_HIRCRegClass.contains(DestReg)) &&
+ (PPC::WACCRCRegClass.contains(SrcReg) ||
+ PPC::WACC_HIRCRegClass.contains(SrcReg))) {
+
+ Opc = PPC::WACCRCRegClass.contains(SrcReg) ? PPC::DMXXEXTFDMR512
+ : PPC::DMXXEXTFDMR512_HI;
+
+ RegScavenger RS;
+ RS.enterBasicBlockEnd(MBB);
+ RS.backward(std::next(I));
+
+ Register TmpReg1 = RS.scavengeRegisterBackwards(PPC::VSRpRCRegClass, I,
+ /* RestoreAfter */ false, 0,
+ /* AllowSpill */ false);
+
+ RS.setRegUsed(TmpReg1);
+ Register TmpReg2 = RS.scavengeRegisterBackwards(PPC::VSRpRCRegClass, I,
+ /* RestoreAfter */ false, 0,
+ /* AllowSpill */ false);
+
+ BuildMI(MBB, I, DL, get(Opc))
+ .addReg(TmpReg1, RegState::Define)
+ .addReg(TmpReg2, RegState::Define)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+
+ Opc = PPC::WACCRCRegClass.contains(DestReg) ? PPC::DMXXINSTDMR512
+ : PPC::DMXXINSTDMR512_HI;
+
+ BuildMI(MBB, I, DL, get(Opc), DestReg)
+ .addReg(TmpReg1, RegState::Kill)
+ .addReg(TmpReg2, RegState::Kill);
+
+ return;
+ } else if (PPC::DMRRCRegClass.contains(DestReg) &&
+ PPC::DMRRCRegClass.contains(SrcReg)) {
+
+ BuildMI(MBB, I, DL, get(PPC::DMMR), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+
+ return;
+
} else
llvm_unreachable("Impossible reg-to-reg copy");
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
index 7931a9e3ae13..63ebd6591057 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -279,7 +279,7 @@ enum PPCMachineCombinerPattern : unsigned {
class PPCSubtarget;
class PPCInstrInfo : public PPCGenInstrInfo {
- PPCSubtarget &Subtarget;
+ const PPCSubtarget &Subtarget;
const PPCRegisterInfo RI;
const unsigned StoreSpillOpcodesArray[4][SOK_LastOpcodeSpill] =
StoreOpcodesForSpill;
@@ -369,7 +369,7 @@ protected:
unsigned OpIdx2) const override;
public:
- explicit PPCInstrInfo(PPCSubtarget &STI);
+ explicit PPCInstrInfo(const PPCSubtarget &STI);
bool isLoadFromConstantPool(MachineInstr *I) const;
const Constant *getConstantFromConstantPool(MachineInstr *I) const;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index c2f91ce8e6b9..c12cf8511312 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -58,6 +58,10 @@ def SDT_PPCVecShift : SDTypeProfile<1, 3, [ SDTCisVec<0>,
SDTCisVec<1>, SDTCisVec<2>, SDTCisPtrTy<3>
]>;
+def SDT_PPCVecShiftQuad : SDTypeProfile<1, 2, [ // 1 result, 2 operands.
+ SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2> // All share one vector type.
+]>;
+
def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>,
SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
]>;
@@ -157,6 +161,8 @@ def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>;
def PPCfctiduz: SDNode<"PPCISD::FCTIDUZ",SDTFPUnaryOp, []>;
def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>;
+def PPCvsrq: SDNode<"PPCISD::VSRQ", SDT_PPCVecShiftQuad, []>; // No node properties.
+
def PPCstrict_fcfid : SDNode<"PPCISD::STRICT_FCFID",
SDTFPUnaryOp, [SDNPHasChain]>;
def PPCstrict_fcfidu : SDNode<"PPCISD::STRICT_FCFIDU",
@@ -665,9 +671,6 @@ class isRecordForm { bit RC = 1; }
class RegConstraint<string C> {
string Constraints = C;
}
-class NoEncode<string E> {
- string DisableEncoding = E;
-}
// Define PowerPC specific addressing mode.
@@ -1989,29 +1992,24 @@ def LBZU : DForm_1<35, (outs gprc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D,
def LHAU : DForm_1<43, (outs gprc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D, $RA):$addr),
"lhau $RST, $addr", IIC_LdStLHAU,
- []>, RegConstraint<"$addr.reg = $ea_result">,
- NoEncode<"$ea_result">;
+ []>, RegConstraint<"$addr.reg = $ea_result">;
def LHZU : DForm_1<41, (outs gprc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D, $RA):$addr),
"lhzu $RST, $addr", IIC_LdStLoadUpd,
- []>, RegConstraint<"$addr.reg = $ea_result">,
- NoEncode<"$ea_result">;
+ []>, RegConstraint<"$addr.reg = $ea_result">;
def LWZU : DForm_1<33, (outs gprc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D, $RA):$addr),
"lwzu $RST, $addr", IIC_LdStLoadUpd,
- []>, RegConstraint<"$addr.reg = $ea_result">,
- NoEncode<"$ea_result">;
+ []>, RegConstraint<"$addr.reg = $ea_result">;
let Predicates = [HasFPU] in {
def LFSU : DForm_1<49, (outs f4rc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D, $RA):$addr),
"lfsu $RST, $addr", IIC_LdStLFDU,
- []>, RegConstraint<"$addr.reg = $ea_result">,
- NoEncode<"$ea_result">;
+ []>, RegConstraint<"$addr.reg = $ea_result">;
def LFDU : DForm_1<51, (outs f8rc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D, $RA):$addr),
"lfdu $RST, $addr", IIC_LdStLFDU,
- []>, RegConstraint<"$addr.reg = $ea_result">,
- NoEncode<"$ea_result">;
+ []>, RegConstraint<"$addr.reg = $ea_result">;
}
@@ -2019,39 +2017,33 @@ def LFDU : DForm_1<51, (outs f8rc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D,
def LBZUX : XForm_1_memOp<31, 119, (outs gprc:$RST, ptr_rc_nor0:$ea_result),
(ins (memrr $RA, $RB):$addr),
"lbzux $RST, $addr", IIC_LdStLoadUpdX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">;
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">;
def LHAUX : XForm_1_memOp<31, 375, (outs gprc:$RST, ptr_rc_nor0:$ea_result),
(ins (memrr $RA, $RB):$addr),
"lhaux $RST, $addr", IIC_LdStLHAUX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">;
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">;
def LHZUX : XForm_1_memOp<31, 311, (outs gprc:$RST, ptr_rc_nor0:$ea_result),
(ins (memrr $RA, $RB):$addr),
"lhzux $RST, $addr", IIC_LdStLoadUpdX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">;
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">;
def LWZUX : XForm_1_memOp<31, 55, (outs gprc:$RST, ptr_rc_nor0:$ea_result),
(ins (memrr $RA, $RB):$addr),
"lwzux $RST, $addr", IIC_LdStLoadUpdX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">;
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">;
let Predicates = [HasFPU] in {
def LFSUX : XForm_1_memOp<31, 567, (outs f4rc:$RST, ptr_rc_nor0:$ea_result),
(ins (memrr $RA, $RB):$addr),
"lfsux $RST, $addr", IIC_LdStLFDUX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">;
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">;
def LFDUX : XForm_1_memOp<31, 631, (outs f8rc:$RST, ptr_rc_nor0:$ea_result),
(ins (memrr $RA, $RB):$addr),
"lfdux $RST, $addr", IIC_LdStLFDUX,
- []>, RegConstraint<"$addr.ptrreg = $ea_result">,
- NoEncode<"$ea_result">;
+ []>, RegConstraint<"$addr.ptrreg = $ea_result">;
}
}
}
@@ -2132,20 +2124,20 @@ def STFD : DForm_1<54, (outs), (ins f8rc:$RST, (memri $D, $RA):$dst),
let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
def STBU : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins gprc:$RST, (memri $D, $RA):$dst),
"stbu $RST, $dst", IIC_LdStSTU, []>,
- RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+ RegConstraint<"$dst.reg = $ea_res">;
def STHU : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins gprc:$RST, (memri $D, $RA):$dst),
"sthu $RST, $dst", IIC_LdStSTU, []>,
- RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+ RegConstraint<"$dst.reg = $ea_res">;
def STWU : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins gprc:$RST, (memri $D, $RA):$dst),
"stwu $RST, $dst", IIC_LdStSTU, []>,
- RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+ RegConstraint<"$dst.reg = $ea_res">;
let Predicates = [HasFPU] in {
def STFSU : DForm_1<53, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$RST, (memri $D, $RA):$dst),
"stfsu $RST, $dst", IIC_LdStSTFDU, []>,
- RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+ RegConstraint<"$dst.reg = $ea_res">;
def STFDU : DForm_1<55, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$RST, (memri $D, $RA):$dst),
"stfdu $RST, $dst", IIC_LdStSTFDU, []>,
- RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+ RegConstraint<"$dst.reg = $ea_res">;
}
}
@@ -2207,32 +2199,27 @@ def STBUX : XForm_8_memOp<31, 247, (outs ptr_rc_nor0:$ea_res),
(ins gprc:$RST, (memrr $RA, $RB):$addr),
"stbux $RST, $addr", IIC_LdStSTUX, []>,
RegConstraint<"$addr.ptrreg = $ea_res">,
- NoEncode<"$ea_res">,
PPC970_DGroup_Cracked;
def STHUX : XForm_8_memOp<31, 439, (outs ptr_rc_nor0:$ea_res),
(ins gprc:$RST, (memrr $RA, $RB):$addr),
"sthux $RST, $addr", IIC_LdStSTUX, []>,
RegConstraint<"$addr.ptrreg = $ea_res">,
- NoEncode<"$ea_res">,
PPC970_DGroup_Cracked;
def STWUX : XForm_8_memOp<31, 183, (outs ptr_rc_nor0:$ea_res),
(ins gprc:$RST, (memrr $RA, $RB):$addr),
"stwux $RST, $addr", IIC_LdStSTUX, []>,
RegConstraint<"$addr.ptrreg = $ea_res">,
- NoEncode<"$ea_res">,
PPC970_DGroup_Cracked;
let Predicates = [HasFPU] in {
def STFSUX: XForm_8_memOp<31, 695, (outs ptr_rc_nor0:$ea_res),
(ins f4rc:$RST, (memrr $RA, $RB):$addr),
"stfsux $RST, $addr", IIC_LdStSTFDU, []>,
RegConstraint<"$addr.ptrreg = $ea_res">,
- NoEncode<"$ea_res">,
PPC970_DGroup_Cracked;
def STFDUX: XForm_8_memOp<31, 759, (outs ptr_rc_nor0:$ea_res),
(ins f8rc:$RST, (memrr $RA, $RB):$addr),
"stfdux $RST, $addr", IIC_LdStSTFDU, []>,
RegConstraint<"$addr.ptrreg = $ea_res">,
- NoEncode<"$ea_res">,
PPC970_DGroup_Cracked;
}
}
@@ -3099,7 +3086,7 @@ defm RLWIMI : MForm_2r<20, (outs gprc:$RA),
(ins gprc:$RAi, gprc:$RS, u5imm:$SH, u5imm:$MB,
u5imm:$ME), "rlwimi", "$RA, $RS, $SH, $MB, $ME",
IIC_IntRotate, []>, PPC970_DGroup_Cracked,
- RegConstraint<"$RAi = $RA">, NoEncode<"$RAi">;
+ RegConstraint<"$RAi = $RA">;
}
let BaseName = "rlwinm" in {
def RLWINM : MForm_2<21,
@@ -3235,9 +3222,10 @@ def PPC32GOT: PPCEmitTimePseudo<(outs gprc:$rD), (ins), "#PPC32GOT",
// Get the _GLOBAL_OFFSET_TABLE_ in PIC mode.
// This uses two output registers, the first as the real output, the second as a
-// temporary register, used internally in code generation.
+// temporary register, used internally in code generation. A "bl" also clobbers LR.
+let Defs = [LR] in
def PPC32PICGOT: PPCEmitTimePseudo<(outs gprc:$rD, gprc:$rT), (ins), "#PPC32PICGOT",
- []>, NoEncode<"$rT">;
+ []>;
def LDgotTprelL32: PPCEmitTimePseudo<(outs gprc_nor0:$rD), (ins s16imm:$disp, gprc_nor0:$reg),
"#LDgotTprelL32",
@@ -4287,7 +4275,7 @@ def WRTEEI: I<31, (outs), (ins i1imm:$E), "wrteei $E", IIC_SprMTMSR>,
bits<1> E;
let Inst{16} = E;
- let Inst{21-30} = 163;
+ let Inst{21...30} = 163;
}
def DCCCI : XForm_tlb<454, (outs), (ins gprc:$RA, gprc:$RB),
@@ -4967,44 +4955,44 @@ defm : BranchSimpleMnemonic1<"dzf", "", 2>;
multiclass BranchExtendedMnemonicPM<string name, string pm, int bibo> { // Aliases without $cc use CR0.
def : InstAlias<"b"#name#pm#" $cc, $dst",
- (BCC bibo, crrc:$cc, condbrtarget:$dst)>;
+ (BCC (pred bibo, crrc:$cc), condbrtarget:$dst)>;
def : InstAlias<"b"#name#pm#" $dst",
- (BCC bibo, CR0, condbrtarget:$dst)>;
+ (BCC (pred bibo, CR0), condbrtarget:$dst)>;
def : InstAlias<"b"#name#"a"#pm#" $cc, $dst",
- (BCCA bibo, crrc:$cc, abscondbrtarget:$dst)>;
+ (BCCA (pred bibo, crrc:$cc), abscondbrtarget:$dst)>;
def : InstAlias<"b"#name#"a"#pm#" $dst",
- (BCCA bibo, CR0, abscondbrtarget:$dst)>;
+ (BCCA (pred bibo, CR0), abscondbrtarget:$dst)>;
def : InstAlias<"b"#name#"lr"#pm#" $cc",
- (BCCLR bibo, crrc:$cc)>;
+ (BCCLR (pred bibo, crrc:$cc))>;
def : InstAlias<"b"#name#"lr"#pm,
- (BCCLR bibo, CR0)>;
+ (BCCLR (pred bibo, CR0))>;
def : InstAlias<"b"#name#"ctr"#pm#" $cc",
- (BCCCTR bibo, crrc:$cc)>;
+ (BCCCTR (pred bibo, crrc:$cc))>;
def : InstAlias<"b"#name#"ctr"#pm,
- (BCCCTR bibo, CR0)>;
+ (BCCCTR (pred bibo, CR0))>;
def : InstAlias<"b"#name#"l"#pm#" $cc, $dst",
- (BCCL bibo, crrc:$cc, condbrtarget:$dst)>;
+ (BCCL (pred bibo, crrc:$cc), condbrtarget:$dst)>;
def : InstAlias<"b"#name#"l"#pm#" $dst",
- (BCCL bibo, CR0, condbrtarget:$dst)>;
+ (BCCL (pred bibo, CR0), condbrtarget:$dst)>;
def : InstAlias<"b"#name#"la"#pm#" $cc, $dst",
- (BCCLA bibo, crrc:$cc, abscondbrtarget:$dst)>;
+ (BCCLA (pred bibo, crrc:$cc), abscondbrtarget:$dst)>;
def : InstAlias<"b"#name#"la"#pm#" $dst",
- (BCCLA bibo, CR0, abscondbrtarget:$dst)>;
+ (BCCLA (pred bibo, CR0), abscondbrtarget:$dst)>;
def : InstAlias<"b"#name#"lrl"#pm#" $cc",
- (BCCLRL bibo, crrc:$cc)>;
+ (BCCLRL (pred bibo, crrc:$cc))>;
def : InstAlias<"b"#name#"lrl"#pm,
- (BCCLRL bibo, CR0)>;
+ (BCCLRL (pred bibo, CR0))>;
def : InstAlias<"b"#name#"ctrl"#pm#" $cc",
- (BCCCTRL bibo, crrc:$cc)>;
+ (BCCCTRL (pred bibo, crrc:$cc))>;
def : InstAlias<"b"#name#"ctrl"#pm,
- (BCCCTRL bibo, CR0)>;
+ (BCCCTRL (pred bibo, CR0))>;
} // End of BranchExtendedMnemonicPM
multiclass BranchExtendedMnemonic<string name, int bibo> {
defm : BranchExtendedMnemonicPM<name, "", bibo>;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrMMA.td b/llvm/lib/Target/PowerPC/PPCInstrMMA.td
index 436715a0e4ab..b38dd4ae948c 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrMMA.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrMMA.td
@@ -14,7 +14,7 @@ multiclass ACC_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
def PP :
XX3Form_AT3_XAB6<opcode, xo, (outs acc:$AT), !con((ins acc:$ATi), IOL),
!strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in {
def NAME#W :
@@ -24,7 +24,7 @@ multiclass ACC_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
def WPP :
XX3Form_AT3_XAB6<opcode, xo, (outs wacc:$AT), !con((ins wacc:$ATi), IOL),
!strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
}
@@ -48,7 +48,7 @@ multiclass ACC_UM_M844_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
!con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u8imm:$PMSK))),
!strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in {
def PM#NAME#W :
@@ -65,7 +65,7 @@ multiclass ACC_UM_M844_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
!con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u8imm:$PMSK))),
!strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
}
@@ -89,7 +89,7 @@ multiclass ACC_UM_M444_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
!con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK))),
!strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in {
def PM#NAME#W :
@@ -106,7 +106,7 @@ multiclass ACC_UM_M444_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
!con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK))),
!strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
}
@@ -129,7 +129,7 @@ multiclass ACC_UM_M244_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
!con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
!strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in {
def PM#NAME#W :
@@ -145,7 +145,7 @@ multiclass ACC_UM_M244_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
!con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
!strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
}
@@ -162,7 +162,7 @@ multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
XX3Form_AT3_XAB6<
opcode, !or(xo, 0x20), (outs acc:$AT), !con((ins acc:$ATi), IOL),
!strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
let Predicates = [MMA, PrefixInstrs, IsNotISAFuture] in {
def PM#NAME :
@@ -179,7 +179,7 @@ multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
!con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
!strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in {
def NAME#W :
@@ -190,7 +190,7 @@ multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
XX3Form_AT3_XAB6<
opcode, !or(xo, 0x20), (outs wacc:$AT), !con((ins wacc:$ATi), IOL),
!strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in {
def PM#NAME#W :
@@ -207,7 +207,7 @@ multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
!con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
!strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
}
@@ -220,29 +220,29 @@ multiclass ACC_NEG_UM_M244_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
def PN : XX3Form_AT3_XAB6<
opcode, !or(xo, 0x80), (outs acc:$AT), !con((ins acc:$ATi), IOL),
!strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def NP : XX3Form_AT3_XAB6<
opcode, !or(xo, 0x40), (outs acc:$AT), !con((ins acc:$ATi), IOL),
!strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def NN : XX3Form_AT3_XAB6<
opcode, !or(xo, 0xC0), (outs acc:$AT), !con((ins acc:$ATi), IOL),
!strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in {
def WPN : XX3Form_AT3_XAB6<
opcode, !or(xo, 0x80), (outs wacc:$AT), !con((ins wacc:$ATi), IOL),
!strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def WNP : XX3Form_AT3_XAB6<
opcode, !or(xo, 0x40), (outs wacc:$AT), !con((ins wacc:$ATi), IOL),
!strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def WNN : XX3Form_AT3_XAB6<
opcode, !or(xo, 0xC0), (outs wacc:$AT), !con((ins wacc:$ATi), IOL),
!strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
let Predicates = [MMA, PrefixInstrs, IsNotISAFuture] in {
def PM#NAME#PN :
@@ -251,21 +251,21 @@ multiclass ACC_NEG_UM_M244_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
!con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
!strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def PM#NAME#NP :
MMIRR_XX3Form_XY4P2_XAB6<
opcode, !or(xo, 0x40), (outs acc:$AT),
!con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
!strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def PM#NAME#NN :
MMIRR_XX3Form_XY4P2_XAB6<
opcode, !or(xo, 0xC0), (outs acc:$AT),
!con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
!strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in {
def PM#NAME#WPN :
@@ -274,21 +274,21 @@ multiclass ACC_NEG_UM_M244_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
!con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
!strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def PM#NAME#WNP :
MMIRR_XX3Form_XY4P2_XAB6<
opcode, !or(xo, 0x40), (outs wacc:$AT),
!con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
!strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def PM#NAME#WNN :
MMIRR_XX3Form_XY4P2_XAB6<
opcode, !or(xo, 0xC0), (outs wacc:$AT),
!con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
!strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
}
@@ -301,29 +301,29 @@ multiclass ACC_NEG_UM_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
def PN : XX3Form_AT3_XAB6<opcode, !or(xo, 0x80), (outs acc:$AT),
!con((ins acc:$ATi), IOL),
!strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def NP : XX3Form_AT3_XAB6<opcode, !or(xo, 0x40), (outs acc:$AT),
!con((ins acc:$ATi), IOL),
!strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def NN : XX3Form_AT3_XAB6<opcode, !or(xo, 0xC0), (outs acc:$AT),
!con((ins acc:$ATi), IOL),
!strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in {
def WPN : XX3Form_AT3_XAB6<opcode, !or(xo, 0x80), (outs wacc:$AT),
!con((ins wacc:$ATi), IOL),
!strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def WNP : XX3Form_AT3_XAB6<opcode, !or(xo, 0x40), (outs wacc:$AT),
!con((ins wacc:$ATi), IOL),
!strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def WNN : XX3Form_AT3_XAB6<opcode, !or(xo, 0xC0), (outs wacc:$AT),
!con((ins wacc:$ATi), IOL),
!strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
}
@@ -346,28 +346,28 @@ multiclass ACC_NEG_UM_M44_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
!con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
!strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def PM#NAME#PN :
MMIRR_XX3Form_XY4_XAB6<
opcode, !or(xo, 0x80), (outs acc:$AT),
!con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
!strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def PM#NAME#NP :
MMIRR_XX3Form_XY4_XAB6<
opcode, !or(xo, 0x40), (outs acc:$AT),
!con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
!strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def PM#NAME#NN :
MMIRR_XX3Form_XY4_XAB6<
opcode, !or(xo, 0xC0), (outs acc:$AT),
!con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
!strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in {
def PM#NAME#W :
@@ -383,28 +383,28 @@ multiclass ACC_NEG_UM_M44_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
!con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
!strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def PM#NAME#WPN :
MMIRR_XX3Form_XY4_XAB6<
opcode, !or(xo, 0x80), (outs wacc:$AT),
!con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
!strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def PM#NAME#WNP :
MMIRR_XX3Form_XY4_XAB6<
opcode, !or(xo, 0x40), (outs wacc:$AT),
!con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
!strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def PM#NAME#WNN :
MMIRR_XX3Form_XY4_XAB6<
opcode, !or(xo, 0xC0), (outs wacc:$AT),
!con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
!strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
}
@@ -427,28 +427,28 @@ multiclass ACC_NEG_UM_M42_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
!con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
!strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def PM#NAME#PN :
MMIRR_XX3Form_X4Y2_XAB6<
opcode, !or(xo, 0x80), (outs acc:$AT),
!con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
!strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def PM#NAME#NP :
MMIRR_XX3Form_X4Y2_XAB6<
opcode, !or(xo, 0x40), (outs acc:$AT),
!con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
!strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def PM#NAME#NN :
MMIRR_XX3Form_X4Y2_XAB6<
opcode, !or(xo, 0xC0), (outs acc:$AT),
!con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
!strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in {
def PM#NAME#W :
@@ -464,28 +464,28 @@ multiclass ACC_NEG_UM_M42_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
!con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
!strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def PM#NAME#WPN :
MMIRR_XX3Form_X4Y2_XAB6<
opcode, !or(xo, 0x80), (outs wacc:$AT),
!con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
!strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def PM#NAME#WNP :
MMIRR_XX3Form_X4Y2_XAB6<
opcode, !or(xo, 0x40), (outs wacc:$AT),
!con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
!strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def PM#NAME#WNN :
MMIRR_XX3Form_X4Y2_XAB6<
opcode, !or(xo, 0xC0), (outs wacc:$AT),
!con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
!strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"),
IIC_VecFP, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
}
@@ -497,12 +497,12 @@ let Predicates = [MMA, IsNotISAFuture] in {
XForm_AT3<31, 0, 177, (outs acc:$ATo), (ins acc:$AT), "xxmfacc $AT",
IIC_VecGeneral,
[(set v512i1:$ATo, (int_ppc_mma_xxmfacc v512i1:$AT))]>,
- RegConstraint<"$ATo = $AT">, NoEncode<"$ATo">;
+ RegConstraint<"$ATo = $AT">;
def XXMTACC :
XForm_AT3<31, 1, 177, (outs acc:$AT), (ins acc:$ATi), "xxmtacc $AT",
IIC_VecGeneral,
[(set v512i1:$AT, (int_ppc_mma_xxmtacc v512i1:$ATi))]>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
def KILL_PAIR : PPCPostRAExpPseudo<(outs vsrprc:$XTp), (ins vsrprc:$XSp),
"#KILL_PAIR", []>,
RegConstraint<"$XTp = $XSp">;
@@ -519,7 +519,7 @@ let Predicates = [MMA, IsNotISAFuture] in {
def XVI8GER4SPP :
XX3Form_AT3_XAB6<59, 99, (outs acc:$AT), (ins acc:$ATi, vsrc:$XA, vsrc:$XB),
"xvi8ger4spp $AT, $XA, $XB", IIC_VecGeneral, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
let mayStore = 1 in {
def SPILL_ACC: PPCEmitTimePseudo<(outs), (ins acc:$AT, memrix16:$dst),
"#SPILL_ACC", []>;
@@ -544,11 +544,11 @@ let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in {
def XXMFACCW :
XForm_AT3<31, 0, 177, (outs wacc:$ATo), (ins wacc:$AT), "xxmfacc $AT",
IIC_VecGeneral, []>,
- RegConstraint<"$ATo = $AT">, NoEncode<"$ATo">;
+ RegConstraint<"$ATo = $AT">;
def XXMTACCW :
XForm_AT3<31, 1, 177, (outs wacc:$AT), (ins wacc:$ATi), "xxmtacc $AT",
IIC_VecGeneral, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
def DMXXSETACCZ :
@@ -560,7 +560,7 @@ let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in {
XX3Form_AT3_XAB6<59, 99, (outs wacc:$AT),
(ins wacc:$ATi, vsrc:$XA, vsrc:$XB),
"xvi8ger4spp $AT, $XA, $XB", IIC_VecGeneral, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
let mayStore = 1 in {
def SPILL_WACC: PPCEmitTimePseudo<(outs), (ins wacc:$AT, memrix16:$dst),
@@ -593,7 +593,7 @@ let Predicates = [MMA, PrefixInstrs, IsNotISAFuture] in {
u4imm:$YMSK, u4imm:$PMSK),
"pmxvi8ger4spp $AT, $XA, $XB, $XMSK, $YMSK, $PMSK",
IIC_VecGeneral, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in {
@@ -603,7 +603,7 @@ let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in {
u4imm:$YMSK, u4imm:$PMSK),
"pmxvi8ger4spp $AT, $XA, $XB, $XMSK, $YMSK, $PMSK",
IIC_VecGeneral, []>,
- RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ RegConstraint<"$ATi = $AT">;
}
// MMA accumulating/non-accumulating instructions.
diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td
index c4a027d65b66..149a44ddfc10 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrP10.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td
@@ -125,8 +125,8 @@ class PI<bits<6> pref, bits<6> opcode, dag OOL, dag IOL, string asmstr,
let InOperandList = IOL;
let AsmString = asmstr;
let Itinerary = itin;
- let Inst{0-5} = pref;
- let Inst{32-37} = opcode;
+ let Inst{0...5} = pref;
+ let Inst{32...37} = opcode;
bits<1> PPC970_First = 0;
bits<1> PPC970_Single = 0;
@@ -138,7 +138,7 @@ class PI<bits<6> pref, bits<6> opcode, dag OOL, dag IOL, string asmstr,
let TSFlags{0} = PPC970_First;
let TSFlags{1} = PPC970_Single;
let TSFlags{2} = PPC970_Cracked;
- let TSFlags{5-3} = PPC970_Unit;
+ let TSFlags{5...3} = PPC970_Unit;
bits<1> Prefixed = 1; // This is a prefixed instruction.
let TSFlags{7} = Prefixed;
@@ -167,11 +167,11 @@ class VXForm_VTB5_RC<bits<10> xo, bits<5> R, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = VT;
- let Inst{11-15} = R;
- let Inst{16-20} = VB;
+ let Inst{6...10} = VT;
+ let Inst{11...15} = R;
+ let Inst{16...20} = VB;
let Inst{21} = RC;
- let Inst{22-31} = xo;
+ let Inst{22...31} = xo;
}
// Multiclass definition to account for record and non-record form
@@ -200,16 +200,16 @@ class MLS_DForm_R_SI34_RTA5_MEM<bits<6> opcode, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
// The prefix.
- let Inst{6-7} = 2;
- let Inst{8-10} = 0;
+ let Inst{6...7} = 2;
+ let Inst{8...10} = 0;
let Inst{11} = PCRel;
- let Inst{12-13} = 0;
- let Inst{14-31} = D{33-16}; // d0
+ let Inst{12...13} = 0;
+ let Inst{14...31} = D{33...16}; // d0
// The instruction.
- let Inst{38-42} = RST{4-0};
- let Inst{43-47} = RA;
- let Inst{48-63} = D{15-0}; // d1
+ let Inst{38...42} = RST{4...0};
+ let Inst{43...47} = RA;
+ let Inst{48...63} = D{15...0}; // d1
}
class MLS_DForm_R_SI34_RTA5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
@@ -222,16 +222,16 @@ class MLS_DForm_R_SI34_RTA5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
// The prefix.
- let Inst{6-7} = 2;
- let Inst{8-10} = 0;
+ let Inst{6...7} = 2;
+ let Inst{8...10} = 0;
let Inst{11} = PCRel;
- let Inst{12-13} = 0;
- let Inst{14-31} = SI{33-16};
+ let Inst{12...13} = 0;
+ let Inst{14...31} = SI{33...16};
// The instruction.
- let Inst{38-42} = RT;
- let Inst{43-47} = RA;
- let Inst{48-63} = SI{15-0};
+ let Inst{38...42} = RT;
+ let Inst{43...47} = RA;
+ let Inst{48...63} = SI{15...0};
}
class MLS_DForm_SI34_RT5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
@@ -243,16 +243,16 @@ class MLS_DForm_SI34_RT5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
// The prefix.
- let Inst{6-7} = 2;
- let Inst{8-10} = 0;
+ let Inst{6...7} = 2;
+ let Inst{8...10} = 0;
let Inst{11} = 0;
- let Inst{12-13} = 0;
- let Inst{14-31} = SI{33-16};
+ let Inst{12...13} = 0;
+ let Inst{14...31} = SI{33...16};
// The instruction.
- let Inst{38-42} = RT;
- let Inst{43-47} = 0;
- let Inst{48-63} = SI{15-0};
+ let Inst{38...42} = RT;
+ let Inst{43...47} = 0;
+ let Inst{48...63} = SI{15...0};
}
multiclass MLS_DForm_R_SI34_RTA5_p<bits<6> opcode, dag OOL, dag IOL,
@@ -274,15 +274,15 @@ class 8LS_DForm_R_SI34_RTA5_MEM<bits<6> opcode, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
// The prefix.
- let Inst{6-10} = 0;
+ let Inst{6...10} = 0;
let Inst{11} = PCRel;
- let Inst{12-13} = 0;
- let Inst{14-31} = D{33-16}; // d0
+ let Inst{12...13} = 0;
+ let Inst{14...31} = D{33...16}; // d0
// The instruction.
- let Inst{38-42} = RST{4-0};
- let Inst{43-47} = RA;
- let Inst{48-63} = D{15-0}; // d1
+ let Inst{38...42} = RST{4...0};
+ let Inst{43...47} = RA;
+ let Inst{48...63} = D{15...0}; // d1
}
// 8LS:D-Form: [ 1 0 0 // R // d0
@@ -298,18 +298,18 @@ class 8LS_DForm_R_SI34_XT6_RA5_MEM<bits<5> opcode, dag OOL, dag IOL,
let Pattern = pattern;
// The prefix.
- let Inst{6-7} = 0;
+ let Inst{6...7} = 0;
let Inst{8} = 0;
- let Inst{9-10} = 0; // reserved
+ let Inst{9...10} = 0; // reserved
let Inst{11} = PCRel;
- let Inst{12-13} = 0; // reserved
- let Inst{14-31} = D{33-16}; // d0
+ let Inst{12...13} = 0; // reserved
+ let Inst{14...31} = D{33...16}; // d0
// The instruction.
let Inst{37} = XST{5};
- let Inst{38-42} = XST{4-0};
- let Inst{43-47} = RA;
- let Inst{48-63} = D{15-0}; // d1
+ let Inst{38...42} = XST{4...0};
+ let Inst{43...47} = RA;
+ let Inst{48...63} = D{15...0}; // d1
}
// X-Form: [PO T IMM VRB XO TX]
@@ -321,10 +321,10 @@ class XForm_XT6_IMM5_VB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
bits<5> IMM;
let Pattern = pattern;
- let Inst{6-10} = XT{4-0};
- let Inst{11-15} = IMM;
- let Inst{16-20} = VRB;
- let Inst{21-30} = xo;
+ let Inst{6...10} = XT{4...0};
+ let Inst{11...15} = IMM;
+ let Inst{16...20} = VRB;
+ let Inst{21...30} = xo;
let Inst{31} = XT{5};
}
@@ -341,19 +341,19 @@ class 8RR_XX4Form_IMM8_XTAB6<bits<6> opcode, bits<2> xo,
let Pattern = pattern;
// The prefix.
- let Inst{6-7} = 1;
+ let Inst{6...7} = 1;
let Inst{8} = 0;
- let Inst{9-11} = 0;
- let Inst{12-13} = 0;
- let Inst{14-23} = 0;
- let Inst{24-31} = IMM;
+ let Inst{9...11} = 0;
+ let Inst{12...13} = 0;
+ let Inst{14...23} = 0;
+ let Inst{24...31} = IMM;
// The instruction.
- let Inst{38-42} = XT{4-0};
- let Inst{43-47} = XA{4-0};
- let Inst{48-52} = XB{4-0};
- let Inst{53-57} = XC{4-0};
- let Inst{58-59} = xo;
+ let Inst{38...42} = XT{4...0};
+ let Inst{43...47} = XA{4...0};
+ let Inst{48...52} = XB{4...0};
+ let Inst{53...57} = XC{4...0};
+ let Inst{58...59} = xo;
let Inst{60} = XC{5};
let Inst{61} = XA{5};
let Inst{62} = XB{5};
@@ -369,11 +369,11 @@ class VXForm_RD5_N3_VB5<bits<11> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = RD;
- let Inst{11-12} = 0;
- let Inst{13-15} = N;
- let Inst{16-20} = VB;
- let Inst{21-31} = xo;
+ let Inst{6...10} = RD;
+ let Inst{11...12} = 0;
+ let Inst{13...15} = N;
+ let Inst{16...20} = VB;
+ let Inst{21...31} = xo;
}
@@ -382,14 +382,14 @@ class VXForm_RD5_N3_VB5<bits<11> xo, dag OOL, dag IOL, string asmstr,
class VXForm_VTB5_RA5_ins<bits<11> xo, string opc, list<dag> pattern>
: VXForm_1<xo, (outs vrrc:$VD), (ins vrrc:$VDi, gprc:$VA, vrrc:$VB),
!strconcat(opc, " $VD, $VA, $VB"), IIC_VecGeneral, pattern>,
- RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+ RegConstraint<"$VDi = $VD">;
// VX-Form: [PO VRT RA RB XO].
// Destructive (insert) forms are suffixed with _ins.
class VXForm_VRT5_RAB5_ins<bits<11> xo, string opc, list<dag> pattern>
: VXForm_1<xo, (outs vrrc:$VD), (ins vrrc:$VDi, gprc:$VA, gprc:$VB),
!strconcat(opc, " $VD, $VA, $VB"), IIC_VecGeneral, pattern>,
- RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+ RegConstraint<"$VDi = $VD">;
// VX-Form: [ PO BF // VRA VRB XO ]
class VXForm_BF3_VAB5<bits<11> xo, dag OOL, dag IOL, string asmstr,
@@ -401,11 +401,11 @@ class VXForm_BF3_VAB5<bits<11> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-8} = BF;
- let Inst{9-10} = 0;
- let Inst{11-15} = VA;
- let Inst{16-20} = VB;
- let Inst{21-31} = xo;
+ let Inst{6...8} = BF;
+ let Inst{9...10} = 0;
+ let Inst{11...15} = VA;
+ let Inst{16...20} = VB;
+ let Inst{21...31} = xo;
}
// VN-Form: [PO VRT VRA VRB PS SD XO]
@@ -420,12 +420,12 @@ class VNForm_VTAB5_SD3<bits<6> xo, bits<2> ps, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = VRT;
- let Inst{11-15} = VRA;
- let Inst{16-20} = VRB;
- let Inst{21-22} = ps;
- let Inst{23-25} = SD;
- let Inst{26-31} = xo;
+ let Inst{6...10} = VRT;
+ let Inst{11...15} = VRA;
+ let Inst{16...20} = VRB;
+ let Inst{21...22} = ps;
+ let Inst{23...25} = SD;
+ let Inst{26...31} = xo;
}
class VXForm_RD5_MP_VB5<bits<11> xo, bits<4> eo, dag OOL, dag IOL,
@@ -437,11 +437,11 @@ class VXForm_RD5_MP_VB5<bits<11> xo, bits<4> eo, dag OOL, dag IOL,
let Pattern = pattern;
- let Inst{6-10} = RD;
- let Inst{11-14} = eo;
+ let Inst{6...10} = RD;
+ let Inst{11...14} = eo;
let Inst{15} = MP;
- let Inst{16-20} = VB;
- let Inst{21-31} = xo;
+ let Inst{16...20} = VB;
+ let Inst{21...31} = xo;
}
// 8RR:D-Form: [ 1 1 0 // // imm0
@@ -456,17 +456,17 @@ class 8RR_DForm_IMM32_XT6<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
let Pattern = pattern;
// The prefix.
- let Inst{6-7} = 1;
- let Inst{8-11} = 0;
- let Inst{12-13} = 0; // reserved
- let Inst{14-15} = 0; // reserved
- let Inst{16-31} = IMM32{31-16};
+ let Inst{6...7} = 1;
+ let Inst{8...11} = 0;
+ let Inst{12...13} = 0; // reserved
+ let Inst{14...15} = 0; // reserved
+ let Inst{16...31} = IMM32{31...16};
// The instruction.
- let Inst{38-42} = XT{4-0};
- let Inst{43-46} = xo;
+ let Inst{38...42} = XT{4...0};
+ let Inst{43...46} = xo;
let Inst{47} = XT{5};
- let Inst{48-63} = IMM32{15-0};
+ let Inst{48...63} = IMM32{15...0};
}
// 8RR:D-Form: [ 1 1 0 // // imm0
@@ -482,18 +482,18 @@ class 8RR_DForm_IMM32_XT6_IX<bits<6> opcode, bits<3> xo, dag OOL, dag IOL,
let Pattern = pattern;
// The prefix.
- let Inst{6-7} = 1;
- let Inst{8-11} = 0;
- let Inst{12-13} = 0; // reserved
- let Inst{14-15} = 0; // reserved
- let Inst{16-31} = IMM32{31-16};
+ let Inst{6...7} = 1;
+ let Inst{8...11} = 0;
+ let Inst{12...13} = 0; // reserved
+ let Inst{14...15} = 0; // reserved
+ let Inst{16...31} = IMM32{31...16};
// The instruction.
- let Inst{38-42} = XT{4-0};
- let Inst{43-45} = xo;
+ let Inst{38...42} = XT{4...0};
+ let Inst{43...45} = xo;
let Inst{46} = IX;
let Inst{47} = XT{5};
- let Inst{48-63} = IMM32{15-0};
+ let Inst{48...63} = IMM32{15...0};
}
class 8RR_XX4Form_XTABC6<bits<6> opcode, bits<2> xo, dag OOL, dag IOL,
@@ -507,17 +507,17 @@ class 8RR_XX4Form_XTABC6<bits<6> opcode, bits<2> xo, dag OOL, dag IOL,
let Pattern = pattern;
// The prefix.
- let Inst{6-7} = 1;
- let Inst{8-11} = 0;
- let Inst{12-13} = 0;
- let Inst{14-31} = 0;
+ let Inst{6...7} = 1;
+ let Inst{8...11} = 0;
+ let Inst{12...13} = 0;
+ let Inst{14...31} = 0;
// The instruction.
- let Inst{38-42} = XT{4-0};
- let Inst{43-47} = XA{4-0};
- let Inst{48-52} = XB{4-0};
- let Inst{53-57} = XC{4-0};
- let Inst{58-59} = xo;
+ let Inst{38...42} = XT{4...0};
+ let Inst{43...47} = XA{4...0};
+ let Inst{48...52} = XB{4...0};
+ let Inst{53...57} = XC{4...0};
+ let Inst{58...59} = xo;
let Inst{60} = XC{5};
let Inst{61} = XA{5};
let Inst{62} = XB{5};
@@ -537,18 +537,18 @@ class 8RR_XX4Form_IMM3_XTABC6<bits<6> opcode, bits<2> xo, dag OOL, dag IOL,
let Pattern = pattern;
// The prefix.
- let Inst{6-7} = 1;
- let Inst{8-11} = 0;
- let Inst{12-13} = 0;
- let Inst{14-28} = 0;
- let Inst{29-31} = IMM;
+ let Inst{6...7} = 1;
+ let Inst{8...11} = 0;
+ let Inst{12...13} = 0;
+ let Inst{14...28} = 0;
+ let Inst{29...31} = IMM;
// The instruction.
- let Inst{38-42} = XT{4-0};
- let Inst{43-47} = XA{4-0};
- let Inst{48-52} = XB{4-0};
- let Inst{53-57} = XC{4-0};
- let Inst{58-59} = xo;
+ let Inst{38...42} = XT{4...0};
+ let Inst{43...47} = XA{4...0};
+ let Inst{48...52} = XB{4...0};
+ let Inst{53...57} = XC{4...0};
+ let Inst{58...59} = xo;
let Inst{60} = XC{5};
let Inst{61} = XA{5};
let Inst{62} = XB{5};
@@ -565,11 +565,11 @@ class XX2_BF3_XO5_XB6_XO9<bits<6> opcode, bits<5> xo2, bits<9> xo, dag OOL,
let Pattern = pattern;
- let Inst{6-8} = BF;
- let Inst{9-10} = 0;
- let Inst{11-15} = xo2;
- let Inst{16-20} = XB{4-0};
- let Inst{21-29} = xo;
+ let Inst{6...8} = BF;
+ let Inst{9...10} = 0;
+ let Inst{11...15} = xo2;
+ let Inst{16...20} = XB{4...0};
+ let Inst{21...29} = xo;
let Inst{30} = XB{5};
let Inst{31} = 0;
}
@@ -863,11 +863,11 @@ class DQForm_XTp5_RA17_MEM<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
let Pattern = pattern;
- let Inst{6-9} = XTp{3-0};
+ let Inst{6...9} = XTp{3...0};
let Inst{10} = XTp{4};
- let Inst{11-15} = RA;
- let Inst{16-27} = DQ;
- let Inst{28-31} = xo;
+ let Inst{11...15} = RA;
+ let Inst{16...27} = DQ;
+ let Inst{28...31} = xo;
}
class XForm_XTp5_XAB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
@@ -878,11 +878,11 @@ class XForm_XTp5_XAB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
bits<5> RB;
let Pattern = pattern;
- let Inst{6-9} = XTp{3-0};
+ let Inst{6...9} = XTp{3...0};
let Inst{10} = XTp{4};
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-30} = xo;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -896,16 +896,16 @@ class 8LS_DForm_R_XTp5_SI34_MEM<bits<6> opcode, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
// The prefix.
- let Inst{6-10} = 0;
+ let Inst{6...10} = 0;
let Inst{11} = PCRel;
- let Inst{12-13} = 0;
- let Inst{14-31} = D{33-16}; // Imm18
+ let Inst{12...13} = 0;
+ let Inst{14...31} = D{33...16}; // Imm18
// The instruction.
- let Inst{38-41} = XTp{3-0};
+ let Inst{38...41} = XTp{3...0};
let Inst{42} = XTp{4};
- let Inst{43-47} = RA;
- let Inst{48-63} = D{15-0};
+ let Inst{43...47} = RA;
+ let Inst{48...63} = D{15...0};
}
multiclass 8LS_DForm_R_XTp5_SI34_MEM_p<bits<6> opcode, dag OOL,
@@ -935,11 +935,11 @@ class XForm_AT3<bits<6> opcode, bits<5> xo2, bits<10> xo, dag OOL, dag IOL,
let Pattern = pattern;
- let Inst{6-8} = AT;
- let Inst{9-10} = 0;
- let Inst{11-15} = xo2;
- let Inst{16-20} = 0;
- let Inst{21-30} = xo;
+ let Inst{6...8} = AT;
+ let Inst{9...10} = 0;
+ let Inst{11...15} = xo2;
+ let Inst{16...20} = 0;
+ let Inst{21...30} = xo;
let Inst{31} = 0;
}
@@ -952,10 +952,10 @@ class XForm_XT6_IMM5<bits<6> opcode, bits<5> eo, bits<10> xo, dag OOL, dag IOL,
let Pattern = pattern;
- let Inst{6-10} = XT{4-0};
- let Inst{11-15} = eo;
- let Inst{16-20} = UIM;
- let Inst{21-30} = xo;
+ let Inst{6...10} = XT{4...0};
+ let Inst{11...15} = eo;
+ let Inst{16...20} = UIM;
+ let Inst{21...30} = xo;
let Inst{31} = XT{5};
}
@@ -969,11 +969,11 @@ class XX3Form_AT3_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
let Pattern = pattern;
- let Inst{6-8} = AT;
- let Inst{9-10} = 0;
- let Inst{11-15} = XA{4-0};
- let Inst{16-20} = XB{4-0};
- let Inst{21-28} = xo;
+ let Inst{6...8} = AT;
+ let Inst{9...10} = 0;
+ let Inst{11...15} = XA{4...0};
+ let Inst{16...20} = XB{4...0};
+ let Inst{21...28} = xo;
let Inst{29} = XA{5};
let Inst{30} = XB{5};
let Inst{31} = 0;
@@ -993,20 +993,20 @@ class MMIRR_XX3Form_XY4P2_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
let Pattern = pattern;
// The prefix.
- let Inst{6-7} = 3;
- let Inst{8-11} = 9;
- let Inst{12-15} = 0;
- let Inst{16-17} = PMSK;
- let Inst{18-23} = 0;
- let Inst{24-27} = XMSK;
- let Inst{28-31} = YMSK;
+ let Inst{6...7} = 3;
+ let Inst{8...11} = 9;
+ let Inst{12...15} = 0;
+ let Inst{16...17} = PMSK;
+ let Inst{18...23} = 0;
+ let Inst{24...27} = XMSK;
+ let Inst{28...31} = YMSK;
// The instruction.
- let Inst{38-40} = AT;
- let Inst{41-42} = 0;
- let Inst{43-47} = XA{4-0};
- let Inst{48-52} = XB{4-0};
- let Inst{53-60} = xo;
+ let Inst{38...40} = AT;
+ let Inst{41...42} = 0;
+ let Inst{43...47} = XA{4...0};
+ let Inst{48...52} = XB{4...0};
+ let Inst{53...60} = xo;
let Inst{61} = XA{5};
let Inst{62} = XB{5};
let Inst{63} = 0;
@@ -1025,18 +1025,18 @@ class MMIRR_XX3Form_XY4_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
let Pattern = pattern;
// The prefix.
- let Inst{6-7} = 3;
- let Inst{8-11} = 9;
- let Inst{12-23} = 0;
- let Inst{24-27} = XMSK;
- let Inst{28-31} = YMSK;
+ let Inst{6...7} = 3;
+ let Inst{8...11} = 9;
+ let Inst{12...23} = 0;
+ let Inst{24...27} = XMSK;
+ let Inst{28...31} = YMSK;
// The instruction.
- let Inst{38-40} = AT;
- let Inst{41-42} = 0;
- let Inst{43-47} = XA{4-0};
- let Inst{48-52} = XB{4-0};
- let Inst{53-60} = xo;
+ let Inst{38...40} = AT;
+ let Inst{41...42} = 0;
+ let Inst{43...47} = XA{4...0};
+ let Inst{48...52} = XB{4...0};
+ let Inst{53...60} = xo;
let Inst{61} = XA{5};
let Inst{62} = XB{5};
let Inst{63} = 0;
@@ -1055,19 +1055,19 @@ class MMIRR_XX3Form_X4Y2_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
let Pattern = pattern;
// The prefix.
- let Inst{6-7} = 3;
- let Inst{8-11} = 9;
- let Inst{12-23} = 0;
- let Inst{24-27} = XMSK;
- let Inst{28-29} = YMSK;
- let Inst{30-31} = 0;
+ let Inst{6...7} = 3;
+ let Inst{8...11} = 9;
+ let Inst{12...23} = 0;
+ let Inst{24...27} = XMSK;
+ let Inst{28...29} = YMSK;
+ let Inst{30...31} = 0;
// The instruction.
- let Inst{38-40} = AT;
- let Inst{41-42} = 0;
- let Inst{43-47} = XA{4-0};
- let Inst{48-52} = XB{4-0};
- let Inst{53-60} = xo;
+ let Inst{38...40} = AT;
+ let Inst{41...42} = 0;
+ let Inst{43...47} = XA{4...0};
+ let Inst{48...52} = XB{4...0};
+ let Inst{53...60} = xo;
let Inst{61} = XA{5};
let Inst{62} = XB{5};
let Inst{63} = 0;
@@ -1087,19 +1087,19 @@ class MMIRR_XX3Form_XY4P8_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
let Pattern = pattern;
// The prefix.
- let Inst{6-7} = 3;
- let Inst{8-11} = 9;
- let Inst{12-15} = 0;
- let Inst{16-23} = PMSK;
- let Inst{24-27} = XMSK;
- let Inst{28-31} = YMSK;
+ let Inst{6...7} = 3;
+ let Inst{8...11} = 9;
+ let Inst{12...15} = 0;
+ let Inst{16...23} = PMSK;
+ let Inst{24...27} = XMSK;
+ let Inst{28...31} = YMSK;
// The instruction.
- let Inst{38-40} = AT;
- let Inst{41-42} = 0;
- let Inst{43-47} = XA{4-0};
- let Inst{48-52} = XB{4-0};
- let Inst{53-60} = xo;
+ let Inst{38...40} = AT;
+ let Inst{41...42} = 0;
+ let Inst{43...47} = XA{4...0};
+ let Inst{48...52} = XB{4...0};
+ let Inst{53...60} = xo;
let Inst{61} = XA{5};
let Inst{62} = XB{5};
let Inst{63} = 0;
@@ -1119,20 +1119,20 @@ class MMIRR_XX3Form_XYP4_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
let Pattern = pattern;
// The prefix.
- let Inst{6-7} = 3;
- let Inst{8-11} = 9;
- let Inst{12-15} = 0;
- let Inst{16-19} = PMSK;
- let Inst{20-23} = 0;
- let Inst{24-27} = XMSK;
- let Inst{28-31} = YMSK;
+ let Inst{6...7} = 3;
+ let Inst{8...11} = 9;
+ let Inst{12...15} = 0;
+ let Inst{16...19} = PMSK;
+ let Inst{20...23} = 0;
+ let Inst{24...27} = XMSK;
+ let Inst{28...31} = YMSK;
// The instruction.
- let Inst{38-40} = AT;
- let Inst{41-42} = 0;
- let Inst{43-47} = XA{4-0};
- let Inst{48-52} = XB{4-0};
- let Inst{53-60} = xo;
+ let Inst{38...40} = AT;
+ let Inst{41...42} = 0;
+ let Inst{43...47} = XA{4...0};
+ let Inst{48...52} = XB{4...0};
+ let Inst{53...60} = xo;
let Inst{61} = XA{5};
let Inst{62} = XB{5};
let Inst{63} = 0;
@@ -1395,7 +1395,7 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1, Predicates = [P
[(set v2i64:$XT,
(PPCxxsplti32dx v2i64:$XTi, i32:$IX,
i32:$IMM32))]>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
+ RegConstraint<"$XTi = $XT">;
}
let Predicates = [IsISA3_1] in {
@@ -1466,13 +1466,13 @@ let Predicates = [IsISA3_1] in {
"vinsw $VD, $VB, $VA", IIC_VecGeneral,
[(set v4i32:$VD,
(int_ppc_altivec_vinsw v4i32:$VDi, i32:$VB, timm:$VA))]>,
- RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+ RegConstraint<"$VDi = $VD">;
def VINSD :
VXForm_1<463, (outs vrrc:$VD), (ins vrrc:$VDi, u4imm:$VA, g8rc:$VB),
"vinsd $VD, $VB, $VA", IIC_VecGeneral,
[(set v2i64:$VD,
(int_ppc_altivec_vinsd v2i64:$VDi, i64:$VB, timm:$VA))]>,
- RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+ RegConstraint<"$VDi = $VD">;
def VINSBVLX :
VXForm_VTB5_RA5_ins<15, "vinsbvlx",
[(set v16i8:$VD,
@@ -1538,13 +1538,13 @@ let Predicates = [IsISA3_1] in {
"vinsdlx $VD, $VA, $VB", IIC_VecGeneral,
[(set v2i64:$VD,
(int_ppc_altivec_vinsdlx v2i64:$VDi, i64:$VA, i64:$VB))]>,
- RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+ RegConstraint<"$VDi = $VD">;
def VINSDRX :
VXForm_1<975, (outs vrrc:$VD), (ins vrrc:$VDi, g8rc:$VA, g8rc:$VB),
"vinsdrx $VD, $VA, $VB", IIC_VecGeneral,
[(set v2i64:$VD,
(int_ppc_altivec_vinsdrx v2i64:$VDi, i64:$VA, i64:$VB))]>,
- RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+ RegConstraint<"$VDi = $VD">;
def VEXTRACTBM : VXForm_RD5_XO5_RS5<1602, 8, (outs gprc:$VD), (ins vrrc:$VB),
"vextractbm $VD, $VB", IIC_VecGeneral,
[(set i32:$VD,
@@ -1915,10 +1915,11 @@ let Predicates = [IsISA3_1] in {
[(set v1i128:$VD,
(int_ppc_altivec_vrlqmi v1i128:$VA, v1i128:$VB,
v1i128:$VDi))]>,
- RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+ RegConstraint<"$VDi = $VD">;
def VSLQ : VX1_VT5_VA5_VB5<261, "vslq", []>;
def VSRAQ : VX1_VT5_VA5_VB5<773, "vsraq", []>;
- def VSRQ : VX1_VT5_VA5_VB5<517, "vsrq", []>;
+ def VSRQ : VX1_VT5_VA5_VB5<517, "vsrq",
+ [(set v4i32:$VD, (PPCvsrq v4i32:$VA, v4i32:$VB))]>;
def VRLQ : VX1_VT5_VA5_VB5<5, "vrlq", []>;
def XSCVQPUQZ : X_VT5_XO5_VB5<63, 0, 836, "xscvqpuqz", []>;
def XSCVQPSQZ : X_VT5_XO5_VB5<63, 8, 836, "xscvqpsqz", []>;
@@ -2053,6 +2054,9 @@ let Predicates = [IsISA3_1, HasFPU] in {
//---------------------------- Anonymous Patterns ----------------------------//
let Predicates = [IsISA3_1] in {
+ // Exploit the vsrq instruction: fold VSR(VSRO(input, vsro_byte_shift),
+ // vsr_bit_shift) into VSRQ(input, vsrq_bit_shift).
+ def : Pat<(VSRVSRO v4i32:$vA, v4i32:$vB), (VSRQ $vA, $vB)>;
// Exploit the vector multiply high instructions using intrinsics.
def : Pat<(v4i32 (int_ppc_altivec_vmulhsw v4i32:$vA, v4i32:$vB)),
(v4i32 (VMULHSW $vA, $vB))>;
@@ -2230,6 +2234,13 @@ def VEqv
(v4i32(bitconvert node:$a)),
(v4i32(bitconvert node:$b)))))]>;
+// Vector NAND, not(and(a, b)): matched directly or via v4i32 bitconverts.
+def VNand
+ : PatFrags<(ops node:$a, node:$b), [(vnot(and node:$a, node:$b)),
+ (bitconvert(vnot(and
+ (v4i32(bitconvert node:$a)),
+ (v4i32(bitconvert node:$b)))))]>;
+
// =============================================================================
// XXEVAL Ternary Pattern Multiclass: XXEvalTernarySelectAnd
// This class matches the equivalent Ternary Operation: A ? f(B,C) : AND(B,C)
@@ -2265,6 +2276,56 @@ multiclass XXEvalTernarySelectAnd<ValueType Vt> {
Vt, (vselect Vt:$vA, (VNot Vt:$vB), (VAnd Vt:$vB, Vt:$vC)), 28>;
}
+// =============================================================================
+// XXEVAL Ternary Pattern Multiclass: XXEvalTernarySelectB
+// This class matches the equivalent Ternary Operation: A ? f(B,C) : B
+// and emits the corresponding xxeval instruction with the imm value.
+//
+// The patterns implement xxeval vector select operations where:
+// - A is the selector vector
+// - f(B,C) is the "true" case op on vectors B and C (AND, NOR, EQV, NAND)
+// - B is the "false" case operand (vector B)
+//
+// Note: Patterns (A ? C : B) and (A ? not(C) : B) are not considered
+// for the XXEVAL instruction (4 cycles) as the XXSEL instruction (3 cycles)
+// performs better.
+// =============================================================================
+multiclass XXEvalTernarySelectB<ValueType Vt>{
+ // Pattern: (A ? AND(B,C) : B) XXEVAL immediate value: 49
+ def : XXEvalPattern<Vt, (vselect Vt:$vA, (VAnd Vt:$vB, Vt:$vC), Vt:$vB), 49>;
+ // Pattern: (A ? NOR(B,C) : B) XXEVAL immediate value: 56
+ def : XXEvalPattern<Vt, (vselect Vt:$vA, (VNor Vt:$vB, Vt:$vC), Vt:$vB), 56>;
+ // Pattern: (A ? EQV(B,C) : B) XXEVAL immediate value: 57
+ def : XXEvalPattern<Vt, (vselect Vt:$vA, (VEqv Vt:$vB, Vt:$vC), Vt:$vB), 57>;
+ // Pattern: (A ? NAND(B,C) : B) XXEVAL immediate value: 62
+ def : XXEvalPattern<Vt, (vselect Vt:$vA, (VNand Vt:$vB, Vt:$vC), Vt:$vB), 62>;
+}
+
+// =============================================================================
+// XXEVAL Ternary Pattern Multiclass: XXEvalTernarySelectC
+// This class matches the equivalent Ternary Operation: A ? f(B,C) : C
+// and emits the corresponding xxeval instruction with the imm value.
+//
+// The patterns implement xxeval vector select operations where:
+// - A is the selector vector
+// - f(B,C) is the "true" case op on vectors B and C (AND, NOR, EQV, NAND)
+// - C is the "false" case operand (vector C)
+//
+// Note: Patterns (A ? B : C) and (A ? not(B) : C) are not considered
+// for the XXEVAL instruction (4 cycles) as the XXSEL instruction (3 cycles)
+// performs better.
+// =============================================================================
+multiclass XXEvalTernarySelectC<ValueType Vt>{
+ // Pattern: (A ? AND(B,C) : C) XXEVAL immediate value: 81
+ def : XXEvalPattern<Vt, (vselect Vt:$vA, (VAnd Vt:$vB, Vt:$vC), Vt:$vC), 81>;
+ // Pattern: (A ? NOR(B,C) : C) XXEVAL immediate value: 88
+ def : XXEvalPattern<Vt, (vselect Vt:$vA, (VNor Vt:$vB, Vt:$vC), Vt:$vC), 88>;
+ // Pattern: (A ? EQV(B,C) : C) XXEVAL immediate value: 89
+ def : XXEvalPattern<Vt, (vselect Vt:$vA, (VEqv Vt:$vB, Vt:$vC), Vt:$vC), 89>;
+ // Pattern: (A ? NAND(B,C) : C) XXEVAL immediate value: 94
+ def : XXEvalPattern<Vt, (vselect Vt:$vA, (VNand Vt:$vB, Vt:$vC), Vt:$vC), 94>;
+}
+
let Predicates = [PrefixInstrs, HasP10Vector] in {
let AddedComplexity = 400 in {
def : Pat<(v4i32 (build_vector i32immNonAllOneNonZero:$A,
@@ -2376,6 +2437,8 @@ let Predicates = [PrefixInstrs, HasP10Vector] in {
// XXEval Patterns for ternary Operations.
foreach Ty = [v4i32, v2i64, v8i16, v16i8] in {
defm : XXEvalTernarySelectAnd<Ty>;
+ defm : XXEvalTernarySelectB<Ty>;
+ defm : XXEvalTernarySelectC<Ty>;
}
// Anonymous patterns to select prefixed VSX loads and stores.
diff --git a/llvm/lib/Target/PowerPC/PPCInstrSPE.td b/llvm/lib/Target/PowerPC/PPCInstrSPE.td
index e91cae349e08..5104cc6f5607 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrSPE.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrSPE.td
@@ -20,10 +20,10 @@ class EFXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = RT;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-31} = xo;
+ let Inst{6...10} = RT;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...31} = xo;
}
class EFXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr,
@@ -45,11 +45,11 @@ class EFXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
bits<5> RA;
bits<5> RB;
- let Inst{6-8} = crD;
- let Inst{9-10} = 0;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-31} = xo;
+ let Inst{6...8} = crD;
+ let Inst{9...10} = 0;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...31} = xo;
}
class EVXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
@@ -61,10 +61,10 @@ class EVXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = RT;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-31} = xo;
+ let Inst{6...10} = RT;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...31} = xo;
}
class EVXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr,
@@ -88,11 +88,11 @@ class EVXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-8} = crD;
- let Inst{9-10} = 0;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-31} = xo;
+ let Inst{6...8} = crD;
+ let Inst{9...10} = 0;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...31} = xo;
}
class EVXForm_4<bits<8> xo, dag OOL, dag IOL, string asmstr,
@@ -105,11 +105,11 @@ class EVXForm_4<bits<8> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = RT;
- let Inst{11-15} = RA;
- let Inst{16-20} = RB;
- let Inst{21-28} = xo;
- let Inst{29-31} = crD;
+ let Inst{6...10} = RT;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = RB;
+ let Inst{21...28} = xo;
+ let Inst{29...31} = crD;
}
class EVXForm_D<bits<11> xo, dag OOL, dag IOL, string asmstr,
@@ -121,10 +121,10 @@ class EVXForm_D<bits<11> xo, dag OOL, dag IOL, string asmstr,
let Pattern = pattern;
- let Inst{6-10} = RT;
- let Inst{11-15} = RA;
- let Inst{16-20} = D;
- let Inst{21-31} = xo;
+ let Inst{6...10} = RT;
+ let Inst{11...15} = RA;
+ let Inst{16...20} = D;
+ let Inst{21...31} = xo;
}
let DecoderNamespace = "SPE", Predicates = [HasSPE] in {
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index 19448210f5db..4e5165bfcda5 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -236,7 +236,7 @@ class X_VT5_VA5_VB5_FMA<bits<6> opcode, bits<10> xo, string opc,
list<dag> pattern>
: XForm_1<opcode, xo, (outs vrrc:$RST), (ins vrrc:$RSTi, vrrc:$RA, vrrc:$RB),
!strconcat(opc, " $RST, $RA, $RB"), IIC_VecFP, pattern>,
- RegConstraint<"$RSTi = $RST">, NoEncode<"$RSTi">;
+ RegConstraint<"$RSTi = $RST">;
// [PO VRT VRA VRB XO RO], Round to Odd version of [PO VRT VRA VRB XO /]
class X_VT5_VA5_VB5_FMA_Ro<bits<6> opcode, bits<10> xo, string opc,
@@ -402,13 +402,13 @@ let hasSideEffects = 0 in {
(outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
"xsmaddadp $XT, $XA, $XB", IIC_VecFP,
[(set f64:$XT, (any_fma f64:$XA, f64:$XB, f64:$XTi))]>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XSMADDMDP : XX3Form<60, 41,
(outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
"xsmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
}
@@ -418,13 +418,13 @@ let hasSideEffects = 0 in {
(outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
"xsmsubadp $XT, $XA, $XB", IIC_VecFP,
[(set f64:$XT, (any_fma f64:$XA, f64:$XB, (fneg f64:$XTi)))]>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XSMSUBMDP : XX3Form<60, 57,
(outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
"xsmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
}
@@ -434,13 +434,13 @@ let hasSideEffects = 0 in {
(outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
"xsnmaddadp $XT, $XA, $XB", IIC_VecFP,
[(set f64:$XT, (fneg (any_fma f64:$XA, f64:$XB, f64:$XTi)))]>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XSNMADDMDP : XX3Form<60, 169,
(outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
"xsnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
}
@@ -450,13 +450,13 @@ let hasSideEffects = 0 in {
(outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
"xsnmsubadp $XT, $XA, $XB", IIC_VecFP,
[(set f64:$XT, (fneg (any_fma f64:$XA, f64:$XB, (fneg f64:$XTi))))]>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XSNMSUBMDP : XX3Form<60, 185,
(outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
"xsnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
}
@@ -466,13 +466,13 @@ let hasSideEffects = 0 in {
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvmaddadp $XT, $XA, $XB", IIC_VecFP,
[(set v2f64:$XT, (any_fma v2f64:$XA, v2f64:$XB, v2f64:$XTi))]>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XVMADDMDP : XX3Form<60, 105,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
}
@@ -482,13 +482,13 @@ let hasSideEffects = 0 in {
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvmaddasp $XT, $XA, $XB", IIC_VecFP,
[(set v4f32:$XT, (any_fma v4f32:$XA, v4f32:$XB, v4f32:$XTi))]>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XVMADDMSP : XX3Form<60, 73,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
}
@@ -498,13 +498,13 @@ let hasSideEffects = 0 in {
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvmsubadp $XT, $XA, $XB", IIC_VecFP,
[(set v2f64:$XT, (any_fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi)))]>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XVMSUBMDP : XX3Form<60, 121,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
}
@@ -514,13 +514,13 @@ let hasSideEffects = 0 in {
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvmsubasp $XT, $XA, $XB", IIC_VecFP,
[(set v4f32:$XT, (any_fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi)))]>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XVMSUBMSP : XX3Form<60, 89,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
}
@@ -530,13 +530,13 @@ let hasSideEffects = 0 in {
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvnmaddadp $XT, $XA, $XB", IIC_VecFP,
[(set v2f64:$XT, (fneg (any_fma v2f64:$XA, v2f64:$XB, v2f64:$XTi)))]>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XVNMADDMDP : XX3Form<60, 233,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
}
@@ -546,13 +546,13 @@ let hasSideEffects = 0 in {
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvnmaddasp $XT, $XA, $XB", IIC_VecFP,
[(set v4f32:$XT, (fneg (fma v4f32:$XA, v4f32:$XB, v4f32:$XTi)))]>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XVNMADDMSP : XX3Form<60, 201,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvnmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
}
@@ -562,13 +562,13 @@ let hasSideEffects = 0 in {
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvnmsubadp $XT, $XA, $XB", IIC_VecFP,
[(set v2f64:$XT, (fneg (any_fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi))))]>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XVNMSUBMDP : XX3Form<60, 249,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
}
@@ -578,13 +578,13 @@ let hasSideEffects = 0 in {
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvnmsubasp $XT, $XA, $XB", IIC_VecFP,
[(set v4f32:$XT, (fneg (any_fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi))))]>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
def XVNMSUBMSP : XX3Form<60, 217,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvnmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
}
@@ -1199,7 +1199,7 @@ let Predicates = [HasVSX, HasP8Vector] in {
(ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
"xsmaddasp $XT, $XA, $XB", IIC_VecFP,
[(set f32:$XT, (any_fma f32:$XA, f32:$XB, f32:$XTi))]>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
// FIXME: Setting the hasSideEffects flag here to match current behaviour.
let IsVSXFMAAlt = 1, hasSideEffects = 1 in
@@ -1207,7 +1207,7 @@ let Predicates = [HasVSX, HasP8Vector] in {
(outs vssrc:$XT),
(ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
"xsmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
}
@@ -1219,7 +1219,7 @@ let Predicates = [HasVSX, HasP8Vector] in {
"xsmsubasp $XT, $XA, $XB", IIC_VecFP,
[(set f32:$XT, (any_fma f32:$XA, f32:$XB,
(fneg f32:$XTi)))]>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
// FIXME: Setting the hasSideEffects flag here to match current behaviour.
let IsVSXFMAAlt = 1, hasSideEffects = 1 in
@@ -1227,7 +1227,7 @@ let Predicates = [HasVSX, HasP8Vector] in {
(outs vssrc:$XT),
(ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
"xsmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
}
@@ -1239,7 +1239,7 @@ let Predicates = [HasVSX, HasP8Vector] in {
"xsnmaddasp $XT, $XA, $XB", IIC_VecFP,
[(set f32:$XT, (fneg (any_fma f32:$XA, f32:$XB,
f32:$XTi)))]>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
// FIXME: Setting the hasSideEffects flag here to match current behaviour.
let IsVSXFMAAlt = 1, hasSideEffects = 1 in
@@ -1247,7 +1247,7 @@ let Predicates = [HasVSX, HasP8Vector] in {
(outs vssrc:$XT),
(ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
"xsnmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
}
@@ -1259,7 +1259,7 @@ let Predicates = [HasVSX, HasP8Vector] in {
"xsnmsubasp $XT, $XA, $XB", IIC_VecFP,
[(set f32:$XT, (fneg (any_fma f32:$XA, f32:$XB,
(fneg f32:$XTi))))]>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
// FIXME: Setting the hasSideEffects flag here to match current behaviour.
let IsVSXFMAAlt = 1, hasSideEffects = 1 in
@@ -1267,7 +1267,7 @@ let Predicates = [HasVSX, HasP8Vector] in {
(outs vssrc:$XT),
(ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
"xsnmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
AltVSXFMARel;
}
@@ -1563,7 +1563,7 @@ let Predicates = [HasVSX, HasP9Vector] in {
"xxinsertw $XT, $XB, $UIM5", IIC_VecFP,
[(set v4i32:$XT, (PPCvecinsert v4i32:$XTi, v4i32:$XB,
imm32SExt16:$UIM5))]>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
+ RegConstraint<"$XTi = $XT">;
// Vector Extract Unsigned Word
// FIXME: Setting the hasSideEffects flag here to match current behaviour.
@@ -1652,11 +1652,11 @@ let Predicates = [HasVSX, HasP9Vector] in {
def XXPERM : XX3Form<60, 26, (outs vsrc:$XT),
(ins vsrc:$XA, vsrc:$XTi, vsrc:$XB),
"xxperm $XT, $XA, $XB", IIC_VecPerm, []>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
+ RegConstraint<"$XTi = $XT">;
def XXPERMR : XX3Form<60, 58, (outs vsrc:$XT),
(ins vsrc:$XA, vsrc:$XTi, vsrc:$XB),
"xxpermr $XT, $XA, $XB", IIC_VecPerm, []>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
+ RegConstraint<"$XTi = $XT">;
// Vector Splat Immediate Byte
def XXSPLTIB : X_RD6_IMM8<60, 360, (outs vsrc:$XT), (ins u8imm:$IMM8),
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index 996b6efb320d..736ba1edcaea 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -52,12 +52,11 @@ PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU,
return *this;
}
-PPCSubtarget::PPCSubtarget(const Triple &TT, const std::string &CPU,
- const std::string &TuneCPU, const std::string &FS,
- const PPCTargetMachine &TM)
- : PPCGenSubtargetInfo(TT, CPU, TuneCPU, FS), TargetTriple(TT),
- IsPPC64(TargetTriple.getArch() == Triple::ppc64 ||
- TargetTriple.getArch() == Triple::ppc64le),
+PPCSubtarget::PPCSubtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU,
+ StringRef FS, const PPCTargetMachine &TM)
+ : PPCGenSubtargetInfo(TT, CPU, TuneCPU, FS),
+ IsPPC64(getTargetTriple().getArch() == Triple::ppc64 ||
+ getTargetTriple().getArch() == Triple::ppc64le),
TM(TM), FrameLowering(initializeSubtargetDependencies(CPU, TuneCPU, FS)),
InstrInfo(*this), TLInfo(TM, *this) {
TSInfo = std::make_unique<PPCSelectionDAGInfo>();
@@ -87,10 +86,10 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
// Determine default and user specified characteristics
std::string CPUName = std::string(CPU);
if (CPUName.empty() || CPU == "generic") {
- if (TargetTriple.getSubArch() == Triple::PPCSubArch_spe)
+ if (getTargetTriple().getSubArch() == Triple::PPCSubArch_spe)
CPUName = "e500";
else
- CPUName = std::string(PPC::getNormalizedPPCTargetCPU(TargetTriple));
+ CPUName = std::string(PPC::getNormalizedPPCTargetCPU(getTargetTriple()));
}
// Determine the CPU to schedule for.
@@ -107,7 +106,7 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
if (IsPPC64 && has64BitSupport())
Use64BitRegs = true;
- if (TargetTriple.isPPC32SecurePlt())
+ if (getTargetTriple().isPPC32SecurePlt())
IsSecurePlt = true;
if (HasSPE && IsPPC64)
@@ -126,7 +125,7 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
IsLittleEndian = TM.isLittleEndian();
if (HasAIXSmallLocalExecTLS || HasAIXSmallLocalDynamicTLS) {
- if (!TargetTriple.isOSAIX() || !IsPPC64)
+ if (!getTargetTriple().isOSAIX() || !IsPPC64)
report_fatal_error("The aix-small-local-[exec|dynamic]-tls attribute is "
"only supported on AIX in "
"64-bit mode.\n",
@@ -143,7 +142,7 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
false);
}
- if (HasAIXShLibTLSModelOpt && (!TargetTriple.isOSAIX() || !IsPPC64))
+ if (HasAIXShLibTLSModelOpt && (!getTargetTriple().isOSAIX() || !IsPPC64))
report_fatal_error("The aix-shared-lib-tls-model-opt attribute "
"is only supported on AIX in 64-bit mode.\n",
false);
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
index 3c59a475c7eb..c17fca7f70a3 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -78,9 +78,6 @@ public:
};
protected:
- /// TargetTriple - What processor and OS we're targeting.
- Triple TargetTriple;
-
/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
Align StackAlignment;
@@ -119,8 +116,7 @@ public:
/// This constructor initializes the data members to match that
/// of the specified triple.
///
- PPCSubtarget(const Triple &TT, const std::string &CPU,
- const std::string &TuneCPU, const std::string &FS,
+ PPCSubtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS,
const PPCTargetMachine &TM);
~PPCSubtarget() override;
@@ -210,13 +206,11 @@ public:
POPCNTDKind hasPOPCNTD() const { return HasPOPCNTD; }
- const Triple &getTargetTriple() const { return TargetTriple; }
-
- bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
- bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
- bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+ bool isTargetELF() const { return getTargetTriple().isOSBinFormatELF(); }
+ bool isTargetMachO() const { return getTargetTriple().isOSBinFormatMachO(); }
+ bool isTargetLinux() const { return getTargetTriple().isOSLinux(); }
- bool isAIXABI() const { return TargetTriple.isOSAIX(); }
+ bool isAIXABI() const { return getTargetTriple().isOSAIX(); }
bool isSVR4ABI() const { return !isAIXABI(); }
bool isELFv2ABI() const;
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index b5c6ac111dff..ae92d5eab20c 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -129,7 +129,7 @@ LLVMInitializePowerPCTarget() {
initializePPCLoopInstrFormPrepPass(PR);
initializePPCTOCRegDepsPass(PR);
initializePPCEarlyReturnPass(PR);
- initializePPCVSXCopyPass(PR);
+ initializePPCVSXWACCCopyPass(PR);
initializePPCVSXFMAMutatePass(PR);
initializePPCVSXSwapRemovalPass(PR);
initializePPCReduceCRLogicalsPass(PR);
@@ -528,7 +528,7 @@ bool PPCPassConfig::addInstSelector() {
addPass(createPPCCTRLoopsVerify());
#endif
- addPass(createPPCVSXCopyPass());
+ addPass(createPPCVSXWACCCopyPass());
return false;
}
diff --git a/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp b/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp
deleted file mode 100644
index 794095cd4376..000000000000
--- a/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-//===-------------- PPCVSXCopy.cpp - VSX Copy Legalization ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// A pass which deals with the complexity of generating legal VSX register
-// copies to/from register classes which partially overlap with the VSX
-// register file.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PPC.h"
-#include "PPCInstrInfo.h"
-#include "PPCTargetMachine.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/ErrorHandling.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "ppc-vsx-copy"
-
-namespace {
- // PPCVSXCopy pass - For copies between VSX registers and non-VSX registers
- // (Altivec and scalar floating-point registers), we need to transform the
- // copies into subregister copies with other restrictions.
- struct PPCVSXCopy : public MachineFunctionPass {
- static char ID;
- PPCVSXCopy() : MachineFunctionPass(ID) {}
-
- const TargetInstrInfo *TII;
-
- bool IsRegInClass(unsigned Reg, const TargetRegisterClass *RC,
- MachineRegisterInfo &MRI) {
- if (Register::isVirtualRegister(Reg)) {
- return RC->hasSubClassEq(MRI.getRegClass(Reg));
- } else if (RC->contains(Reg)) {
- return true;
- }
-
- return false;
- }
-
- bool IsVSReg(unsigned Reg, MachineRegisterInfo &MRI) {
- return IsRegInClass(Reg, &PPC::VSRCRegClass, MRI);
- }
-
- bool IsVRReg(unsigned Reg, MachineRegisterInfo &MRI) {
- return IsRegInClass(Reg, &PPC::VRRCRegClass, MRI);
- }
-
- bool IsF8Reg(unsigned Reg, MachineRegisterInfo &MRI) {
- return IsRegInClass(Reg, &PPC::F8RCRegClass, MRI);
- }
-
- bool IsVSFReg(unsigned Reg, MachineRegisterInfo &MRI) {
- return IsRegInClass(Reg, &PPC::VSFRCRegClass, MRI);
- }
-
- bool IsVSSReg(unsigned Reg, MachineRegisterInfo &MRI) {
- return IsRegInClass(Reg, &PPC::VSSRCRegClass, MRI);
- }
-
-protected:
- bool processBlock(MachineBasicBlock &MBB) {
- bool Changed = false;
-
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- for (MachineInstr &MI : MBB) {
- if (!MI.isFullCopy())
- continue;
-
- MachineOperand &DstMO = MI.getOperand(0);
- MachineOperand &SrcMO = MI.getOperand(1);
-
- if ( IsVSReg(DstMO.getReg(), MRI) &&
- !IsVSReg(SrcMO.getReg(), MRI)) {
- // This is a copy *to* a VSX register from a non-VSX register.
- Changed = true;
-
- const TargetRegisterClass *SrcRC = &PPC::VSLRCRegClass;
- assert((IsF8Reg(SrcMO.getReg(), MRI) ||
- IsVSSReg(SrcMO.getReg(), MRI) ||
- IsVSFReg(SrcMO.getReg(), MRI)) &&
- "Unknown source for a VSX copy");
-
- Register NewVReg = MRI.createVirtualRegister(SrcRC);
- BuildMI(MBB, MI, MI.getDebugLoc(),
- TII->get(TargetOpcode::SUBREG_TO_REG), NewVReg)
- .addImm(1) // add 1, not 0, because there is no implicit clearing
- // of the high bits.
- .add(SrcMO)
- .addImm(PPC::sub_64);
-
- // The source of the original copy is now the new virtual register.
- SrcMO.setReg(NewVReg);
- } else if (!IsVSReg(DstMO.getReg(), MRI) &&
- IsVSReg(SrcMO.getReg(), MRI)) {
- // This is a copy *from* a VSX register to a non-VSX register.
- Changed = true;
-
- const TargetRegisterClass *DstRC = &PPC::VSLRCRegClass;
- assert((IsF8Reg(DstMO.getReg(), MRI) ||
- IsVSFReg(DstMO.getReg(), MRI) ||
- IsVSSReg(DstMO.getReg(), MRI)) &&
- "Unknown destination for a VSX copy");
-
- // Copy the VSX value into a new VSX register of the correct subclass.
- Register NewVReg = MRI.createVirtualRegister(DstRC);
- BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
- NewVReg)
- .add(SrcMO);
-
- // Transform the original copy into a subregister extraction copy.
- SrcMO.setReg(NewVReg);
- SrcMO.setSubReg(PPC::sub_64);
- }
- }
-
- return Changed;
- }
-
-public:
- bool runOnMachineFunction(MachineFunction &MF) override {
- // If we don't have VSX on the subtarget, don't do anything.
- const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
- if (!STI.hasVSX())
- return false;
- TII = STI.getInstrInfo();
-
- bool Changed = false;
-
- for (MachineBasicBlock &B : llvm::make_early_inc_range(MF))
- if (processBlock(B))
- Changed = true;
-
- return Changed;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- MachineFunctionPass::getAnalysisUsage(AU);
- }
- };
- } // end anonymous namespace
-
-INITIALIZE_PASS(PPCVSXCopy, DEBUG_TYPE,
- "PowerPC VSX Copy Legalization", false, false)
-
-char PPCVSXCopy::ID = 0;
-FunctionPass*
-llvm::createPPCVSXCopyPass() { return new PPCVSXCopy(); }
diff --git a/llvm/lib/Target/PowerPC/PPCVSXWACCCopy.cpp b/llvm/lib/Target/PowerPC/PPCVSXWACCCopy.cpp
new file mode 100644
index 000000000000..2ec566ddb0b8
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/PPCVSXWACCCopy.cpp
@@ -0,0 +1,182 @@
+//===--------- PPCVSXWACCCopy.cpp - VSX and WACC Copy Legalization --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A pass which deals with the complexity of generating legal VSX register
+// copies to/from register classes which partially overlap with the VSX
+// register file and combines the wacc/wacc_hi copies when needed.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-vsx-copy"
+
+namespace {
+// PPCVSXWACCCopy pass - Copies between VSX registers and non-VSX registers
+// (Altivec and scalar floating-point registers) are rewritten into subregister
+// copies; wacc-to-wacc_hi copies fed by a DMR subregister copy are combined.
+struct PPCVSXWACCCopy : public MachineFunctionPass {
+ static char ID;
+ PPCVSXWACCCopy() : MachineFunctionPass(ID) {}
+
+ const TargetInstrInfo *TII; // Set in runOnMachineFunction.
+
+ bool IsRegInClass(unsigned Reg, const TargetRegisterClass *RC,
+ MachineRegisterInfo &MRI) {
+ if (Register::isVirtualRegister(Reg)) {
+ return RC->hasSubClassEq(MRI.getRegClass(Reg));
+ } else if (RC->contains(Reg)) {
+ return true;
+ }
+
+ return false;
+ }
+
+ bool IsVSReg(unsigned Reg, MachineRegisterInfo &MRI) {
+ return IsRegInClass(Reg, &PPC::VSRCRegClass, MRI);
+ }
+
+ bool IsVRReg(unsigned Reg, MachineRegisterInfo &MRI) {
+ return IsRegInClass(Reg, &PPC::VRRCRegClass, MRI);
+ }
+
+ bool IsF8Reg(unsigned Reg, MachineRegisterInfo &MRI) {
+ return IsRegInClass(Reg, &PPC::F8RCRegClass, MRI);
+ }
+
+ bool IsVSFReg(unsigned Reg, MachineRegisterInfo &MRI) {
+ return IsRegInClass(Reg, &PPC::VSFRCRegClass, MRI);
+ }
+
+ bool IsVSSReg(unsigned Reg, MachineRegisterInfo &MRI) {
+ return IsRegInClass(Reg, &PPC::VSSRCRegClass, MRI);
+ }
+
+protected:
+ bool processBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ for (MachineInstr &MI : MBB) {
+ if (!MI.isFullCopy())
+ continue;
+
+ MachineOperand &DstMO = MI.getOperand(0);
+ MachineOperand &SrcMO = MI.getOperand(1);
+
+ if (IsVSReg(DstMO.getReg(), MRI) && !IsVSReg(SrcMO.getReg(), MRI)) {
+ // This is a copy *to* a VSX register from a non-VSX register.
+ Changed = true;
+
+ const TargetRegisterClass *SrcRC = &PPC::VSLRCRegClass;
+ assert((IsF8Reg(SrcMO.getReg(), MRI) || IsVSSReg(SrcMO.getReg(), MRI) ||
+ IsVSFReg(SrcMO.getReg(), MRI)) &&
+ "Unknown source for a VSX copy");
+
+ Register NewVReg = MRI.createVirtualRegister(SrcRC);
+ BuildMI(MBB, MI, MI.getDebugLoc(),
+ TII->get(TargetOpcode::SUBREG_TO_REG), NewVReg)
+ .addImm(1) // add 1, not 0, because there is no implicit clearing
+ // of the high bits.
+ .add(SrcMO)
+ .addImm(PPC::sub_64);
+
+ // The source of the original copy is now the new virtual register.
+ SrcMO.setReg(NewVReg);
+ } else if (!IsVSReg(DstMO.getReg(), MRI) &&
+ IsVSReg(SrcMO.getReg(), MRI)) {
+ // This is a copy *from* a VSX register to a non-VSX register.
+ Changed = true;
+
+ const TargetRegisterClass *DstRC = &PPC::VSLRCRegClass;
+ assert((IsF8Reg(DstMO.getReg(), MRI) || IsVSFReg(DstMO.getReg(), MRI) ||
+ IsVSSReg(DstMO.getReg(), MRI)) &&
+ "Unknown destination for a VSX copy");
+
+ // Copy the VSX value into a new VSX register of the correct subclass.
+ Register NewVReg = MRI.createVirtualRegister(DstRC);
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
+ NewVReg)
+ .add(SrcMO);
+
+ // Transform the original copy into a subregister extraction copy.
+ SrcMO.setReg(NewVReg);
+ SrcMO.setSubReg(PPC::sub_64);
+ } else if (IsRegInClass(DstMO.getReg(), &PPC::WACC_HIRCRegClass, MRI) &&
+ IsRegInClass(SrcMO.getReg(), &PPC::WACCRCRegClass, MRI)) {
+ // Matches the pattern:
+ // %a:waccrc = COPY %b.sub_wacc_hi:dmrrc
+ // %c:wacc_hirc = COPY %a:waccrc
+ // And replaces it with:
+ // %c:wacc_hirc = COPY %b.sub_wacc_hi:dmrrc
+ MachineInstr *DefMI = MRI.getUniqueVRegDef(SrcMO.getReg());
+ if (!DefMI || !DefMI->isCopy()) // Need a unique defining COPY.
+ continue;
+
+ MachineOperand &OrigSrc = DefMI->getOperand(1);
+
+ if (!IsRegInClass(OrigSrc.getReg(), &PPC::DMRRCRegClass, MRI))
+ continue;
+
+ if (OrigSrc.getSubReg() != PPC::sub_wacc_hi)
+ continue;
+
+ // Rewrite the second copy to use the original register's subreg.
+ SrcMO.setReg(OrigSrc.getReg());
+ SrcMO.setSubReg(PPC::sub_wacc_hi);
+ Changed = true;
+
+ // Erase the intermediate copy once it has no non-debug uses left.
+ if (MRI.use_nodbg_empty(DefMI->getOperand(0).getReg()))
+ DefMI->eraseFromParent();
+ }
+ }
+
+ return Changed;
+ }
+
+public:
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ // If we don't have VSX on the subtarget, don't do anything.
+ const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
+ if (!STI.hasVSX())
+ return false;
+ TII = STI.getInstrInfo();
+
+ bool Changed = false;
+
+ for (MachineBasicBlock &B : llvm::make_early_inc_range(MF))
+ if (processBlock(B))
+ Changed = true;
+
+ return Changed;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+} // end anonymous namespace
+
+INITIALIZE_PASS(PPCVSXWACCCopy, DEBUG_TYPE, "PowerPC VSX Copy Legalization",
+ false, false)
+
+char PPCVSXWACCCopy::ID = 0;
+FunctionPass *llvm::createPPCVSXWACCCopyPass() { return new PPCVSXWACCCopy(); }
diff --git a/llvm/lib/Target/PowerPC/README_P9.txt b/llvm/lib/Target/PowerPC/README_P9.txt
index ee1ea735acad..208c8abfdc5f 100644
--- a/llvm/lib/Target/PowerPC/README_P9.txt
+++ b/llvm/lib/Target/PowerPC/README_P9.txt
@@ -224,22 +224,22 @@ VSX:
. isCommutable = 1
// xsmaddqp
[(set f128:$vT, (fma f128:$vA, f128:$vB, f128:$vTi))]>,
- RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ RegConstraint<"$vTi = $vT">,
AltVSXFMARel;
// xsmsubqp
[(set f128:$vT, (fma f128:$vA, f128:$vB, (fneg f128:$vTi)))]>,
- RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ RegConstraint<"$vTi = $vT">,
AltVSXFMARel;
// xsnmaddqp
[(set f128:$vT, (fneg (fma f128:$vA, f128:$vB, f128:$vTi)))]>,
- RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ RegConstraint<"$vTi = $vT">,
AltVSXFMARel;
// xsnmsubqp
[(set f128:$vT, (fneg (fma f128:$vA, f128:$vB, (fneg f128:$vTi))))]>,
- RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ RegConstraint<"$vTi = $vT">,
AltVSXFMARel;
- Round to Odd of QP (Negative) Multiply-{Add/Subtract}:
@@ -276,22 +276,22 @@ VSX:
. isCommutable = 1
// xsmaddqpo
[(set f128:$vT, (PPCfmarto f128:$vA, f128:$vB, f128:$vTi))]>,
- RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ RegConstraint<"$vTi = $vT">,
AltVSXFMARel;
// xsmsubqpo
[(set f128:$vT, (PPCfmarto f128:$vA, f128:$vB, (fneg f128:$vTi)))]>,
- RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ RegConstraint<"$vTi = $vT">,
AltVSXFMARel;
// xsnmaddqpo
[(set f128:$vT, (fneg (PPCfmarto f128:$vA, f128:$vB, f128:$vTi)))]>,
- RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ RegConstraint<"$vTi = $vT">,
AltVSXFMARel;
// xsnmsubqpo
[(set f128:$vT, (fneg (PPCfmarto f128:$vA, f128:$vB, (fneg f128:$vTi))))]>,
- RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+ RegConstraint<"$vTi = $vT">,
AltVSXFMARel;
- QP Compare Ordered/Unordered: xscmpoqp xscmpuqp
@@ -405,7 +405,7 @@ Fixed Point Facility:
But how to map to it??
[(set v1f128:$XT, (insertelement v1f128:$XTi, f128:$XB, i4:$UIMM))]>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ RegConstraint<"$XTi = $XT">,
. Or use intrinsic?
(set v1f128:$XT, (int_ppc_vsx_xxinsertw v1f128:$XTi, f128:$XB, i4:$UIMM))
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 9ce44d0ff7fd..cd8392849ac4 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -121,7 +121,7 @@ class RISCVAsmParser : public MCTargetAsmParser {
bool parseVTypeToken(const AsmToken &Tok, VTypeState &State, unsigned &Sew,
unsigned &Lmul, bool &Fractional, bool &TailAgnostic,
- bool &MaskAgnostic);
+ bool &MaskAgnostic, bool &AltFmt);
bool generateVTypeError(SMLoc ErrorLoc);
bool generateXSfmmVTypeError(SMLoc ErrorLoc);
@@ -2261,14 +2261,23 @@ ParseStatus RISCVAsmParser::parseJALOffset(OperandVector &Operands) {
bool RISCVAsmParser::parseVTypeToken(const AsmToken &Tok, VTypeState &State,
unsigned &Sew, unsigned &Lmul,
bool &Fractional, bool &TailAgnostic,
- bool &MaskAgnostic) {
+ bool &MaskAgnostic, bool &AltFmt) {
if (Tok.isNot(AsmToken::Identifier))
return true;
StringRef Identifier = Tok.getIdentifier();
if (State < VTypeState::SeenSew && Identifier.consume_front("e")) {
- if (Identifier.getAsInteger(10, Sew))
- return true;
+ if (Identifier.getAsInteger(10, Sew)) {
+ if (Identifier == "16alt") {
+ AltFmt = true;
+ Sew = 16;
+ } else if (Identifier == "8alt") {
+ AltFmt = true;
+ Sew = 8;
+ } else {
+ return true;
+ }
+ }
if (!RISCVVType::isValidSEW(Sew))
return true;
@@ -2340,11 +2349,12 @@ ParseStatus RISCVAsmParser::parseVTypeI(OperandVector &Operands) {
bool Fractional = false;
bool TailAgnostic = false;
bool MaskAgnostic = false;
+ bool AltFmt = false;
VTypeState State = VTypeState::SeenNothingYet;
do {
if (parseVTypeToken(getTok(), State, Sew, Lmul, Fractional, TailAgnostic,
- MaskAgnostic)) {
+ MaskAgnostic, AltFmt)) {
// The first time, errors return NoMatch rather than Failure
if (State == VTypeState::SeenNothingYet)
return ParseStatus::NoMatch;
@@ -2370,12 +2380,17 @@ ParseStatus RISCVAsmParser::parseVTypeI(OperandVector &Operands) {
}
unsigned VTypeI =
- RISCVVType::encodeVTYPE(VLMUL, Sew, TailAgnostic, MaskAgnostic);
+ RISCVVType::encodeVTYPE(VLMUL, Sew, TailAgnostic, MaskAgnostic, AltFmt);
Operands.push_back(RISCVOperand::createVType(VTypeI, S));
return ParseStatus::Success;
}
bool RISCVAsmParser::generateVTypeError(SMLoc ErrorLoc) {
+ if (STI->hasFeature(RISCV::FeatureStdExtZvfbfa))
+ return Error(
+ ErrorLoc,
+ "operand must be "
+ "e[8|8alt|16|16alt|32|64],m[1|2|4|8|f2|f4|f8],[ta|tu],[ma|mu]");
return Error(
ErrorLoc,
"operand must be "
diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index 47329b2c2f4d..0ff178e1f195 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -7,7 +7,8 @@ tablegen(LLVM RISCVGenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM RISCVGenCompressInstEmitter.inc -gen-compress-inst-emitter)
tablegen(LLVM RISCVGenMacroFusion.inc -gen-macro-fusion-pred)
tablegen(LLVM RISCVGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM RISCVGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM RISCVGenDisassemblerTables.inc -gen-disassembler
+ --specialize-decoders-per-bitwidth)
tablegen(LLVM RISCVGenInstrInfo.inc -gen-instr-info)
tablegen(LLVM RISCVGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM RISCVGenMCPseudoLowering.inc -gen-pseudo-lowering)
@@ -87,6 +88,7 @@ add_llvm_target(RISCVCodeGen
GlobalISel
IPO
MC
+ Passes
RISCVDesc
RISCVInfo
Scalar
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index dbb16fce8390..89df9d82f878 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -46,8 +46,6 @@ public:
raw_ostream &CStream) const override;
private:
- void addSPOperands(MCInst &MI) const;
-
DecodeStatus getInstruction48(MCInst &Instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
raw_ostream &CStream) const;
@@ -196,6 +194,12 @@ static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, uint32_t RegNo,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeSPRegisterClass(MCInst &Inst,
+ const MCDisassembler *Decoder) {
+ Inst.addOperand(MCOperand::createReg(RISCV::X2));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeGPRNoX0RegisterClass(MCInst &Inst, uint32_t RegNo,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -558,7 +562,7 @@ static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm,
return decodeZcmpRlist(Inst, Imm, Address, Decoder);
}
-static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
+static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint16_t Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
uint32_t Rs1 = fieldFromInstruction(Insn, 7, 5);
@@ -600,15 +604,6 @@ static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
#include "RISCVGenDisassemblerTables.inc"
-// Add implied SP operand for C.*SP compressed instructions. The SP operand
-// isn't explicitly encoded in the instruction.
-void RISCVDisassembler::addSPOperands(MCInst &MI) const {
- const MCInstrDesc &MCID = MCII->get(MI.getOpcode());
- for (unsigned i = 0; i < MCID.getNumOperands(); i++)
- if (MCID.operands()[i].RegClass == RISCV::SPRegClassID)
- MI.insert(MI.begin() + i, MCOperand::createReg(RISCV::X2));
-}
-
namespace {
struct DecoderListEntry {
@@ -656,6 +651,13 @@ static constexpr FeatureBitset XSfSystemGroup = {
RISCV::FeatureVendorXSiFivecflushdlone,
};
+static constexpr FeatureBitset XMIPSGroup = {
+ RISCV::FeatureVendorXMIPSLSP,
+ RISCV::FeatureVendorXMIPSCMov,
+ RISCV::FeatureVendorXMIPSCBOP,
+ RISCV::FeatureVendorXMIPSEXECTL,
+};
+
static constexpr FeatureBitset XTHeadGroup = {
RISCV::FeatureVendorXTHeadBa, RISCV::FeatureVendorXTHeadBb,
RISCV::FeatureVendorXTHeadBs, RISCV::FeatureVendorXTHeadCondMov,
@@ -684,13 +686,7 @@ static constexpr DecoderListEntry DecoderList32[]{
{DecoderTableXSfvector32, XSfVectorGroup, "SiFive vector extensions"},
{DecoderTableXSfsystem32, XSfSystemGroup, "SiFive system extensions"},
{DecoderTableXSfcease32, {RISCV::FeatureVendorXSfcease}, "SiFive sf.cease"},
- {DecoderTableXmipslsp32, {RISCV::FeatureVendorXMIPSLSP}, "MIPS mips.lsp"},
- {DecoderTableXmipscmov32,
- {RISCV::FeatureVendorXMIPSCMov},
- "MIPS mips.ccmov"},
- {DecoderTableXmipscbop32,
- {RISCV::FeatureVendorXMIPSCBOP},
- "MIPS mips.pref"},
+ {DecoderTableXMIPS32, XMIPSGroup, "Mips extensions"},
{DecoderTableXAndes32, XAndesGroup, "Andes extensions"},
{DecoderTableXSMT32, XSMTGroup, "SpacemiT extensions"},
// Standard Extensions
@@ -700,6 +696,14 @@ static constexpr DecoderListEntry DecoderList32[]{
{DecoderTableZdinxRV32Only32, {}, "RV32-only Zdinx (Double in Integer)"},
};
+namespace {
+// Define bitwidths for various types used to instantiate the decoder.
+template <> constexpr uint32_t InsnBitWidth<uint16_t> = 16;
+template <> constexpr uint32_t InsnBitWidth<uint32_t> = 32;
+// Use uint64_t to represent 48 bit instructions.
+template <> constexpr uint32_t InsnBitWidth<uint64_t> = 48;
+} // namespace
+
DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size,
ArrayRef<uint8_t> Bytes,
uint64_t Address,
@@ -710,9 +714,7 @@ DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size,
}
Size = 4;
- // Use uint64_t to match getInstruction48. decodeInstruction is templated
- // on the Insn type.
- uint64_t Insn = support::endian::read32le(Bytes.data());
+ uint32_t Insn = support::endian::read32le(Bytes.data());
for (const DecoderListEntry &Entry : DecoderList32) {
if (!Entry.haveContainedFeatures(STI.getFeatureBits()))
@@ -758,9 +760,7 @@ DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size,
}
Size = 2;
- // Use uint64_t to match getInstruction48. decodeInstruction is templated
- // on the Insn type.
- uint64_t Insn = support::endian::read16le(Bytes.data());
+ uint16_t Insn = support::endian::read16le(Bytes.data());
for (const DecoderListEntry &Entry : DecoderList16) {
if (!Entry.haveContainedFeatures(STI.getFeatureBits()))
@@ -769,12 +769,8 @@ DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size,
LLVM_DEBUG(dbgs() << "Trying " << Entry.Desc << " table:\n");
DecodeStatus Result =
decodeInstruction(Entry.Table, MI, Insn, Address, this, STI);
- if (Result == MCDisassembler::Fail)
- continue;
-
- addSPOperands(MI);
-
- return Result;
+ if (Result != MCDisassembler::Fail)
+ return Result;
}
return MCDisassembler::Fail;
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
index 51ea3fc5f677..7df1b7e58000 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
@@ -1158,8 +1158,8 @@ bool RISCVInstructionSelector::selectAddr(MachineInstr &MI,
switch (TM.getCodeModel()) {
default: {
- reportGISelFailure(const_cast<MachineFunction &>(*MF), *TPC, *MORE,
- getName(), "Unsupported code model for lowering", MI);
+ reportGISelFailure(*MF, *TPC, *MORE, getName(),
+ "Unsupported code model for lowering", MI);
return false;
}
case CodeModel::Small: {
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index e88f33d6859e..564657ac65fd 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -26,6 +26,8 @@
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
#include "llvm/IR/Type.h"
using namespace llvm;
@@ -152,7 +154,8 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
getActionDefinitionsBuilder({G_SADDO, G_SSUBO}).minScalar(0, sXLen).lower();
// TODO: Use Vector Single-Width Saturating Instructions for vector types.
- getActionDefinitionsBuilder({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT})
+ getActionDefinitionsBuilder(
+ {G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT, G_SSHLSAT, G_USHLSAT})
.lower();
getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
@@ -485,6 +488,10 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
.minScalar(ST.hasStdExtZbb(), 0, sXLen)
.lower();
+ getActionDefinitionsBuilder({G_ABDS, G_ABDU})
+ .minScalar(ST.hasStdExtZbb(), 0, sXLen)
+ .lower();
+
getActionDefinitionsBuilder({G_UMAX, G_UMIN, G_SMAX, G_SMIN})
.legalFor(ST.hasStdExtZbb(), {sXLen})
.minScalar(ST.hasStdExtZbb(), 0, sXLen)
@@ -692,6 +699,16 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
.customIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
typeIsLegalIntOrFPVec(1, IntOrFPVecTys, ST)));
+ getActionDefinitionsBuilder(G_ATOMICRMW_ADD)
+ .legalFor(ST.hasStdExtA(), {{sXLen, p0}})
+ .libcallFor(!ST.hasStdExtA(), {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}})
+ .clampScalar(0, sXLen, sXLen);
+
+ getActionDefinitionsBuilder(G_ATOMICRMW_SUB)
+ .libcallFor(!ST.hasStdExtA(), {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}})
+ .clampScalar(0, sXLen, sXLen)
+ .lower();
+
getLegacyLegalizerInfo().computeTables();
verify(*ST.getInstrInfo());
}
@@ -729,6 +746,9 @@ bool RISCVLegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MI.eraseFromParent();
return true;
}
+ case Intrinsic::riscv_masked_atomicrmw_add:
+ case Intrinsic::riscv_masked_atomicrmw_sub:
+ return true;
}
}
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
index 543c4c5ddfc9..37fe32531800 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
@@ -36,6 +36,12 @@ RISCVTargetELFStreamer::RISCVTargetELFStreamer(MCStreamer &S,
setFlagsFromFeatures(STI);
}
+RISCVELFStreamer::RISCVELFStreamer(MCContext &C,
+ std::unique_ptr<MCAsmBackend> MAB,
+ std::unique_ptr<MCObjectWriter> MOW,
+ std::unique_ptr<MCCodeEmitter> MCE)
+ : MCELFStreamer(C, std::move(MAB), std::move(MOW), std::move(MCE)) {}
+
RISCVELFStreamer &RISCVTargetELFStreamer::getStreamer() {
return static_cast<RISCVELFStreamer &>(Streamer);
}
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
index 98948cd3e949..26da2441d4ae 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
@@ -28,8 +28,7 @@ class RISCVELFStreamer : public MCELFStreamer {
public:
RISCVELFStreamer(MCContext &C, std::unique_ptr<MCAsmBackend> MAB,
std::unique_ptr<MCObjectWriter> MOW,
- std::unique_ptr<MCCodeEmitter> MCE)
- : MCELFStreamer(C, std::move(MAB), std::move(MOW), std::move(MCE)) {}
+ std::unique_ptr<MCCodeEmitter> MCE);
void changeSection(MCSection *Section, uint32_t Subsection) override;
void emitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
index b0c27ce6010f..50f5a5d09a69 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
@@ -216,9 +216,12 @@ void RISCVInstPrinter::printVTypeI(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
unsigned Imm = MI->getOperand(OpNo).getImm();
// Print the raw immediate for reserved values: vlmul[2:0]=4, vsew[2:0]=0b1xx,
- // or non-zero in bits 8 and above.
+ // altfmt=1 without zvfbfa extension, or non-zero in bits 9 and above.
if (RISCVVType::getVLMUL(Imm) == RISCVVType::VLMUL::LMUL_RESERVED ||
- RISCVVType::getSEW(Imm) > 64 || (Imm >> 8) != 0) {
+ RISCVVType::getSEW(Imm) > 64 ||
+ (RISCVVType::isAltFmt(Imm) &&
+ !STI.hasFeature(RISCV::FeatureStdExtZvfbfa)) ||
+ (Imm >> 9) != 0) {
O << formatImm(Imm);
return;
}
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 83566b1c5778..66ca43604670 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -126,7 +126,7 @@ private:
void LowerPATCHABLE_TAIL_CALL(const MachineInstr *MI);
void emitSled(const MachineInstr *MI, SledKind Kind);
- bool lowerToMCInst(const MachineInstr *MI, MCInst &OutMI);
+ void lowerToMCInst(const MachineInstr *MI, MCInst &OutMI);
};
}
@@ -329,12 +329,17 @@ void RISCVAsmPrinter::emitInstruction(const MachineInstr *MI) {
case TargetOpcode::STATEPOINT:
return LowerSTATEPOINT(*OutStreamer, SM, *MI);
case TargetOpcode::PATCHABLE_FUNCTION_ENTER: {
- // patchable-function-entry is handled in lowerToMCInst
- // Therefore, we break out of the switch statement if we encounter it here.
const Function &F = MI->getParent()->getParent()->getFunction();
- if (F.hasFnAttribute("patchable-function-entry"))
- break;
-
+ if (F.hasFnAttribute("patchable-function-entry")) {
+ unsigned Num;
+ [[maybe_unused]] bool Result =
+ F.getFnAttribute("patchable-function-entry")
+ .getValueAsString()
+ .getAsInteger(10, Num);
+ assert(!Result && "Enforced by the verifier");
+ emitNops(Num);
+ return;
+ }
LowerPATCHABLE_FUNCTION_ENTER(MI);
return;
}
@@ -347,8 +352,8 @@ void RISCVAsmPrinter::emitInstruction(const MachineInstr *MI) {
}
MCInst OutInst;
- if (!lowerToMCInst(MI, OutInst))
- EmitToStreamer(*OutStreamer, OutInst);
+ lowerToMCInst(MI, OutInst);
+ EmitToStreamer(*OutStreamer, OutInst);
}
bool RISCVAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
@@ -1174,9 +1179,9 @@ static bool lowerRISCVVMachineInstrToMCInst(const MachineInstr *MI,
return true;
}
-bool RISCVAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
+void RISCVAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
if (lowerRISCVVMachineInstrToMCInst(MI, OutMI, STI))
- return false;
+ return;
OutMI.setOpcode(MI->getOpcode());
@@ -1185,23 +1190,6 @@ bool RISCVAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
if (lowerOperand(MO, MCOp))
OutMI.addOperand(MCOp);
}
-
- switch (OutMI.getOpcode()) {
- case TargetOpcode::PATCHABLE_FUNCTION_ENTER: {
- const Function &F = MI->getParent()->getParent()->getFunction();
- if (F.hasFnAttribute("patchable-function-entry")) {
- unsigned Num;
- if (F.getFnAttribute("patchable-function-entry")
- .getValueAsString()
- .getAsInteger(10, Num))
- return false;
- emitNops(Num);
- return true;
- }
- break;
- }
- }
- return false;
}
void RISCVAsmPrinter::emitMachineConstantPoolValue(
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 3b738e4cc11a..063963d4ec36 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -680,6 +680,13 @@ def FeatureStdExtV
[FeatureStdExtZvl128b, FeatureStdExtZve64d]>,
RISCVExtensionBitmask<0, 21>;
+def FeatureStdExtZvfbfa
+ : RISCVExperimentalExtension<0, 1, "Additional BF16 vector compute support",
+ [FeatureStdExtZve32f, FeatureStdExtZfbfmin]>;
+def HasStdExtZvfbfa : Predicate<"Subtarget->hasStdExtZvfbfa()">,
+ AssemblerPredicate<(all_of FeatureStdExtZvfbfa),
+ "'Zvfbfa' (Additional BF16 vector compute support)">;
+
def FeatureStdExtZvfbfmin
: RISCVExtension<1, 0, "Vector BF16 Converts", [FeatureStdExtZve32f]>;
def HasStdExtZvfbfmin : Predicate<"Subtarget->hasStdExtZvfbfmin()">,
@@ -1396,20 +1403,27 @@ def HasVendorXMIPSCMov
AssemblerPredicate<(all_of FeatureVendorXMIPSCMov),
"'Xmipscmov' ('mips.ccmov' instruction)">;
def UseCCMovInsn : Predicate<"Subtarget->useCCMovInsn()">;
+
def FeatureVendorXMIPSLSP
: RISCVExtension<1, 0, "MIPS optimization for hardware load-store bonding">;
def HasVendorXMIPSLSP
: Predicate<"Subtarget->hasVendorXMIPSLSP()">,
AssemblerPredicate<(all_of FeatureVendorXMIPSLSP),
"'Xmipslsp' (load and store pair instructions)">;
-def FeatureVendorXMIPSCBOP
- : RISCVExtension<1, 0, "MIPS Software Prefetch">;
+
+def FeatureVendorXMIPSCBOP : RISCVExtension<1, 0, "MIPS Software Prefetch">;
def HasVendorXMIPSCBOP
: Predicate<"Subtarget->hasVendorXMIPSCBOP()">,
AssemblerPredicate<(all_of FeatureVendorXMIPSCBOP),
"'Xmipscbop' (MIPS hardware prefetch)">;
def NoVendorXMIPSCBOP : Predicate<"!Subtarget->hasVendorXMIPSCBOP()">;
+def FeatureVendorXMIPSEXECTL : RISCVExtension<1, 0, "MIPS execution control">;
+def HasVendorXMIPSEXECTL
+ : Predicate<"Subtarget->hasVendorXMIPSEXT()">,
+ AssemblerPredicate<(all_of FeatureVendorXMIPSEXECTL),
+ "'Xmipsexectl' (MIPS execution control)">;
+
// WCH / Nanjing Qinheng Microelectronics Extension(s)
def FeatureVendorXwchc
@@ -1668,7 +1682,7 @@ def IsRV32 : Predicate<"!Subtarget->is64Bit()">,
"RV32I Base Instruction Set">;
defvar RV32 = DefaultMode;
-def RV64 : HwMode<"+64bit", [IsRV64]>;
+def RV64 : HwMode<[IsRV64]>;
def FeatureRelax
: SubtargetFeature<"relax", "EnableLinkerRelax", "true",
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 9fc0d815ceee..06ce91771c9e 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -106,8 +106,14 @@ static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &DL) {
const auto &STI = MF.getSubtarget<RISCVSubtarget>();
+ // We check Zimop instead of (Zimop || Zcmop) to determine whether HW shadow
+ // stack is available despite the fact that sspush/sspopchk both have a
+ // compressed form, because if only Zcmop is available, we would need to
+ // reserve X5 due to c.sspopchk only takes X5 and we currently do not support
+ // using X5 as the return address register.
+ // However, we can still aggressively use c.sspush x1 if zcmop is available.
bool HasHWShadowStack = MF.getFunction().hasFnAttribute("hw-shadow-stack") &&
- STI.hasStdExtZicfiss();
+ STI.hasStdExtZimop();
bool HasSWShadowStack =
MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack);
if (!HasHWShadowStack && !HasSWShadowStack)
@@ -124,7 +130,12 @@ static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB,
const RISCVInstrInfo *TII = STI.getInstrInfo();
if (HasHWShadowStack) {
- BuildMI(MBB, MI, DL, TII->get(RISCV::SSPUSH)).addReg(RAReg);
+ if (STI.hasStdExtZcmop()) {
+ static_assert(RAReg == RISCV::X1, "C.SSPUSH only accepts X1");
+ BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoMOP_C_SSPUSH));
+ } else {
+ BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoMOP_SSPUSH)).addReg(RAReg);
+ }
return;
}
@@ -172,7 +183,7 @@ static void emitSCSEpilogue(MachineFunction &MF, MachineBasicBlock &MBB,
const DebugLoc &DL) {
const auto &STI = MF.getSubtarget<RISCVSubtarget>();
bool HasHWShadowStack = MF.getFunction().hasFnAttribute("hw-shadow-stack") &&
- STI.hasStdExtZicfiss();
+ STI.hasStdExtZimop();
bool HasSWShadowStack =
MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack);
if (!HasHWShadowStack && !HasSWShadowStack)
@@ -186,7 +197,7 @@ static void emitSCSEpilogue(MachineFunction &MF, MachineBasicBlock &MBB,
const RISCVInstrInfo *TII = STI.getInstrInfo();
if (HasHWShadowStack) {
- BuildMI(MBB, MI, DL, TII->get(RISCV::SSPOPCHK)).addReg(RAReg);
+ BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoMOP_SSPOPCHK)).addReg(RAReg);
return;
}
diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
index 80a48c5ec11f..52dc53e4545e 100644
--- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
@@ -561,7 +561,7 @@ bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II) {
EVL = Builder.CreateElementCount(
Builder.getInt32Ty(), cast<VectorType>(DataType)->getElementCount());
- CallInst *Call;
+ Value *Call;
if (!StoreVal) {
Call = Builder.CreateIntrinsic(
@@ -571,8 +571,7 @@ bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II) {
// Merge llvm.masked.gather's passthru
if (II->getIntrinsicID() == Intrinsic::masked_gather)
- Call = Builder.CreateIntrinsic(Intrinsic::vp_select, {DataType},
- {Mask, Call, II->getArgOperand(3), EVL});
+ Call = Builder.CreateSelect(Mask, Call, II->getArgOperand(3));
} else
Call = Builder.CreateIntrinsic(
Intrinsic::experimental_vp_strided_store,
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index f9f35f66319b..c7f15415ebb9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -819,49 +819,6 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInSign(SDNode *Node) {
return false;
}
-// (xor X, (and (xor X, C1), C2))
-// -> (qc.insbi X, (C1 >> ShAmt), Width, ShAmt)
-// where C2 is a shifted mask with width=Width and shift=ShAmt
-bool RISCVDAGToDAGISel::tryBitfieldInsertOpFromXor(SDNode *Node) {
-
- if (!Subtarget->hasVendorXqcibm())
- return false;
-
- using namespace SDPatternMatch;
-
- SDValue X;
- APInt CImm, CMask;
- if (!sd_match(
- Node,
- m_Xor(m_Value(X),
- m_OneUse(m_And(m_OneUse(m_Xor(m_Deferred(X), m_ConstInt(CImm))),
- m_ConstInt(CMask))))))
- return false;
-
- unsigned Width, ShAmt;
- if (!CMask.isShiftedMask(ShAmt, Width))
- return false;
-
- int64_t Imm = CImm.getSExtValue();
- Imm >>= ShAmt;
-
- SDLoc DL(Node);
- SDValue ImmNode;
- auto Opc = RISCV::QC_INSB;
-
- if (isInt<5>(Imm)) {
- Opc = RISCV::QC_INSBI;
- ImmNode = CurDAG->getSignedTargetConstant(Imm, DL, MVT::i32);
- } else {
- ImmNode = selectImm(CurDAG, DL, MVT::i32, Imm, *Subtarget);
- }
- SDValue Ops[] = {X, ImmNode, CurDAG->getTargetConstant(Width, DL, MVT::i32),
- CurDAG->getTargetConstant(ShAmt, DL, MVT::i32)};
- ReplaceNode(Node, CurDAG->getMachineNode(Opc, DL, MVT::i32, Ops));
-
- return true;
-}
-
bool RISCVDAGToDAGISel::tryUnsignedBitfieldExtract(SDNode *Node,
const SDLoc &DL, MVT VT,
SDValue X, unsigned Msb,
@@ -1095,7 +1052,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
SDLoc DL(Node);
MVT VT = Node->getSimpleValueType(0);
- bool HasBitTest = Subtarget->hasStdExtZbs() || Subtarget->hasVendorXTHeadBs();
+ bool HasBitTest = Subtarget->hasBEXTILike();
switch (Opcode) {
case ISD::Constant: {
@@ -1442,9 +1399,6 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
if (tryShrinkShlLogicImm(Node))
return;
- if (tryBitfieldInsertOpFromXor(Node))
- return;
-
break;
case ISD::AND: {
auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1));
@@ -2951,6 +2905,65 @@ static bool isWorthFoldingAdd(SDValue Add) {
return true;
}
+bool isRegImmLoadOrStore(SDNode *User, SDValue Add) {
+ switch (User->getOpcode()) {
+ default:
+ return false;
+ case ISD::LOAD:
+ case RISCVISD::LD_RV32:
+ case ISD::ATOMIC_LOAD:
+ break;
+ case ISD::STORE:
+ // Don't allow stores of Add. It must only be used as the address.
+ if (cast<StoreSDNode>(User)->getValue() == Add)
+ return false;
+ break;
+ case RISCVISD::SD_RV32:
+ // Don't allow stores of Add. It must only be used as the address.
+ if (User->getOperand(0) == Add || User->getOperand(1) == Add)
+ return false;
+ break;
+ case ISD::ATOMIC_STORE:
+ // Don't allow stores of Add. It must only be used as the address.
+ if (cast<AtomicSDNode>(User)->getVal() == Add)
+ return false;
+ break;
+ }
+
+ return true;
+}
+
+// To prevent SelectAddrRegImm from folding offsets that conflict with the
+// fusion of PseudoMovAddr, check if the offset of every use of a given address
+// is within the alignment.
+bool RISCVDAGToDAGISel::areOffsetsWithinAlignment(SDValue Addr,
+ Align Alignment) {
+ assert(Addr->getOpcode() == RISCVISD::ADD_LO);
+ for (auto *User : Addr->users()) {
+ // If the user is a load or store, then the offset is 0 which is always
+ // within alignment.
+ if (isRegImmLoadOrStore(User, Addr))
+ continue;
+
+ if (CurDAG->isBaseWithConstantOffset(SDValue(User, 0))) {
+ int64_t CVal = cast<ConstantSDNode>(User->getOperand(1))->getSExtValue();
+ if (!isInt<12>(CVal) || Alignment <= CVal)
+ return false;
+
+ // Make sure all uses are foldable load/stores.
+ for (auto *AddUser : User->users())
+ if (!isRegImmLoadOrStore(AddUser, SDValue(User, 0)))
+ return false;
+
+ continue;
+ }
+
+ return false;
+ }
+
+ return true;
+}
+
bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
SDValue &Offset) {
if (SelectAddrFrameIndex(Addr, Base, Offset))
@@ -2960,9 +2973,21 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
MVT VT = Addr.getSimpleValueType();
if (Addr.getOpcode() == RISCVISD::ADD_LO) {
- Base = Addr.getOperand(0);
- Offset = Addr.getOperand(1);
- return true;
+ bool CanFold = true;
+ // Unconditionally fold if operand 1 is not a global address (e.g.
+ // externsymbol)
+ if (auto *GA = dyn_cast<GlobalAddressSDNode>(Addr.getOperand(1))) {
+ const DataLayout &DL = CurDAG->getDataLayout();
+ Align Alignment = commonAlignment(
+ GA->getGlobal()->getPointerAlignment(DL), GA->getOffset());
+ if (!areOffsetsWithinAlignment(Addr, Alignment))
+ CanFold = false;
+ }
+ if (CanFold) {
+ Base = Addr.getOperand(0);
+ Offset = Addr.getOperand(1);
+ return true;
+ }
}
if (CurDAG->isBaseWithConstantOffset(Addr)) {
@@ -2980,7 +3005,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
const DataLayout &DL = CurDAG->getDataLayout();
Align Alignment = commonAlignment(
GA->getGlobal()->getPointerAlignment(DL), GA->getOffset());
- if ((CVal == 0 || Alignment > CVal)) {
+ if ((CVal == 0 || Alignment > CVal) &&
+ areOffsetsWithinAlignment(Base, Alignment)) {
int64_t CombinedOffset = CVal + GA->getOffset();
Base = Base.getOperand(0);
Offset = CurDAG->getTargetGlobalAddress(
@@ -3983,6 +4009,15 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits,
if (Use.getOperandNo() == 0 && Bits >= 32)
break;
return false;
+ case RISCV::TH_EXT:
+ case RISCV::TH_EXTU: {
+ unsigned Msb = User->getConstantOperandVal(1);
+ unsigned Lsb = User->getConstantOperandVal(2);
+ // Behavior of Msb < Lsb is not well documented.
+ if (Msb >= Lsb && Bits > Msb)
+ break;
+ return false;
+ }
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index c329a4c6ec62..cf2f763abc06 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -45,6 +45,8 @@ public:
InlineAsm::ConstraintCode ConstraintID,
std::vector<SDValue> &OutOps) override;
+ bool areOffsetsWithinAlignment(SDValue Addr, Align Alignment);
+
bool SelectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset);
bool SelectAddrRegImm(SDValue Addr, SDValue &Base, SDValue &Offset);
bool SelectAddrRegImm9(SDValue Addr, SDValue &Base, SDValue &Offset);
@@ -75,7 +77,6 @@ public:
bool trySignedBitfieldExtract(SDNode *Node);
bool trySignedBitfieldInsertInSign(SDNode *Node);
bool trySignedBitfieldInsertInMask(SDNode *Node);
- bool tryBitfieldInsertOpFromXor(SDNode *Node);
bool tryBitfieldInsertOpFromOrAndImm(SDNode *Node);
bool tryUnsignedBitfieldExtract(SDNode *Node, const SDLoc &DL, MVT VT,
SDValue X, unsigned Msb, unsigned Lsb);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index a33224845e2b..a68a3c14dc41 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2173,7 +2173,7 @@ bool RISCVTargetLowering::isMaskAndCmp0FoldingBeneficial(
// on the basis that it's possible the sinking+duplication of the AND in
// CodeGenPrepare triggered by this hook wouldn't decrease the instruction
// count and would increase code size (e.g. ANDI+BNEZ => BEXTI+BNEZ).
- if (!Subtarget.hasStdExtZbs() && !Subtarget.hasVendorXTHeadBs())
+ if (!Subtarget.hasBEXTILike())
return false;
ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
if (!Mask)
@@ -3744,9 +3744,11 @@ static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL,
// different
// FIXME: Support i1 vectors, maybe by promoting to i8?
MVT EltTy = VT.getVectorElementType();
+ if (EltTy == MVT::i1 ||
+ !DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
+ return SDValue();
MVT SrcVT = Src.getSimpleValueType();
- if (EltTy == MVT::i1 || EltTy != SrcVT.getVectorElementType() ||
- !DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
+ if (EltTy != SrcVT.getVectorElementType())
return SDValue();
SDValue Idx = SplatVal.getOperand(1);
// The index must be a legal type.
@@ -4518,41 +4520,104 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
const unsigned Policy = RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC;
+ // General case: splat the first operand and slide other operands down one
+ // by one to form a vector. Alternatively, if every operand is an
+ // extraction from element 0 of a vector, we use that vector from the last
+ // extraction as the start value and slide up instead of slide down, so that
+ // (1) we can avoid the initial splat (2) we can turn those vslide1up into
+ // vslideup of 1 later and eliminate the vector to scalar movement, which is
+ // something we cannot do with vslide1down/vslidedown.
+ // Of course, using vslide1up/vslideup might increase the register pressure,
+ // and that's why we conservatively limit to cases where every operand is an
+ // extraction from the first element.
+ SmallVector<SDValue> Operands(Op->op_begin(), Op->op_end());
+ SDValue EVec;
+ bool SlideUp = false;
+ auto getVSlide = [&](EVT ContainerVT, SDValue Passthru, SDValue Vec,
+ SDValue Offset, SDValue Mask, SDValue VL) -> SDValue {
+ if (SlideUp)
+ return getVSlideup(DAG, Subtarget, DL, ContainerVT, Passthru, Vec, Offset,
+ Mask, VL, Policy);
+ return getVSlidedown(DAG, Subtarget, DL, ContainerVT, Passthru, Vec, Offset,
+ Mask, VL, Policy);
+ };
+
+ // The reason we don't use all_of here is because we're also capturing EVec
+ // from the last non-undef operand. If the std::execution_policy of the
+ // underlying std::all_of is anything but std::sequenced_policy we might
+ // capture the wrong EVec.
+ for (SDValue V : Operands) {
+ using namespace SDPatternMatch;
+ SlideUp = V.isUndef() || sd_match(V, m_ExtractElt(m_Value(EVec), m_Zero()));
+ if (!SlideUp)
+ break;
+ }
+
+ if (SlideUp) {
+ MVT EVecContainerVT = EVec.getSimpleValueType();
+ // Make sure the original vector has scalable vector type.
+ if (EVecContainerVT.isFixedLengthVector()) {
+ EVecContainerVT =
+ getContainerForFixedLengthVector(DAG, EVecContainerVT, Subtarget);
+ EVec = convertToScalableVector(EVecContainerVT, EVec, DAG, Subtarget);
+ }
+
+ // Adapt EVec's type into ContainerVT.
+ if (EVecContainerVT.getVectorMinNumElements() <
+ ContainerVT.getVectorMinNumElements())
+ EVec = DAG.getInsertSubvector(DL, DAG.getUNDEF(ContainerVT), EVec, 0);
+ else
+ EVec = DAG.getExtractSubvector(DL, ContainerVT, EVec, 0);
+
+ // Reverse the elements as we're going to slide up from the last element.
+ std::reverse(Operands.begin(), Operands.end());
+ }
+
SDValue Vec;
UndefCount = 0;
- for (SDValue V : Op->ops()) {
+ for (SDValue V : Operands) {
if (V.isUndef()) {
UndefCount++;
continue;
}
- // Start our sequence with a TA splat in the hopes that hardware is able to
- // recognize there's no dependency on the prior value of our temporary
- // register.
+ // Start our sequence with either a TA splat or extract source in the
+ // hopes that hardware is able to recognize there's no dependency on the
+ // prior value of our temporary register.
if (!Vec) {
- Vec = DAG.getSplatVector(VT, DL, V);
- Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+ if (SlideUp) {
+ Vec = EVec;
+ } else {
+ Vec = DAG.getSplatVector(VT, DL, V);
+ Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+ }
+
UndefCount = 0;
continue;
}
if (UndefCount) {
const SDValue Offset = DAG.getConstant(UndefCount, DL, Subtarget.getXLenVT());
- Vec = getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT),
- Vec, Offset, Mask, VL, Policy);
+ Vec = getVSlide(ContainerVT, DAG.getUNDEF(ContainerVT), Vec, Offset, Mask,
+ VL);
UndefCount = 0;
}
- auto OpCode =
- VT.isFloatingPoint() ? RISCVISD::VFSLIDE1DOWN_VL : RISCVISD::VSLIDE1DOWN_VL;
+
+ unsigned Opcode;
+ if (VT.isFloatingPoint())
+ Opcode = SlideUp ? RISCVISD::VFSLIDE1UP_VL : RISCVISD::VFSLIDE1DOWN_VL;
+ else
+ Opcode = SlideUp ? RISCVISD::VSLIDE1UP_VL : RISCVISD::VSLIDE1DOWN_VL;
+
if (!VT.isFloatingPoint())
V = DAG.getNode(ISD::ANY_EXTEND, DL, Subtarget.getXLenVT(), V);
- Vec = DAG.getNode(OpCode, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Vec,
+ Vec = DAG.getNode(Opcode, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Vec,
V, Mask, VL);
}
if (UndefCount) {
const SDValue Offset = DAG.getConstant(UndefCount, DL, Subtarget.getXLenVT());
- Vec = getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT),
- Vec, Offset, Mask, VL, Policy);
+ Vec = getVSlide(ContainerVT, DAG.getUNDEF(ContainerVT), Vec, Offset, Mask,
+ VL);
}
return convertFromScalableVector(VT, Vec, DAG, Subtarget);
}
@@ -8193,6 +8258,13 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
DL, VT, LHS, DAG.getSignedConstant(Imm + 1, DL, OpVT), CCVal);
return DAG.getLogicalNOT(DL, SetCC, VT);
}
+ // Lower (setugt X, 2047) as (setne (srl X, 11), 0).
+ if (CCVal == ISD::SETUGT && Imm == 2047) {
+ SDValue Shift = DAG.getNode(ISD::SRL, DL, OpVT, LHS,
+ DAG.getShiftAmountConstant(11, OpVT, DL));
+ return DAG.getSetCC(DL, VT, Shift, DAG.getConstant(0, DL, OpVT),
+ ISD::SETNE);
+ }
}
// Not a constant we could handle, swap the operands and condition code to
@@ -8815,7 +8887,15 @@ SDValue RISCVTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
reportFatalUsageError("Unsupported code model for lowering");
case CodeModel::Small: {
// Generate a sequence for accessing addresses within the first 2 GiB of
- // address space. This generates the pattern (addi (lui %hi(sym)) %lo(sym)).
+ // address space.
+ if (Subtarget.hasVendorXqcili()) {
+ // Use QC.E.LI to generate the address, as this is easier to relax than
+ // LUI/ADDI.
+ SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0);
+ return DAG.getNode(RISCVISD::QC_E_LI, DL, Ty, Addr);
+ }
+
+ // This generates the pattern (addi (lui %hi(sym)) %lo(sym)).
SDValue AddrHi = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_HI);
SDValue AddrLo = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_LO);
SDValue MNHi = DAG.getNode(RISCVISD::HI, DL, Ty, AddrHi);
@@ -9036,8 +9116,12 @@ static std::optional<bool> matchSetCC(SDValue LHS, SDValue RHS,
return std::nullopt;
}
-static SDValue combineSelectToBinOp(SDNode *N, SelectionDAG &DAG,
- const RISCVSubtarget &Subtarget) {
+static bool isSimm12Constant(SDValue V) {
+ return isa<ConstantSDNode>(V) && V->getAsAPIntVal().isSignedIntN(12);
+}
+
+static SDValue lowerSelectToBinOp(SDNode *N, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
SDValue CondV = N->getOperand(0);
SDValue TrueV = N->getOperand(1);
SDValue FalseV = N->getOperand(2);
@@ -9057,14 +9141,16 @@ static SDValue combineSelectToBinOp(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::OR, DL, VT, Neg, DAG.getFreeze(TrueV));
}
+ const bool HasCZero = VT.isScalarInteger() && Subtarget.hasCZEROLike();
+
// (select c, 0, y) -> (c-1) & y
- if (isNullConstant(TrueV)) {
- SDValue Neg = DAG.getNode(ISD::ADD, DL, VT, CondV,
- DAG.getAllOnesConstant(DL, VT));
+ if (isNullConstant(TrueV) && (!HasCZero || isSimm12Constant(FalseV))) {
+ SDValue Neg =
+ DAG.getNode(ISD::ADD, DL, VT, CondV, DAG.getAllOnesConstant(DL, VT));
return DAG.getNode(ISD::AND, DL, VT, Neg, DAG.getFreeze(FalseV));
}
// (select c, y, 0) -> -c & y
- if (isNullConstant(FalseV)) {
+ if (isNullConstant(FalseV) && (!HasCZero || isSimm12Constant(TrueV))) {
SDValue Neg = DAG.getNegative(CondV, DL, VT);
return DAG.getNode(ISD::AND, DL, VT, Neg, DAG.getFreeze(TrueV));
}
@@ -9185,12 +9271,16 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::VSELECT, DL, VT, CondSplat, TrueV, FalseV);
}
+ // Try some other optimizations before falling back to generic lowering.
+ if (SDValue V = lowerSelectToBinOp(Op.getNode(), DAG, Subtarget))
+ return V;
+
// When Zicond or XVentanaCondOps is present, emit CZERO_EQZ and CZERO_NEZ
// nodes to implement the SELECT. Performing the lowering here allows for
// greater control over when CZERO_{EQZ/NEZ} are used vs another branchless
// sequence or RISCVISD::SELECT_CC node (branch-based select).
- if ((Subtarget.hasStdExtZicond() || Subtarget.hasVendorXVentanaCondOps()) &&
- VT.isScalarInteger()) {
+ if (Subtarget.hasCZEROLike() && VT.isScalarInteger()) {
+
// (select c, t, 0) -> (czero_eqz t, c)
if (isNullConstant(FalseV))
return DAG.getNode(RISCVISD::CZERO_EQZ, DL, VT, TrueV, CondV);
@@ -9244,10 +9334,6 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
DAG.getNode(RISCVISD::CZERO_EQZ, DL, VT, TrueV, CondV));
}
- // Try some other optimizations before falling back to generic lowering.
- if (SDValue V = combineSelectToBinOp(Op.getNode(), DAG, Subtarget))
- return V;
-
// (select c, c1, c2) -> (add (czero_nez c2 - c1, c), c1)
// (select c, c1, c2) -> (add (czero_eqz c1 - c2, c), c2)
if (isa<ConstantSDNode>(TrueV) && isa<ConstantSDNode>(FalseV)) {
@@ -9280,19 +9366,38 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
}
- const int TrueValCost = RISCVMatInt::getIntMatCost(
- TrueVal, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true);
- const int FalseValCost = RISCVMatInt::getIntMatCost(
- FalseVal, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true);
- bool IsCZERO_NEZ = TrueValCost <= FalseValCost;
+ // Use SHL/ADDI (and possibly XORI) to avoid having to materialize
+ // a constant in a register.
+ if ((TrueVal - FalseVal).isPowerOf2() && FalseVal.isSignedIntN(12)) {
+ SDValue Log2 = DAG.getConstant((TrueVal - FalseVal).logBase2(), DL, VT);
+ SDValue BitDiff = DAG.getNode(ISD::SHL, DL, VT, CondV, Log2);
+ return DAG.getNode(ISD::ADD, DL, VT, FalseV, BitDiff);
+ }
+ if ((FalseVal - TrueVal).isPowerOf2() && TrueVal.isSignedIntN(12)) {
+ SDValue Log2 = DAG.getConstant((FalseVal - TrueVal).logBase2(), DL, VT);
+ CondV = DAG.getLogicalNOT(DL, CondV, CondV->getValueType(0));
+ SDValue BitDiff = DAG.getNode(ISD::SHL, DL, VT, CondV, Log2);
+ return DAG.getNode(ISD::ADD, DL, VT, TrueV, BitDiff);
+ }
+
+ auto getCost = [&](const APInt &Delta, const APInt &Addend) {
+ const int DeltaCost = RISCVMatInt::getIntMatCost(
+ Delta, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true);
+ // Does the addend fold into an ADDI?
+ if (Addend.isSignedIntN(12))
+ return DeltaCost;
+ const int AddendCost = RISCVMatInt::getIntMatCost(
+ Addend, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true);
+ return AddendCost + DeltaCost;
+ };
+ bool IsCZERO_NEZ = getCost(FalseVal - TrueVal, TrueVal) <=
+ getCost(TrueVal - FalseVal, FalseVal);
SDValue LHSVal = DAG.getConstant(
IsCZERO_NEZ ? FalseVal - TrueVal : TrueVal - FalseVal, DL, VT);
- SDValue RHSVal =
- DAG.getConstant(IsCZERO_NEZ ? TrueVal : FalseVal, DL, VT);
SDValue CMOV =
DAG.getNode(IsCZERO_NEZ ? RISCVISD::CZERO_NEZ : RISCVISD::CZERO_EQZ,
DL, VT, LHSVal, CondV);
- return DAG.getNode(ISD::ADD, DL, VT, CMOV, RHSVal);
+ return DAG.getNode(ISD::ADD, DL, VT, CMOV, IsCZERO_NEZ ? TrueV : FalseV);
}
// (select c, c1, t) -> (add (czero_nez t - c1, c), c1)
@@ -9327,12 +9432,10 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(
ISD::OR, DL, VT,
DAG.getNode(RISCVISD::CZERO_EQZ, DL, VT, TrueV, CondV),
- DAG.getNode(RISCVISD::CZERO_NEZ, DL, VT, FalseV, CondV));
+ DAG.getNode(RISCVISD::CZERO_NEZ, DL, VT, FalseV, CondV),
+ SDNodeFlags::Disjoint);
}
- if (SDValue V = combineSelectToBinOp(Op.getNode(), DAG, Subtarget))
- return V;
-
if (Op.hasOneUse()) {
unsigned UseOpc = Op->user_begin()->getOpcode();
if (isBinOp(UseOpc) && DAG.isSafeToSpeculativelyExecute(UseOpc)) {
@@ -10738,11 +10841,11 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(Opc, DL, XLenVT, Op.getOperand(1));
}
case Intrinsic::riscv_mopr:
- return DAG.getNode(RISCVISD::MOPR, DL, XLenVT, Op.getOperand(1),
+ return DAG.getNode(RISCVISD::MOP_R, DL, XLenVT, Op.getOperand(1),
Op.getOperand(2));
case Intrinsic::riscv_moprr: {
- return DAG.getNode(RISCVISD::MOPRR, DL, XLenVT, Op.getOperand(1),
+ return DAG.getNode(RISCVISD::MOP_RR, DL, XLenVT, Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
}
case Intrinsic::riscv_clmul:
@@ -14877,7 +14980,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
SDValue NewOp =
DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
SDValue Res = DAG.getNode(
- RISCVISD::MOPR, DL, MVT::i64, NewOp,
+ RISCVISD::MOP_R, DL, MVT::i64, NewOp,
DAG.getTargetConstant(N->getConstantOperandVal(2), DL, MVT::i64));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
return;
@@ -14890,7 +14993,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
SDValue NewOp1 =
DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2));
SDValue Res = DAG.getNode(
- RISCVISD::MOPRR, DL, MVT::i64, NewOp0, NewOp1,
+ RISCVISD::MOP_RR, DL, MVT::i64, NewOp0, NewOp1,
DAG.getTargetConstant(N->getConstantOperandVal(3), DL, MVT::i64));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
return;
@@ -15381,9 +15484,7 @@ static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
if (!Subtarget.hasConditionalMoveFusion()) {
// (select cond, x, (and x, c)) has custom lowering with Zicond.
- if ((!Subtarget.hasStdExtZicond() &&
- !Subtarget.hasVendorXVentanaCondOps()) ||
- N->getOpcode() != ISD::AND)
+ if (!Subtarget.hasCZEROLike() || N->getOpcode() != ISD::AND)
return SDValue();
// Maybe harmful when condition code has multiple use.
@@ -16059,12 +16160,55 @@ static SDValue combineOrOfCZERO(SDNode *N, SDValue N0, SDValue N1,
SDValue NewN0 = DAG.getNode(RISCVISD::CZERO_EQZ, DL, VT, TrueV.getOperand(0),
Cond);
- SDValue NewN1 = DAG.getNode(RISCVISD::CZERO_NEZ, DL, VT, FalseV.getOperand(0),
- Cond);
- SDValue NewOr = DAG.getNode(ISD::OR, DL, VT, NewN0, NewN1);
+ SDValue NewN1 =
+ DAG.getNode(RISCVISD::CZERO_NEZ, DL, VT, FalseV.getOperand(0), Cond);
+ SDValue NewOr =
+ DAG.getNode(ISD::OR, DL, VT, NewN0, NewN1, SDNodeFlags::Disjoint);
return DAG.getNode(ISD::XOR, DL, VT, NewOr, TrueV.getOperand(1));
}
+// (xor X, (xor (and X, C2), Y))
+// ->(qc_insb X, (sra Y, ShAmt), Width, ShAmt)
+// where C2 is a shifted mask with width = Width and shift = ShAmt
+// qc_insb might become qc.insb or qc.insbi depending on the operands.
+static SDValue combineXorToBitfieldInsert(SDNode *N, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ if (!Subtarget.hasVendorXqcibm())
+ return SDValue();
+
+ using namespace SDPatternMatch;
+
+ SDValue Base, Inserted;
+ APInt CMask;
+ if (!sd_match(N, m_Xor(m_Value(Base),
+ m_OneUse(m_Xor(m_OneUse(m_And(m_Deferred(Base),
+ m_ConstInt(CMask))),
+ m_Value(Inserted))))))
+ return SDValue();
+
+ if (N->getValueType(0) != MVT::i32)
+ return SDValue();
+
+ unsigned Width, ShAmt;
+ if (!CMask.isShiftedMask(ShAmt, Width))
+ return SDValue();
+
+ // Check if all zero bits in CMask are also zero in Inserted
+ if (!DAG.MaskedValueIsZero(Inserted, ~CMask))
+ return SDValue();
+
+ SDLoc DL(N);
+
+ // `Inserted` needs to be right shifted before it is put into the
+ // instruction.
+ Inserted = DAG.getNode(ISD::SRA, DL, MVT::i32, Inserted,
+ DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
+
+ SDValue Ops[] = {Base, Inserted, DAG.getConstant(Width, DL, MVT::i32),
+ DAG.getConstant(ShAmt, DL, MVT::i32)};
+ return DAG.getNode(RISCVISD::QC_INSB, DL, MVT::i32, Ops);
+}
+
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
const RISCVSubtarget &Subtarget) {
SelectionDAG &DAG = DCI.DAG;
@@ -16108,8 +16252,8 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N0.getOperand(0));
SDValue Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N0.getOperand(1));
SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i64, Op0, Op1);
- SDValue And = DAG.getNOT(DL, Shl, MVT::i64);
- return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, And);
+ SDValue Not = DAG.getNOT(DL, Shl, MVT::i64);
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Not);
}
// fold (xor (sllw 1, x), -1) -> (rolw ~1, x)
@@ -16137,6 +16281,9 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
}
}
+ if (SDValue V = combineXorToBitfieldInsert(N, DAG, Subtarget))
+ return V;
+
if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
return V;
if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
@@ -16590,10 +16737,6 @@ combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC,
DAG.getConstant(0, DL, XLenVT), CC);
}
-// Replace (seteq (i64 (and X, 0xffffffff)), C1) with
-// (seteq (i64 (sext_inreg (X, i32)), C1')) where C1' is C1 sign extended from
-// bit 31. Same for setne. C1' may be cheaper to materialize and the sext_inreg
-// can become a sext.w instead of a shift pair.
static SDValue performSETCCCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const RISCVSubtarget &Subtarget) {
@@ -16613,20 +16756,44 @@ static SDValue performSETCCCombine(SDNode *N,
combineVectorSizedSetCCEquality(VT, N0, N1, Cond, dl, DAG, Subtarget))
return V;
- // (X & -4096) == 0 -> (X >> 12) == 0 if the AND constant can't use ANDI.
- if (DCI.isAfterLegalizeDAG() && isNullConstant(N1) &&
+ if (DCI.isAfterLegalizeDAG() && isa<ConstantSDNode>(N1) &&
N0.getOpcode() == ISD::AND && N0.hasOneUse() &&
isa<ConstantSDNode>(N0.getOperand(1))) {
- const APInt &AndRHSC =
- cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
- if (!isInt<12>(AndRHSC.getSExtValue()) && AndRHSC.isNegatedPowerOf2()) {
+ const APInt &AndRHSC = N0.getConstantOperandAPInt(1);
+ // (X & -(1 << C)) == 0 -> (X >> C) == 0 if the AND constant can't use ANDI.
+ if (isNullConstant(N1) && !isInt<12>(AndRHSC.getSExtValue()) &&
+ AndRHSC.isNegatedPowerOf2()) {
unsigned ShiftBits = AndRHSC.countr_zero();
- SDValue Shift = DAG.getNode(ISD::SRL, dl, VT, N0.getOperand(0),
- DAG.getConstant(ShiftBits, dl, VT));
+ SDValue Shift = DAG.getNode(ISD::SRL, dl, OpVT, N0.getOperand(0),
+ DAG.getConstant(ShiftBits, dl, OpVT));
return DAG.getSetCC(dl, VT, Shift, N1, Cond);
}
+
+ // Similar to above but handling the lower 32 bits by using sraiw. Allow
+ // comparing with constants other than 0 if the constant can be folded into
+ // addi or xori after shifting.
+ uint64_t N1Int = cast<ConstantSDNode>(N1)->getZExtValue();
+ uint64_t AndRHSInt = AndRHSC.getZExtValue();
+ if (OpVT == MVT::i64 && AndRHSInt <= 0xffffffff &&
+ isPowerOf2_32(-uint32_t(AndRHSInt)) && (N1Int & AndRHSInt) == N1Int) {
+ unsigned ShiftBits = llvm::countr_zero(AndRHSInt);
+ int64_t NewC = SignExtend64<32>(N1Int) >> ShiftBits;
+ if (NewC >= -2048 && NewC <= 2048) {
+ SDValue SExt =
+ DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, OpVT, N0.getOperand(0),
+ DAG.getValueType(MVT::i32));
+ SDValue Shift = DAG.getNode(ISD::SRA, dl, OpVT, SExt,
+ DAG.getConstant(ShiftBits, dl, OpVT));
+ return DAG.getSetCC(dl, VT, Shift,
+ DAG.getSignedConstant(NewC, dl, OpVT), Cond);
+ }
+ }
}
+ // Replace (seteq (i64 (and X, 0xffffffff)), C1) with
+ // (seteq (i64 (sext_inreg (X, i32)), C1')) where C1' is C1 sign extended from
+ // bit 31. Same for setne. C1' may be cheaper to materialize and the
+ // sext_inreg can become a sext.w instead of a shift pair.
if (OpVT != MVT::i64 || !Subtarget.is64Bit())
return SDValue();
@@ -18674,7 +18841,7 @@ static SDValue tryFoldSelectIntoOp(SDNode *N, SelectionDAG &DAG,
break;
}
- if (!TrueVal.hasOneUse() || isa<ConstantSDNode>(FalseVal))
+ if (!TrueVal.hasOneUse())
return SDValue();
unsigned OpToFold;
@@ -18746,6 +18913,10 @@ static SDValue foldSelectOfCTTZOrCTLZ(SDNode *N, SelectionDAG &DAG) {
if (Cond->getOperand(0) != CountZeroesArgument)
return SDValue();
+ unsigned BitWidth = CountZeroes.getValueSizeInBits();
+ if (!isPowerOf2_32(BitWidth))
+ return SDValue();
+
if (CountZeroes.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
CountZeroes = DAG.getNode(ISD::CTTZ, SDLoc(CountZeroes),
CountZeroes.getValueType(), CountZeroesArgument);
@@ -18754,7 +18925,6 @@ static SDValue foldSelectOfCTTZOrCTLZ(SDNode *N, SelectionDAG &DAG) {
CountZeroes.getValueType(), CountZeroesArgument);
}
- unsigned BitWidth = CountZeroes.getValueSizeInBits();
SDValue BitWidthMinusOne =
DAG.getConstant(BitWidth - 1, SDLoc(N), CountZeroes.getValueType());
@@ -18778,7 +18948,7 @@ static SDValue useInversedSetcc(SDNode *N, SelectionDAG &DAG,
// Replace (setcc eq (and x, C)) with (setcc ne (and x, C))) to generate
// BEXTI, where C is power of 2.
if (Subtarget.hasStdExtZbs() && VT.isScalarInteger() &&
- (Subtarget.hasStdExtZicond() || Subtarget.hasVendorXVentanaCondOps())) {
+ (Subtarget.hasCZEROLike() || Subtarget.hasVendorXTHeadCondMov())) {
SDValue LHS = Cond.getOperand(0);
SDValue RHS = Cond.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
@@ -18953,6 +19123,7 @@ static SDValue foldReduceOperandViaVQDOT(SDValue InVec, const SDLoc &DL,
SelectionDAG &DAG,
const RISCVSubtarget &Subtarget,
const RISCVTargetLowering &TLI) {
+ using namespace SDPatternMatch;
// Note: We intentionally do not check the legality of the reduction type.
// We want to handle the m4/m8 *src* types, and thus need to let illegal
// intermediate types flow through here.
@@ -18960,11 +19131,10 @@ static SDValue foldReduceOperandViaVQDOT(SDValue InVec, const SDLoc &DL,
!InVec.getValueType().getVectorElementCount().isKnownMultipleOf(4))
return SDValue();
- // Recurse through adds (since generic dag canonicalizes to that
- // form). TODO: Handle disjoint or here.
- if (InVec->getOpcode() == ISD::ADD) {
- SDValue A = InVec.getOperand(0);
- SDValue B = InVec.getOperand(1);
+ // Recurse through adds/disjoint ors (since generic dag canonicalizes to that
+ // form).
+ SDValue A, B;
+ if (sd_match(InVec, m_AddLike(m_Value(A), m_Value(B)))) {
SDValue AOpt = foldReduceOperandViaVQDOT(A, DL, DAG, Subtarget, TLI);
SDValue BOpt = foldReduceOperandViaVQDOT(B, DL, DAG, Subtarget, TLI);
if (AOpt || BOpt) {
@@ -19001,12 +19171,9 @@ static SDValue foldReduceOperandViaVQDOT(SDValue InVec, const SDLoc &DL,
// mul (zext a, zext b) -> partial_reduce_umla 0, a, b
// mul (sext a, zext b) -> partial_reduce_ssmla 0, a, b
// mul (zext a, sext b) -> partial_reduce_smla 0, b, a (swapped)
- if (InVec.getOpcode() != ISD::MUL)
+ if (!sd_match(InVec, m_Mul(m_Value(A), m_Value(B))))
return SDValue();
- SDValue A = InVec.getOperand(0);
- SDValue B = InVec.getOperand(1);
-
if (!ISD::isExtOpcode(A.getOpcode()))
return SDValue();
@@ -20081,6 +20248,17 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
return V;
break;
case ISD::FMUL: {
+ using namespace SDPatternMatch;
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SDValue X, Y;
+ // InstCombine canonicalizes fneg (fmul x, y) -> fmul x, (fneg y), see
+ // hoistFNegAboveFMulFDiv.
+ // Undo this and sink the fneg so we match more fmsub/fnmadd patterns.
+ if (sd_match(N, m_FMul(m_Value(X), m_OneUse(m_FNeg(m_Value(Y))))))
+ return DAG.getNode(ISD::FNEG, DL, VT,
+ DAG.getNode(ISD::FMUL, DL, VT, X, Y));
+
// fmul X, (copysign 1.0, Y) -> fsgnjx X, Y
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -20091,13 +20269,12 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0->getOperand(0));
if (!C || !C->getValueAPF().isExactlyValue(+1.0))
return SDValue();
- EVT VT = N->getValueType(0);
if (VT.isVector() || !isOperationLegal(ISD::FCOPYSIGN, VT))
return SDValue();
SDValue Sign = N0->getOperand(1);
if (Sign.getValueType() != VT)
return SDValue();
- return DAG.getNode(RISCVISD::FSGNJX, SDLoc(N), VT, N1, N0->getOperand(1));
+ return DAG.getNode(RISCVISD::FSGNJX, DL, VT, N1, N0->getOperand(1));
}
case ISD::FADD:
case ISD::UMAX:
@@ -20381,9 +20558,9 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
VT, DL, MGN->getChain(), BasePtr,
DAG.getSignedConstant(StepNumerator, DL, XLenVT), MGN->getMask(),
EVL, MGN->getMemOperand());
- SDValue VPSelect = DAG.getNode(ISD::VP_SELECT, DL, VT, MGN->getMask(),
- StridedLoad, MGN->getPassThru(), EVL);
- return DAG.getMergeValues({VPSelect, SDValue(StridedLoad.getNode(), 1)},
+ SDValue Select = DAG.getSelect(DL, VT, MGN->getMask(), StridedLoad,
+ MGN->getPassThru());
+ return DAG.getMergeValues({Select, SDValue(StridedLoad.getNode(), 1)},
DL);
}
}
@@ -21060,6 +21237,38 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
return N->getOperand(0);
break;
}
+ case RISCVISD::VSLIDE1UP_VL:
+ case RISCVISD::VFSLIDE1UP_VL: {
+ using namespace SDPatternMatch;
+ SDValue SrcVec;
+ SDLoc DL(N);
+ MVT VT = N->getSimpleValueType(0);
+ // If the scalar we're sliding in was extracted from the first element of a
+ // vector, we can use that vector as the passthru in a normal slideup of 1.
+ // This saves us an extract_element instruction (i.e. vfmv.f.s, vmv.x.s).
+ if (!N->getOperand(0).isUndef() ||
+ !sd_match(N->getOperand(2),
+ m_AnyOf(m_ExtractElt(m_Value(SrcVec), m_Zero()),
+ m_Node(RISCVISD::VMV_X_S, m_Value(SrcVec)))))
+ break;
+
+ MVT SrcVecVT = SrcVec.getSimpleValueType();
+ if (SrcVecVT.getVectorElementType() != VT.getVectorElementType())
+ break;
+ // Adapt the value type of source vector.
+ if (SrcVecVT.isFixedLengthVector()) {
+ SrcVecVT = getContainerForFixedLengthVector(SrcVecVT);
+ SrcVec = convertToScalableVector(SrcVecVT, SrcVec, DAG, Subtarget);
+ }
+ if (SrcVecVT.getVectorMinNumElements() < VT.getVectorMinNumElements())
+ SrcVec = DAG.getInsertSubvector(DL, DAG.getUNDEF(VT), SrcVec, 0);
+ else
+ SrcVec = DAG.getExtractSubvector(DL, VT, SrcVec, 0);
+
+ return getVSlideup(DAG, Subtarget, DL, VT, SrcVec, N->getOperand(1),
+ DAG.getConstant(1, DL, XLenVT), N->getOperand(3),
+ N->getOperand(4));
+ }
}
return SDValue();
@@ -21120,9 +21329,14 @@ bool RISCVTargetLowering::isDesirableToCommuteWithShift(
auto *C1 = dyn_cast<ConstantSDNode>(N0->getOperand(1));
auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
- // Bail if we might break a sh{1,2,3}add pattern.
- if ((Subtarget.hasStdExtZba() || Subtarget.hasVendorXAndesPerf()) && C2 &&
- C2->getZExtValue() >= 1 && C2->getZExtValue() <= 3 && N->hasOneUse() &&
+ bool IsShXAdd =
+ (Subtarget.hasStdExtZba() || Subtarget.hasVendorXAndesPerf()) && C2 &&
+ C2->getZExtValue() >= 1 && C2->getZExtValue() <= 3;
+ bool IsQCShlAdd = Subtarget.hasVendorXqciac() && C2 &&
+ C2->getZExtValue() >= 4 && C2->getZExtValue() <= 31;
+
+ // Bail if we might break a sh{1,2,3}add/qc.shladd pattern.
+ if ((IsShXAdd || IsQCShlAdd) && N->hasOneUse() &&
N->user_begin()->getOpcode() == ISD::ADD &&
!isUsedByLdSt(*N->user_begin(), nullptr) &&
!isa<ConstantSDNode>(N->user_begin()->getOperand(1)))
@@ -21346,6 +21560,24 @@ void RISCVTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known = Known.sext(BitWidth);
break;
}
+ case RISCVISD::SRLW: {
+ KnownBits Known2;
+ Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known = KnownBits::lshr(Known.trunc(32), Known2.trunc(5).zext(32));
+ // Restore the original width by sign extending.
+ Known = Known.sext(BitWidth);
+ break;
+ }
+ case RISCVISD::SRAW: {
+ KnownBits Known2;
+ Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known = KnownBits::ashr(Known.trunc(32), Known2.trunc(5).zext(32));
+ // Restore the original width by sign extending.
+ Known = Known.sext(BitWidth);
+ break;
+ }
case RISCVISD::CTZW: {
KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
unsigned PossibleTZ = Known2.trunc(32).countMaxTrailingZeros();
@@ -21451,8 +21683,16 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
if (Tmp < 33) return 1;
return 33;
}
+ case RISCVISD::SRAW: {
+ unsigned Tmp =
+ DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ // sraw produces at least 33 sign bits. If the input already has more than
+ // 33 sign bits, sraw will preserve them.
+ // TODO: A more precise answer could be calculated depending on known bits
+ // in the shift amount.
+ return std::max(Tmp, 33U);
+ }
case RISCVISD::SLLW:
- case RISCVISD::SRAW:
case RISCVISD::SRLW:
case RISCVISD::DIVW:
case RISCVISD::DIVUW:
@@ -21463,9 +21703,7 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
case RISCVISD::FCVT_WU_RV64:
case RISCVISD::STRICT_FCVT_W_RV64:
case RISCVISD::STRICT_FCVT_WU_RV64:
- // TODO: As the result is sign-extended, this is conservatively correct. A
- // more precise answer could be calculated for SRAW depending on known
- // bits in the shift amount.
+ // TODO: As the result is sign-extended, this is conservatively correct.
return 33;
case RISCVISD::VMV_X_S: {
// The number of sign bits of the scalar result is computed by obtaining the
@@ -21548,6 +21786,14 @@ bool RISCVTargetLowering::canCreateUndefOrPoisonForTargetNode(
// TODO: Add more target nodes.
switch (Op.getOpcode()) {
+ case RISCVISD::SLLW:
+ case RISCVISD::SRAW:
+ case RISCVISD::SRLW:
+ case RISCVISD::RORW:
+ case RISCVISD::ROLW:
+ // Only the lower 5 bits of RHS are read, guaranteeing the rotate/shift
+ // amount is in bounds.
+ return false;
case RISCVISD::SELECT_CC:
// Integer comparisons cannot create poison.
assert(Op.getOperand(0).getValueType().isInteger() &&
@@ -24683,7 +24929,7 @@ RISCVTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
bool RISCVTargetLowering::shouldFoldSelectWithSingleBitTest(
EVT VT, const APInt &AndMask) const {
- if (Subtarget.hasStdExtZicond() || Subtarget.hasVendorXVentanaCondOps())
+ if (Subtarget.hasCZEROLike())
return !Subtarget.hasStdExtZbs() && AndMask.ugt(1024);
return TargetLowering::shouldFoldSelectWithSingleBitTest(VT, AndMask);
}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index fb63ebcfaace..4581c11356af 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -435,8 +435,8 @@ public:
const APInt &GapMask) const override;
bool lowerInterleavedStore(Instruction *Store, Value *Mask,
- ShuffleVectorInst *SVI,
- unsigned Factor) const override;
+ ShuffleVectorInst *SVI, unsigned Factor,
+ const APInt &GapMask) const override;
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
IntrinsicInst *DI) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp b/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp
index 43621b8f0f33..9664ab345dcb 100644
--- a/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp
+++ b/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// The pass adds LPAD (AUIPC with rs1 = X0) machine instructions at the
+// The pass adds LPAD (AUIPC with rd = X0) machine instructions at the
// beginning of each basic block or function that is referenced by an indirect
// jump/call instruction.
//
diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormatsC.td b/llvm/lib/Target/RISCV/RISCVInstrFormatsC.td
index 209c3fae63f4..4c7cd05723ac 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrFormatsC.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrFormatsC.td
@@ -54,7 +54,6 @@ class RVInst16CSS<bits<3> funct3, bits<2> opcode, dag outs, dag ins,
: RVInst16<outs, ins, opcodestr, argstr, [], InstFormatCSS> {
bits<10> imm;
bits<5> rs2;
- bits<5> rs1;
let Inst{15-13} = funct3;
let Inst{12-7} = imm{5-0};
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 7b4a1de16769..d0bb57a3eaa1 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -80,8 +80,8 @@ namespace llvm::RISCV {
} // end namespace llvm::RISCV
-RISCVInstrInfo::RISCVInstrInfo(RISCVSubtarget &STI)
- : RISCVGenInstrInfo(RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP),
+RISCVInstrInfo::RISCVInstrInfo(const RISCVSubtarget &STI)
+ : RISCVGenInstrInfo(STI, RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP),
STI(STI) {}
#define GET_INSTRINFO_HELPERS
@@ -3511,6 +3511,9 @@ RISCVInstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
return outliner::InstrType::Illegal;
}
+ if (isLPAD(MI))
+ return outliner::InstrType::Illegal;
+
return outliner::InstrType::Legal;
}
@@ -4796,8 +4799,22 @@ unsigned RISCV::getDestLog2EEW(const MCInstrDesc &Desc, unsigned Log2SEW) {
return Scaled;
}
-/// Given two VL operands, do we know that LHS <= RHS?
+static std::optional<int64_t> getEffectiveImm(const MachineOperand &MO) {
+ assert(MO.isImm() || MO.getReg().isVirtual());
+ if (MO.isImm())
+ return MO.getImm();
+ const MachineInstr *Def =
+ MO.getParent()->getMF()->getRegInfo().getVRegDef(MO.getReg());
+ int64_t Imm;
+ if (isLoadImm(Def, Imm))
+ return Imm;
+ return std::nullopt;
+}
+
+/// Given two VL operands, do we know that LHS <= RHS? Must be used in SSA form.
bool RISCV::isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS) {
+ assert((LHS.isImm() || LHS.getParent()->getMF()->getRegInfo().isSSA()) &&
+ (RHS.isImm() || RHS.getParent()->getMF()->getRegInfo().isSSA()));
if (LHS.isReg() && RHS.isReg() && LHS.getReg().isVirtual() &&
LHS.getReg() == RHS.getReg())
return true;
@@ -4807,9 +4824,11 @@ bool RISCV::isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS) {
return true;
if (LHS.isImm() && LHS.getImm() == RISCV::VLMaxSentinel)
return false;
- if (!LHS.isImm() || !RHS.isImm())
+ std::optional<int64_t> LHSImm = getEffectiveImm(LHS),
+ RHSImm = getEffectiveImm(RHS);
+ if (!LHSImm || !RHSImm)
return false;
- return LHS.getImm() <= RHS.getImm();
+ return LHSImm <= RHSImm;
}
namespace {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 785c8352d4a5..57ec431749eb 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -62,7 +62,7 @@ enum RISCVMachineCombinerPattern : unsigned {
class RISCVInstrInfo : public RISCVGenInstrInfo {
public:
- explicit RISCVInstrInfo(RISCVSubtarget &STI);
+ explicit RISCVInstrInfo(const RISCVSubtarget &STI);
MCInst getNop() const override;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 23f5a848137c..92552b36aa0b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1698,8 +1698,6 @@ let Predicates = [IsRV32] in {
def : Pat<(i32 (setlt (i32 GPR:$rs1), 0)), (SRLI GPR:$rs1, 31)>; // compressible
}
let Predicates = [IsRV64] in {
-def : Pat<(i64 (seteq (i64 (and GPR:$rs1, 0x0000000080000000)), 0)),
- (XORI (i64 (SRLIW GPR:$rs1, 31)), 1)>;
def : Pat<(i64 (setlt (i64 GPR:$rs1), 0)), (SRLI GPR:$rs1, 63)>; // compressible
def : Pat<(i64 (setlt (sext_inreg GPR:$rs1, i32), 0)), (SRLIW GPR:$rs1, 31)>;
}
@@ -2330,7 +2328,6 @@ include "RISCVInstrInfoZalasr.td"
include "RISCVInstrInfoZimop.td"
include "RISCVInstrInfoZicbo.td"
include "RISCVInstrInfoZicond.td"
-include "RISCVInstrInfoZicfiss.td"
include "RISCVInstrInfoZilsd.td"
// Scalar FP
@@ -2359,6 +2356,9 @@ include "RISCVInstrInfoZc.td"
include "RISCVInstrInfoZcmop.td"
include "RISCVInstrInfoZclsd.td"
+// Control Flow Integrity, this requires Zimop/Zcmop
+include "RISCVInstrInfoZicfiss.td"
+
// Short Forward Branch
include "RISCVInstrInfoSFB.td"
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
index c5551fbdec28..9fc73662d970 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
@@ -230,13 +230,17 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
class CStackLoad<bits<3> funct3, string OpcodeStr,
DAGOperand cls, DAGOperand opnd>
: RVInst16CI<funct3, 0b10, (outs cls:$rd), (ins SPMem:$rs1, opnd:$imm),
- OpcodeStr, "$rd, ${imm}(${rs1})">;
+ OpcodeStr, "$rd, ${imm}(${rs1})"> {
+ bits<0> rs1;
+}
let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
class CStackStore<bits<3> funct3, string OpcodeStr,
DAGOperand cls, DAGOperand opnd>
: RVInst16CSS<funct3, 0b10, (outs), (ins cls:$rs2, SPMem:$rs1, opnd:$imm),
- OpcodeStr, "$rs2, ${imm}(${rs1})">;
+ OpcodeStr, "$rs2, ${imm}(${rs1})"> {
+ bits<0> rs1;
+}
let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
class CLoad_ri<bits<3> funct3, string OpcodeStr,
@@ -301,14 +305,6 @@ def C_ADDI4SPN : RVInst16CIW<0b000, 0b00, (outs GPRC:$rd),
let Inst{5} = imm{3};
}
-let Predicates = [HasStdExtCOrZcd, HasStdExtD] in
-def C_FLD : CLoad_ri<0b001, "c.fld", FPR64C, uimm8_lsb000>,
- Sched<[WriteFLD64, ReadFMemBase]> {
- bits<8> imm;
- let Inst{12-10} = imm{5-3};
- let Inst{6-5} = imm{7-6};
-}
-
def C_LW : CLoad_ri<0b010, "c.lw", GPRC, uimm7_lsb00>,
Sched<[WriteLDW, ReadMemBase]> {
bits<7> imm;
@@ -326,16 +322,6 @@ def C_LW_INX : CLoad_ri<0b010, "c.lw", GPRF32C, uimm7_lsb00>,
let Inst{5} = imm{6};
}
-let DecoderNamespace = "RV32Only",
- Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in
-def C_FLW : CLoad_ri<0b011, "c.flw", FPR32C, uimm7_lsb00>,
- Sched<[WriteFLD32, ReadFMemBase]> {
- bits<7> imm;
- let Inst{12-10} = imm{5-3};
- let Inst{6} = imm{2};
- let Inst{5} = imm{6};
-}
-
let Predicates = [HasStdExtZca, IsRV64] in
def C_LD : CLoad_ri<0b011, "c.ld", GPRC, uimm8_lsb000>,
Sched<[WriteLDD, ReadMemBase]> {
@@ -344,14 +330,6 @@ def C_LD : CLoad_ri<0b011, "c.ld", GPRC, uimm8_lsb000>,
let Inst{6-5} = imm{7-6};
}
-let Predicates = [HasStdExtCOrZcd, HasStdExtD] in
-def C_FSD : CStore_rri<0b101, "c.fsd", FPR64C, uimm8_lsb000>,
- Sched<[WriteFST64, ReadFStoreData, ReadFMemBase]> {
- bits<8> imm;
- let Inst{12-10} = imm{5-3};
- let Inst{6-5} = imm{7-6};
-}
-
def C_SW : CStore_rri<0b110, "c.sw", GPRC, uimm7_lsb00>,
Sched<[WriteSTW, ReadStoreData, ReadMemBase]> {
bits<7> imm;
@@ -369,16 +347,6 @@ def C_SW_INX : CStore_rri<0b110, "c.sw", GPRF32C, uimm7_lsb00>,
let Inst{5} = imm{6};
}
-let DecoderNamespace = "RV32Only",
- Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in
-def C_FSW : CStore_rri<0b111, "c.fsw", FPR32C, uimm7_lsb00>,
- Sched<[WriteFST32, ReadFStoreData, ReadFMemBase]> {
- bits<7> imm;
- let Inst{12-10} = imm{5-3};
- let Inst{6} = imm{2};
- let Inst{5} = imm{6};
-}
-
let Predicates = [HasStdExtZca, IsRV64] in
def C_SD : CStore_rri<0b111, "c.sd", GPRC, uimm8_lsb000>,
Sched<[WriteSTD, ReadStoreData, ReadMemBase]> {
@@ -500,12 +468,6 @@ def C_SLLI : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb),
let Constraints = "$rd = $rd_wb";
}
-let Predicates = [HasStdExtCOrZcd, HasStdExtD] in
-def C_FLDSP : CStackLoad<0b001, "c.fldsp", FPR64, uimm9_lsb000>,
- Sched<[WriteFLD64, ReadFMemBase]> {
- let Inst{4-2} = imm{8-6};
-}
-
def C_LWSP : CStackLoad<0b010, "c.lwsp", GPRNoX0, uimm8_lsb00>,
Sched<[WriteLDW, ReadMemBase]> {
let Inst{3-2} = imm{7-6};
@@ -517,13 +479,6 @@ def C_LWSP_INX : CStackLoad<0b010, "c.lwsp", GPRF32NoX0, uimm8_lsb00>,
let Inst{3-2} = imm{7-6};
}
-let DecoderNamespace = "RV32Only",
- Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in
-def C_FLWSP : CStackLoad<0b011, "c.flwsp", FPR32, uimm8_lsb00>,
- Sched<[WriteFLD32, ReadFMemBase]> {
- let Inst{3-2} = imm{7-6};
-}
-
let Predicates = [HasStdExtZca, IsRV64] in
def C_LDSP : CStackLoad<0b011, "c.ldsp", GPRNoX0, uimm9_lsb000>,
Sched<[WriteLDD, ReadMemBase]> {
@@ -560,12 +515,6 @@ def C_ADD : RVInst16CR<0b1001, 0b10, (outs GPR:$rd),
let Constraints = "$rs1 = $rd";
}
-let Predicates = [HasStdExtCOrZcd, HasStdExtD] in
-def C_FSDSP : CStackStore<0b101, "c.fsdsp", FPR64, uimm9_lsb000>,
- Sched<[WriteFST64, ReadFStoreData, ReadFMemBase]> {
- let Inst{9-7} = imm{8-6};
-}
-
def C_SWSP : CStackStore<0b110, "c.swsp", GPR, uimm8_lsb00>,
Sched<[WriteSTW, ReadStoreData, ReadMemBase]> {
let Inst{8-7} = imm{7-6};
@@ -577,13 +526,6 @@ def C_SWSP_INX : CStackStore<0b110, "c.swsp", GPRF32, uimm8_lsb00>,
let Inst{8-7} = imm{7-6};
}
-let DecoderNamespace = "RV32Only",
- Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in
-def C_FSWSP : CStackStore<0b111, "c.fswsp", FPR32, uimm8_lsb00>,
- Sched<[WriteFST32, ReadFStoreData, ReadFMemBase]> {
- let Inst{8-7} = imm{7-6};
-}
-
let Predicates = [HasStdExtZca, IsRV64] in
def C_SDSP : CStackStore<0b111, "c.sdsp", GPR, uimm9_lsb000>,
Sched<[WriteSTD, ReadStoreData, ReadMemBase]> {
@@ -600,6 +542,61 @@ def C_UNIMP : RVInst16<(outs), (ins), "c.unimp", "", [], InstFormatOther>,
} // Predicates = [HasStdExtZca]
+let DecoderNamespace = "RV32Only",
+ Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in {
+ def C_FLW : CLoad_ri<0b011, "c.flw", FPR32C, uimm7_lsb00>,
+ Sched<[WriteFLD32, ReadFMemBase]> {
+ bits<7> imm;
+ let Inst{12-10} = imm{5-3};
+ let Inst{6} = imm{2};
+ let Inst{5} = imm{6};
+ }
+
+ def C_FSW : CStore_rri<0b111, "c.fsw", FPR32C, uimm7_lsb00>,
+ Sched<[WriteFST32, ReadFStoreData, ReadFMemBase]> {
+ bits<7> imm;
+ let Inst{12-10} = imm{5-3};
+ let Inst{6} = imm{2};
+ let Inst{5} = imm{6};
+ }
+
+ def C_FLWSP : CStackLoad<0b011, "c.flwsp", FPR32, uimm8_lsb00>,
+ Sched<[WriteFLD32, ReadFMemBase]> {
+ let Inst{3-2} = imm{7-6};
+ }
+
+ def C_FSWSP : CStackStore<0b111, "c.fswsp", FPR32, uimm8_lsb00>,
+ Sched<[WriteFST32, ReadFStoreData, ReadFMemBase]> {
+ let Inst{8-7} = imm{7-6};
+ }
+} // DecoderNamespace = "RV32Only", Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32]
+
+let Predicates = [HasStdExtCOrZcd, HasStdExtD] in {
+ def C_FLD : CLoad_ri<0b001, "c.fld", FPR64C, uimm8_lsb000>,
+ Sched<[WriteFLD64, ReadFMemBase]> {
+ bits<8> imm;
+ let Inst{12-10} = imm{5-3};
+ let Inst{6-5} = imm{7-6};
+ }
+
+ def C_FSD : CStore_rri<0b101, "c.fsd", FPR64C, uimm8_lsb000>,
+ Sched<[WriteFST64, ReadFStoreData, ReadFMemBase]> {
+ bits<8> imm;
+ let Inst{12-10} = imm{5-3};
+ let Inst{6-5} = imm{7-6};
+ }
+
+ def C_FLDSP : CStackLoad<0b001, "c.fldsp", FPR64, uimm9_lsb000>,
+ Sched<[WriteFLD64, ReadFMemBase]> {
+ let Inst{4-2} = imm{8-6};
+ }
+
+ def C_FSDSP : CStackStore<0b101, "c.fsdsp", FPR64, uimm9_lsb000>,
+ Sched<[WriteFST64, ReadFStoreData, ReadFMemBase]> {
+ let Inst{9-7} = imm{8-6};
+ }
+} // Predicates = [HasStdExtCOrZcd, HasStdExtD]
+
//===----------------------------------------------------------------------===//
// HINT Instructions
//===----------------------------------------------------------------------===//
@@ -767,20 +764,17 @@ def : InstAlias<".insn_cj $opcode, $funct3, $imm11",
// Compress Instruction tablegen backend.
//===----------------------------------------------------------------------===//
-// Patterns are defined in the same order the compressed instructions appear
+// Zca patterns are defined in the same order the compressed instructions appear
// under the "RVC Instruction Set Listings" section of the ISA manual.
+// Zca Instructions
+
// Quadrant 0
let Predicates = [HasStdExtZca] in {
def : CompressPat<(ADDI GPRC:$rd, SP:$rs1, uimm10_lsb00nonzero:$imm),
(C_ADDI4SPN GPRC:$rd, SP:$rs1, uimm10_lsb00nonzero:$imm)>;
} // Predicates = [HasStdExtZca]
-let Predicates = [HasStdExtCOrZcd, HasStdExtD] in {
-def : CompressPat<(FLD FPR64C:$rd, GPRCMem:$rs1, uimm8_lsb000:$imm),
- (C_FLD FPR64C:$rd, GPRCMem:$rs1, uimm8_lsb000:$imm)>;
-} // Predicates = [HasStdExtCOrZcd, HasStdExtD]
-
let Predicates = [HasStdExtZca] in {
def : CompressPat<(LW GPRC:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm),
(C_LW GPRC:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm)>;
@@ -790,21 +784,11 @@ def : CompressPat<(LW_INX GPRF32C:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm),
(C_LW_INX GPRF32C:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm)>;
} // Predicates = [HasStdExtZca]
-let Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in {
-def : CompressPat<(FLW FPR32C:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm),
- (C_FLW FPR32C:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm)>;
-} // Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32]
-
let Predicates = [HasStdExtZca, IsRV64] in {
def : CompressPat<(LD GPRC:$rd, GPRCMem:$rs1, uimm8_lsb000:$imm),
(C_LD GPRC:$rd, GPRCMem:$rs1, uimm8_lsb000:$imm)>;
} // Predicates = [HasStdExtZca, IsRV64]
-let Predicates = [HasStdExtCOrZcd, HasStdExtD] in {
-def : CompressPat<(FSD FPR64C:$rs2, GPRCMem:$rs1, uimm8_lsb000:$imm),
- (C_FSD FPR64C:$rs2, GPRCMem:$rs1, uimm8_lsb000:$imm)>;
-} // Predicates = [HasStdExtCOrZcd, HasStdExtD]
-
let Predicates = [HasStdExtZca] in {
def : CompressPat<(SW GPRC:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm),
(C_SW GPRC:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm)>;
@@ -814,11 +798,6 @@ def : CompressPat<(SW_INX GPRF32C:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm),
(C_SW_INX GPRF32C:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm)>;
} // Predicates = [HasStdExtZca]
-let Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in {
-def : CompressPat<(FSW FPR32C:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm),
- (C_FSW FPR32C:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm)>;
-} // Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32]
-
let Predicates = [HasStdExtZca, IsRV64] in {
def : CompressPat<(SD GPRC:$rs2, GPRCMem:$rs1, uimm8_lsb000:$imm),
(C_SD GPRC:$rs2, GPRCMem:$rs1, uimm8_lsb000:$imm)>;
@@ -907,11 +886,6 @@ def : CompressPat<(SLLI GPRNoX0:$rs1, GPRNoX0:$rs1, uimmlog2xlennonzero:$imm),
(C_SLLI GPRNoX0:$rs1, uimmlog2xlennonzero:$imm)>;
} // Predicates = [HasStdExtZca]
-let Predicates = [HasStdExtCOrZcd, HasStdExtD] in {
-def : CompressPat<(FLD FPR64:$rd, SPMem:$rs1, uimm9_lsb000:$imm),
- (C_FLDSP FPR64:$rd, SPMem:$rs1, uimm9_lsb000:$imm)>;
-} // Predicates = [HasStdExtCOrZcd, HasStdExtD]
-
let Predicates = [HasStdExtZca] in {
def : CompressPat<(LW GPRNoX0:$rd, SPMem:$rs1, uimm8_lsb00:$imm),
(C_LWSP GPRNoX0:$rd, SPMem:$rs1, uimm8_lsb00:$imm)>;
@@ -921,11 +895,6 @@ def : CompressPat<(LW_INX GPRF32NoX0:$rd, SPMem:$rs1, uimm8_lsb00:$imm),
(C_LWSP_INX GPRF32NoX0:$rd, SPMem:$rs1, uimm8_lsb00:$imm)>;
} // Predicates = [HasStdExtZca]
-let Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in {
-def : CompressPat<(FLW FPR32:$rd, SPMem:$rs1, uimm8_lsb00:$imm),
- (C_FLWSP FPR32:$rd, SPMem:$rs1, uimm8_lsb00:$imm)>;
-} // Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32]
-
let Predicates = [HasStdExtZca, IsRV64] in {
def : CompressPat<(LD GPRNoX0:$rd, SPMem:$rs1, uimm9_lsb000:$imm),
(C_LDSP GPRNoX0:$rd, SPMem:$rs1, uimm9_lsb000:$imm)>;
@@ -953,11 +922,6 @@ def : CompressPat<(ADD GPRNoX0:$rs1, GPRNoX0:$rs2, GPRNoX0:$rs1),
(C_ADD GPRNoX0:$rs1, GPRNoX0:$rs2)>;
} // Predicates = [HasStdExtZca]
-let Predicates = [HasStdExtCOrZcd, HasStdExtD] in {
-def : CompressPat<(FSD FPR64:$rs2, SPMem:$rs1, uimm9_lsb000:$imm),
- (C_FSDSP FPR64:$rs2, SPMem:$rs1, uimm9_lsb000:$imm)>;
-} // Predicates = [HasStdExtCOrZcd, HasStdExtD]
-
let Predicates = [HasStdExtZca] in {
def : CompressPat<(SW GPR:$rs2, SPMem:$rs1, uimm8_lsb00:$imm),
(C_SWSP GPR:$rs2, SPMem:$rs1, uimm8_lsb00:$imm)>;
@@ -967,12 +931,38 @@ def : CompressPat<(SW_INX GPRF32:$rs2, SPMem:$rs1, uimm8_lsb00:$imm),
(C_SWSP_INX GPRF32:$rs2, SPMem:$rs1, uimm8_lsb00:$imm)>;
} // Predicates = [HasStdExtZca]
-let Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in {
-def : CompressPat<(FSW FPR32:$rs2, SPMem:$rs1, uimm8_lsb00:$imm),
- (C_FSWSP FPR32:$rs2, SPMem:$rs1, uimm8_lsb00:$imm)>;
-} // Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32]
-
let Predicates = [HasStdExtZca, IsRV64] in {
def : CompressPat<(SD GPR:$rs2, SPMem:$rs1, uimm9_lsb000:$imm),
(C_SDSP GPR:$rs2, SPMem:$rs1, uimm9_lsb000:$imm)>;
} // Predicates = [HasStdExtZca, IsRV64]
+
+// Zcf Instructions
+let Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in {
+ // Quadrant 0
+ def : CompressPat<(FLW FPR32C:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm),
+ (C_FLW FPR32C:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm)>;
+ def : CompressPat<(FSW FPR32C:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm),
+ (C_FSW FPR32C:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm)>;
+
+ // Quadrant 2
+ def : CompressPat<(FLW FPR32:$rd, SPMem:$rs1, uimm8_lsb00:$imm),
+ (C_FLWSP FPR32:$rd, SPMem:$rs1, uimm8_lsb00:$imm)>;
+ def : CompressPat<(FSW FPR32:$rs2, SPMem:$rs1, uimm8_lsb00:$imm),
+ (C_FSWSP FPR32:$rs2, SPMem:$rs1, uimm8_lsb00:$imm)>;
+} // Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32]
+
+// Zcd Instructions
+let Predicates = [HasStdExtCOrZcd, HasStdExtD] in {
+ // Quadrant 0
+ def : CompressPat<(FLD FPR64C:$rd, GPRCMem:$rs1, uimm8_lsb000:$imm),
+ (C_FLD FPR64C:$rd, GPRCMem:$rs1, uimm8_lsb000:$imm)>;
+ def : CompressPat<(FSD FPR64C:$rs2, GPRCMem:$rs1, uimm8_lsb000:$imm),
+ (C_FSD FPR64C:$rs2, GPRCMem:$rs1, uimm8_lsb000:$imm)>;
+
+ // Quadrant 2
+ def : CompressPat<(FLD FPR64:$rd, SPMem:$rs1, uimm9_lsb000:$imm),
+ (C_FLDSP FPR64:$rd, SPMem:$rs1, uimm9_lsb000:$imm)>;
+ def : CompressPat<(FSD FPR64:$rs2, SPMem:$rs1, uimm9_lsb000:$imm),
+ (C_FSDSP FPR64:$rs2, SPMem:$rs1, uimm9_lsb000:$imm)>;
+} // Predicates = [HasStdExtCOrZcd, HasStdExtD]
+
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index c342b41e41d0..6840dacaea54 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -25,7 +25,7 @@ def SImm8UnsignedAsmOperand : SImmAsmOperand<8, "Unsigned"> {
}
// A 8-bit signed immediate allowing range [-128, 255]
-// but represented as [-128, 255].
+// but represented as [-128, 127].
def simm8_unsigned : RISCVOp {
let ParserMatchClass = SImm8UnsignedAsmOperand;
let EncoderMethod = "getImmOpValue";
@@ -98,6 +98,40 @@ class PLUI_i<bits<7> funct7, string opcodestr>
let Inst{23-15} = imm10{9-1};
}
+// Common base for widening Binary/Ternary ops
+class RVPWideningBase<bits<2> w, bit arith_shift, dag outs, dag ins,
+ string opcodestr>
+ : RVInst<outs, ins, opcodestr, "$rd, $rs1, $rs2", [], InstFormatOther> {
+ bits<5> rs2;
+ bits<5> rs1;
+ bits<5> rd;
+
+ let Inst{31} = 0b0;
+ let Inst{26-25} = w;
+ let Inst{24-20} = rs2;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = 0b010;
+ let Inst{11-8} = rd{4-1};
+ let Inst{7} = arith_shift;
+ let Inst{6-0} = OPC_OP_IMM_32.Value;
+}
+
+// Common base for narrowing ops
+class RVPNarrowingBase<bits<3> f, bit r, bits<4> funct4, dag outs, dag ins,
+ string opcodestr, string argstr>
+ : RVInst<outs, ins, opcodestr, argstr, [], InstFormatOther> {
+ bits<5> rs1;
+ bits<5> rd;
+
+ let Inst{31} = 0b0;
+ let Inst{30-28} = f;
+ let Inst{27} = r;
+ let Inst{19-16} = rs1{4-1};
+ let Inst{15-12} = funct4;
+ let Inst{11-7} = rd;
+ let Inst{6-0} = OPC_OP_IMM_32.Value;
+}
+
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVPShift_ri<bits<3> f, bits<3> funct3, string opcodestr, Operand ImmType>
: RVInstIBase<funct3, OPC_OP_IMM_32, (outs GPR:$rd),
@@ -141,6 +175,100 @@ class RVPShiftB_ri<bits<3> f, bits<3> funct3, string opcodestr>
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVPWideningShift_ri<bits<3> f, string opcodestr, Operand ImmType>
+ : RVInst<(outs GPRPairRV32:$rd), (ins GPR:$rs1, ImmType:$shamt), opcodestr,
+ "$rd, $rs1, $shamt", [], InstFormatOther> {
+ bits<5> rs1;
+ bits<5> rd;
+
+ let Inst{31} = 0b0;
+ let Inst{30-28} = f;
+ let Inst{27} = 0b0;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = 0b010;
+ let Inst{11-8} = rd{4-1};
+ let Inst{7} = 0b0;
+ let Inst{6-0} = OPC_OP_IMM_32.Value;
+
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+}
+
+class RVPWideningShiftW_ri<bits<3> f, string opcodestr>
+ : RVPWideningShift_ri<f, opcodestr, uimm6> {
+ bits<6> shamt;
+
+ let Inst{26} = 0b1;
+ let Inst{25-20} = shamt;
+}
+
+class RVPWideningShiftH_ri<bits<3> f, string opcodestr>
+ : RVPWideningShift_ri<f, opcodestr, uimm5> {
+ bits<5> shamt;
+
+ let Inst{26-25} = 0b01;
+ let Inst{24-20} = shamt;
+}
+
+class RVPWideningShiftB_ri<bits<3> f, string opcodestr>
+ : RVPWideningShift_ri<f, opcodestr, uimm4> {
+ bits<4> shamt;
+
+ let Inst{26-24} = 0b001;
+ let Inst{23-20} = shamt;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVPNarrowingShift_ri<bits<3> f, string opcodestr, Operand ImmType>
+ : RVPNarrowingBase<f, 0b0, 0b1100, (outs GPR:$rd),
+ (ins GPRPairRV32:$rs1, ImmType:$shamt), opcodestr,
+ "$rd, $rs1, $shamt">;
+
+class RVPNarrowingShiftW_ri<bits<3> f, string opcodestr>
+ : RVPNarrowingShift_ri<f, opcodestr, uimm6> {
+ bits<6> shamt;
+
+ let Inst{26} = 0b1;
+ let Inst{25-20} = shamt;
+}
+
+class RVPNarrowingShiftH_ri<bits<3> f, string opcodestr>
+ : RVPNarrowingShift_ri<f, opcodestr, uimm5> {
+ bits<5> shamt;
+
+ let Inst{26-25} = 0b01;
+ let Inst{24-20} = shamt;
+}
+
+class RVPNarrowingShiftB_ri<bits<3> f, string opcodestr>
+ : RVPNarrowingShift_ri<f, opcodestr, uimm4> {
+ bits<4> shamt;
+
+ let Inst{26-24} = 0b001;
+ let Inst{23-20} = shamt;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVPNarrowingShift_rr<bits<3> f, bits<2> w, string opcodestr>
+ : RVPNarrowingBase<f, 0b1, 0b1100, (outs GPR:$rd),
+ (ins GPRPairRV32:$rs1, GPR:$rs2), opcodestr,
+ "$rd, $rs1, $rs2"> {
+ bits<5> rs2;
+
+ let Inst{26-25} = w;
+ let Inst{24-20} = rs2;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVPWideningShift_rr<bits<3> f, bits<2> w, string opcodestr>
+ : RVPWideningBase<w, 0b0, (outs GPRPairRV32:$rd), (ins GPR:$rs1, GPR:$rs2),
+ opcodestr> {
+ let Inst{30-28} = f;
+ let Inst{27} = 0b1;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr>
: RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), (ins GPR:$rs1),
opcodestr, "$rd, $rs1"> {
@@ -169,6 +297,24 @@ class RVPBinary_rr<bits<4> f, bits<2> w, bits<3> funct3, string opcodestr>
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVPWideningBinary_rr<bits<4> f, bits<2> w, string opcodestr>
+ : RVPWideningBase<w, 0b1, (outs GPRPairRV32:$rd), (ins GPR:$rs1, GPR:$rs2),
+ opcodestr> {
+ let Inst{30-27} = f;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVPNarrowingBinary_rr<bits<3> f, bits<2> w, string opcodestr>
+ : RVPNarrowingBase<f, 0b1, 0b0100, (outs GPR:$rd),
+ (ins GPRPairRV32:$rs1, GPR:$rs2), opcodestr,
+ "$rd, $rs1, $rs2"> {
+ bits<5> rs2;
+
+ let Inst{26-25} = w;
+ let Inst{24-20} = rs2;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVPTernary_rrr<bits<4> f, bits<2> w, bits<3> funct3, string opcodestr>
: RVInstRBase<funct3, OPC_OP_32, (outs GPR:$rd_wb),
(ins GPR:$rd, GPR:$rs1, GPR:$rs2), opcodestr,
@@ -180,6 +326,15 @@ class RVPTernary_rrr<bits<4> f, bits<2> w, bits<3> funct3, string opcodestr>
let Constraints = "$rd = $rd_wb";
}
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVPWideningTernary_rrr<bits<4> f, bits<2> w, string opcodestr>
+ : RVPWideningBase<w, 0b1, (outs GPRPairRV32:$rd_wb),
+ (ins GPR:$rd, GPR:$rs1, GPR:$rs2), opcodestr> {
+ let Inst{30-27} = f;
+
+ let Constraints = "$rd = $rd_wb";
+}
+
// Common base for pli.db/h/w and plui.dh/w
class RVPPairLoadImm_i<bits<7> funct7, dag ins, string opcodestr,
string argstr>
@@ -889,3 +1044,156 @@ let Predicates = [HasStdExtP, IsRV32] in {
let Inst{23-15} = imm10{9-1};
}
}
+
+let Predicates = [HasStdExtP, IsRV32] in {
+ def PWSLLI_B : RVPWideningShiftB_ri<0b000, "pwslli.b">;
+ def PWSLLI_H : RVPWideningShiftH_ri<0b000, "pwslli.h">;
+ def WSLLI : RVPWideningShiftW_ri<0b000, "wslli">;
+
+ def PWSLAI_B : RVPWideningShiftB_ri<0b100, "pwslai.b">;
+ def PWSLAI_H : RVPWideningShiftH_ri<0b100, "pwslai.h">;
+ def WSLAI : RVPWideningShiftW_ri<0b100, "wslai">;
+
+ def PWSLL_BS : RVPWideningShift_rr<0b000, 0b00, "pwsll.bs">;
+ def PWSLL_HS : RVPWideningShift_rr<0b000, 0b01, "pwsll.hs">;
+ def WSLL : RVPWideningShift_rr<0b000, 0b11, "wsll">;
+
+ def PWSLA_BS : RVPWideningShift_rr<0b100, 0b00, "pwsla.bs">;
+ def PWSLA_HS : RVPWideningShift_rr<0b100, 0b01, "pwsla.hs">;
+ def WSLA : RVPWideningShift_rr<0b100, 0b11, "wsla">;
+
+ def WZIP8P : RVPWideningShift_rr<0b111, 0b00, "wzip8p">;
+ def WZIP16P : RVPWideningShift_rr<0b111, 0b01, "wzip16p">;
+
+ def PWADD_H : RVPWideningBinary_rr<0b0000, 0b00, "pwadd.h">;
+ def WADD : RVPWideningBinary_rr<0b0000, 0b01, "wadd">;
+ def PWADD_B : RVPWideningBinary_rr<0b0000, 0b10, "pwadd.b">;
+ def PM2WADD_H : RVPWideningBinary_rr<0b0000, 0b11, "pm2wadd.h">;
+
+ def PWADDA_H : RVPWideningTernary_rrr<0b0001, 0b00, "pwadda.h">;
+ def WADDA : RVPWideningTernary_rrr<0b0001, 0b01, "wadda">;
+ def PWADDA_B : RVPWideningTernary_rrr<0b0001, 0b10, "pwadda.b">;
+ def PM2WADDA_H : RVPWideningTernary_rrr<0b0001, 0b11, "pm2wadda.h">;
+
+ def PWADDU_H : RVPWideningBinary_rr<0b0010, 0b00, "pwaddu.h">;
+ def WADDU : RVPWideningBinary_rr<0b0010, 0b01, "waddu">;
+ def PWADDU_B : RVPWideningBinary_rr<0b0010, 0b10, "pwaddu.b">;
+ def PM2WADD_HX : RVPWideningBinary_rr<0b0010, 0b11, "pm2wadd.hx">;
+
+ def PWADDAU_H : RVPWideningTernary_rrr<0b0011, 0b00, "pwaddau.h">;
+ def WADDAU : RVPWideningTernary_rrr<0b0011, 0b01, "waddau">;
+ def PWADDAU_B : RVPWideningTernary_rrr<0b0011, 0b10, "pwaddau.b">;
+ def PM2WADDA_HX : RVPWideningTernary_rrr<0b0011, 0b11, "pm2wadda.hx">;
+
+ def PWMUL_H : RVPWideningBinary_rr<0b0100, 0b00, "pwmul.h">;
+ def WMUL : RVPWideningBinary_rr<0b0100, 0b01, "wmul">;
+ def PWMUL_B : RVPWideningBinary_rr<0b0100, 0b10, "pwmul.b">;
+ def PM2WADDU_H : RVPWideningBinary_rr<0b0100, 0b11, "pm2waddu.h">;
+
+ def PWMACC_H : RVPWideningTernary_rrr<0b0101, 0b00, "pwmacc.h">;
+ def WMACC : RVPWideningTernary_rrr<0b0101, 0b01, "wmacc">;
+ def PM2WADDAU_H : RVPWideningTernary_rrr<0b0101, 0b11, "pm2waddau.h">;
+
+ def PWMULU_H : RVPWideningBinary_rr<0b0110, 0b00, "pwmulu.h">;
+ def WMULU : RVPWideningBinary_rr<0b0110, 0b01, "wmulu">;
+ def PWMULU_B : RVPWideningBinary_rr<0b0110, 0b10, "pwmulu.b">;
+
+ def PWMACCU_H : RVPWideningTernary_rrr<0b0111, 0b00, "pwmaccu.h">;
+ def WMACCU : RVPWideningTernary_rrr<0b0111, 0b01, "wmaccu">;
+
+ def PWSUB_H : RVPWideningBinary_rr<0b1000, 0b00, "pwsub.h">;
+ def WSUB : RVPWideningBinary_rr<0b1000, 0b01, "wsub">;
+ def PWSUB_B : RVPWideningBinary_rr<0b1000, 0b10, "pwsub.b">;
+ def PM2WSUB_H : RVPWideningBinary_rr<0b1000, 0b11, "pm2wsub.h">;
+
+ def PWSUBA_H : RVPWideningTernary_rrr<0b1001, 0b00, "pwsuba.h">;
+ def WSUBA : RVPWideningTernary_rrr<0b1001, 0b01, "wsuba">;
+ def PWSUBA_B : RVPWideningTernary_rrr<0b1001, 0b10, "pwsuba.b">;
+ def PM2WSUBA_H : RVPWideningTernary_rrr<0b1001, 0b11, "pm2wsuba.h">;
+
+ def PWSUBU_H : RVPWideningBinary_rr<0b1010, 0b00, "pwsubu.h">;
+ def WSUBU : RVPWideningBinary_rr<0b1010, 0b01, "wsubu">;
+ def PWSUBU_B : RVPWideningBinary_rr<0b1010, 0b10, "pwsubu.b">;
+ def PM2WSUB_HX : RVPWideningBinary_rr<0b1010, 0b11, "pm2wsub.hx">;
+
+ def PWSUBAU_H : RVPWideningTernary_rrr<0b1011, 0b00, "pwsubau.h">;
+ def WSUBAU : RVPWideningTernary_rrr<0b1011, 0b01, "wsubau">;
+ def PWSUBAU_B : RVPWideningTernary_rrr<0b1011, 0b10, "pwsubau.b">;
+ def PM2WSUBA_HX : RVPWideningTernary_rrr<0b1011, 0b11, "pm2wsuba.hx">;
+
+ def PWMULSU_H : RVPWideningBinary_rr<0b1100, 0b00, "pwmulsu.h">;
+ def WMULSU : RVPWideningBinary_rr<0b1100, 0b01, "wmulsu">;
+ def PWMULSU_B : RVPWideningBinary_rr<0b1100, 0b10, "pwmulsu.b">;
+ def PM2WADDSU_H : RVPWideningBinary_rr<0b1100, 0b11, "pm2waddsu.h">;
+
+ def PWMACCSU_H : RVPWideningTernary_rrr<0b1101, 0b00, "pwmaccsu.h">;
+ def WMACCSU : RVPWideningTernary_rrr<0b1101, 0b01, "wmaccsu">;
+ def PM2WADDASU_H : RVPWideningTernary_rrr<0b1101, 0b11, "pm2waddasu.h">;
+
+ def PMQWACC_H : RVPWideningTernary_rrr<0b1111, 0b00, "pmqwacc.h">;
+ def PMQWACC : RVPWideningTernary_rrr<0b1111, 0b01, "pmqwacc">;
+ def PMQRWACC_H : RVPWideningTernary_rrr<0b1111, 0b10, "pmqrwacc.h">;
+ def PMQRWACC : RVPWideningTernary_rrr<0b1111, 0b11, "pmqrwacc">;
+
+ def PREDSUM_DHS : RVPNarrowingBinary_rr<0b001, 0b00, "predsum.dhs">;
+ def PREDSUM_DBS : RVPNarrowingBinary_rr<0b001, 0b10, "predsum.dbs">;
+
+ def PREDSUMU_DHS : RVPNarrowingBinary_rr<0b011, 0b00, "predsumu.dhs">;
+ def PREDSUMU_DBS : RVPNarrowingBinary_rr<0b011, 0b10, "predsumu.dbs">;
+
+ def PNSRLI_B : RVPNarrowingShiftB_ri<0b000, "pnsrli.b">;
+ def PNSRLI_H : RVPNarrowingShiftH_ri<0b000, "pnsrli.h">;
+ def NSRLI : RVPNarrowingShiftW_ri<0b000, "nsrli">;
+
+ def PNCLIPIU_B : RVPNarrowingShiftB_ri<0b010, "pnclipiu.b">;
+ def PNCLIPIU_H : RVPNarrowingShiftH_ri<0b010, "pnclipiu.h">;
+ def NCLIPIU : RVPNarrowingShiftW_ri<0b010, "nclipiu">;
+
+ def PNCLIPRIU_B : RVPNarrowingShiftB_ri<0b011, "pnclipriu.b">;
+ def PNCLIPRIU_H : RVPNarrowingShiftH_ri<0b011, "pnclipriu.h">;
+ def NCLIPRIU : RVPNarrowingShiftW_ri<0b011, "nclipriu">;
+
+ def PNSRAI_B : RVPNarrowingShiftB_ri<0b100, "pnsrai.b">;
+ def PNSRAI_H : RVPNarrowingShiftH_ri<0b100, "pnsrai.h">;
+ def NSRAI : RVPNarrowingShiftW_ri<0b100, "nsrai">;
+
+ def PNSARI_B : RVPNarrowingShiftB_ri<0b101, "pnsari.b">;
+ def PNSARI_H : RVPNarrowingShiftH_ri<0b101, "pnsari.h">;
+ def NSARI : RVPNarrowingShiftW_ri<0b101, "nsari">;
+
+ def PNCLIPI_B : RVPNarrowingShiftB_ri<0b110, "pnclipi.b">;
+ def PNCLIPI_H : RVPNarrowingShiftH_ri<0b110, "pnclipi.h">;
+ def NCLIPI : RVPNarrowingShiftW_ri<0b110, "nclipi">;
+
+ def PNCLIPRI_B : RVPNarrowingShiftB_ri<0b111, "pnclipri.b">;
+ def PNCLIPRI_H : RVPNarrowingShiftH_ri<0b111, "pnclipri.h">;
+ def NCLIPRI : RVPNarrowingShiftW_ri<0b111, "nclipri">;
+
+ def PNSRL_BS : RVPNarrowingShift_rr<0b000, 0b00, "pnsrl.bs">;
+ def PNSRL_HS : RVPNarrowingShift_rr<0b000, 0b01, "pnsrl.hs">;
+ def NSRL : RVPNarrowingShift_rr<0b000, 0b11, "nsrl">;
+
+ def PNCLIPU_BS : RVPNarrowingShift_rr<0b010, 0b00, "pnclipu.bs">;
+ def PNCLIPU_HS : RVPNarrowingShift_rr<0b010, 0b01, "pnclipu.hs">;
+ def NCLIPU : RVPNarrowingShift_rr<0b010, 0b11, "nclipu">;
+
+ def PNCLIPRU_BS : RVPNarrowingShift_rr<0b011, 0b00, "pnclipru.bs">;
+ def PNCLIPRU_HS : RVPNarrowingShift_rr<0b011, 0b01, "pnclipru.hs">;
+ def NCLIPRU : RVPNarrowingShift_rr<0b011, 0b11, "nclipru">;
+
+ def PNSRA_BS : RVPNarrowingShift_rr<0b100, 0b00, "pnsra.bs">;
+ def PNSRA_HS : RVPNarrowingShift_rr<0b100, 0b01, "pnsra.hs">;
+ def NSRA : RVPNarrowingShift_rr<0b100, 0b11, "nsra">;
+
+ def PNSRAR_BS : RVPNarrowingShift_rr<0b101, 0b00, "pnsrar.bs">;
+ def PNSRAR_HS : RVPNarrowingShift_rr<0b101, 0b01, "pnsrar.hs">;
+ def NSRAR : RVPNarrowingShift_rr<0b101, 0b11, "nsrar">;
+
+ def PNCLIP_BS : RVPNarrowingShift_rr<0b110, 0b00, "pnclip.bs">;
+ def PNCLIP_HS : RVPNarrowingShift_rr<0b110, 0b01, "pnclip.hs">;
+ def NCLIP : RVPNarrowingShift_rr<0b110, 0b11, "nclip">;
+
+ def PNCLIPR_BS : RVPNarrowingShift_rr<0b111, 0b00, "pnclipr.bs">;
+ def PNCLIPR_HS : RVPNarrowingShift_rr<0b111, 0b01, "pnclipr.hs">;
+ def NCLIPR : RVPNarrowingShift_rr<0b111, 0b11, "nclipr">;
+} // Predicates = [HasStdExtP, IsRV32]
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
index 32f533b8f114..f732ab13e5f8 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
@@ -44,153 +44,95 @@ def PseudoCCMOVGPRNoX0 : Pseudo<(outs GPRNoX0:$dst),
Sched<[]>;
}
+class SFBALU_rr
+ : Pseudo<(outs GPR:$dst),
+ (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1,
+ GPR:$rs2), []>,
+ Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU,
+ ReadSFBALU]> {
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let Size = 8;
+ let Constraints = "$dst = $falsev";
+}
+
+class SFBALU_ri
+ : Pseudo<(outs GPR:$dst),
+ (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1,
+ simm12:$imm), []>,
+ Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU]> {
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let Size = 8;
+ let Constraints = "$dst = $falsev";
+}
+
+class SFBShift_ri
+ : Pseudo<(outs GPR:$dst),
+ (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1,
+ uimmlog2xlen:$imm), []>,
+ Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU]> {
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let Size = 8;
+ let Constraints = "$dst = $falsev";
+}
+
+class SFBShiftW_ri
+ : Pseudo<(outs GPR:$dst),
+ (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1,
+ uimm5:$imm), []>,
+ Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU]> {
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let Size = 8;
+ let Constraints = "$dst = $falsev";
+}
+
// Conditional binops, that updates update $dst to (op rs1, rs2) when condition
// is true. Returns $falsev otherwise. Selected by optimizeSelect.
// TODO: Can we use DefaultOperands on the regular binop to accomplish this more
// like how ARM does predication?
-let Predicates = [HasShortForwardBranchOpt], hasSideEffects = 0,
- mayLoad = 0, mayStore = 0, Size = 8, Constraints = "$dst = $falsev" in {
-def PseudoCCADD : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
- ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
-def PseudoCCSUB : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
- ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
-def PseudoCCSLL : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
- ReadSFBALU, ReadSFBALU]>;
-def PseudoCCSRL : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
- ReadSFBALU, ReadSFBALU]>;
-def PseudoCCSRA : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
- ReadSFBALU, ReadSFBALU]>;
-def PseudoCCAND : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
- ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
-def PseudoCCOR : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
- ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
-def PseudoCCXOR : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
- ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
+let Predicates = [HasShortForwardBranchOpt] in {
+def PseudoCCADD : SFBALU_rr;
+def PseudoCCSUB : SFBALU_rr;
+def PseudoCCSLL : SFBALU_rr;
+def PseudoCCSRL : SFBALU_rr;
+def PseudoCCSRA : SFBALU_rr;
+def PseudoCCAND : SFBALU_rr;
+def PseudoCCOR : SFBALU_rr;
+def PseudoCCXOR : SFBALU_rr;
-def PseudoCCADDI : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, simm12:$rs2), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
- ReadSFBALU]>;
-def PseudoCCSLLI : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, uimmlog2xlen:$shamt), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
- ReadSFBALU]>;
-def PseudoCCSRLI : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, uimmlog2xlen:$shamt), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
- ReadSFBALU]>;
-def PseudoCCSRAI : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, uimmlog2xlen:$shamt), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
- ReadSFBALU]>;
-def PseudoCCANDI : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, simm12:$rs2), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
- ReadSFBALU]>;
-def PseudoCCORI : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, simm12:$rs2), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
- ReadSFBALU]>;
-def PseudoCCXORI : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, simm12:$rs2), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
- ReadSFBALU]>;
+def PseudoCCADDI : SFBALU_ri;
+def PseudoCCANDI : SFBALU_ri;
+def PseudoCCORI : SFBALU_ri;
+def PseudoCCXORI : SFBALU_ri;
+
+def PseudoCCSLLI : SFBShift_ri;
+def PseudoCCSRLI : SFBShift_ri;
+def PseudoCCSRAI : SFBShift_ri;
// RV64I instructions
-def PseudoCCADDW : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
- ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
-def PseudoCCSUBW : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
- ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
-def PseudoCCSLLW : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
- ReadSFBALU, ReadSFBALU]>;
-def PseudoCCSRLW : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
- ReadSFBALU, ReadSFBALU]>;
-def PseudoCCSRAW : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
- ReadSFBALU, ReadSFBALU]>;
+def PseudoCCADDW : SFBALU_rr;
+def PseudoCCSUBW : SFBALU_rr;
+def PseudoCCSLLW : SFBALU_rr;
+def PseudoCCSRLW : SFBALU_rr;
+def PseudoCCSRAW : SFBALU_rr;
+
+def PseudoCCADDIW : SFBALU_ri;
-def PseudoCCADDIW : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, simm12:$rs2), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
- ReadSFBALU]>;
-def PseudoCCSLLIW : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, uimm5:$shamt), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
- ReadSFBALU]>;
-def PseudoCCSRLIW : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, uimm5:$shamt), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
- ReadSFBALU]>;
-def PseudoCCSRAIW : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, uimm5:$shamt), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
- ReadSFBALU]>;
+def PseudoCCSLLIW : SFBShiftW_ri;
+def PseudoCCSRLIW : SFBShiftW_ri;
+def PseudoCCSRAIW : SFBShiftW_ri;
// Zbb/Zbkb instructions
-def PseudoCCANDN : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
- ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
-def PseudoCCORN : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
- ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
-def PseudoCCXNOR : Pseudo<(outs GPR:$dst),
- (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
- GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
- Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
- ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
+def PseudoCCANDN : SFBALU_rr;
+def PseudoCCORN : SFBALU_rr;
+def PseudoCCXNOR : SFBALU_rr;
}
let Predicates = [HasShortForwardBranchOpt] in
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index acbccddce2b5..063ee5c5e8b9 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -830,19 +830,6 @@ multiclass VPatTiedBinaryNoMaskVL_V<SDNode vop,
result_reg_class:$rs1,
op2_reg_class:$rs2,
GPR:$vl, sew, TAIL_AGNOSTIC)>;
- // Tail undisturbed
- def : Pat<(riscv_vmerge_vl true_mask,
- (result_type (vop
- result_reg_class:$rs1,
- (op2_type op2_reg_class:$rs2),
- srcvalue,
- true_mask,
- VLOpFrag)),
- result_reg_class:$rs1, result_reg_class:$rs1, VLOpFrag),
- (!cast<Instruction>(instruction_name#"_"#suffix#"_"# vlmul.MX#"_TIED")
- result_reg_class:$rs1,
- op2_reg_class:$rs2,
- GPR:$vl, sew, TU_MU)>;
}
class VPatTiedBinaryMaskVL_V<SDNode vop,
@@ -892,22 +879,6 @@ multiclass VPatTiedBinaryNoMaskVL_V_RM<SDNode vop,
// RISCVInsertReadWriteCSR
FRM_DYN,
GPR:$vl, log2sew, TAIL_AGNOSTIC)>;
- // Tail undisturbed
- def : Pat<(riscv_vmerge_vl true_mask,
- (result_type (vop
- result_reg_class:$rs1,
- (op2_type op2_reg_class:$rs2),
- srcvalue,
- true_mask,
- VLOpFrag)),
- result_reg_class:$rs1, result_reg_class:$rs1, VLOpFrag),
- (!cast<Instruction>(name)
- result_reg_class:$rs1,
- op2_reg_class:$rs2,
- // Value to indicate no rounding mode change in
- // RISCVInsertReadWriteCSR
- FRM_DYN,
- GPR:$vl, log2sew, TU_MU)>;
}
class VPatBinaryVL_XI<SDPatternOperator vop,
@@ -1755,50 +1726,6 @@ multiclass VPatMultiplyAddVL_VV_VX<SDNode op, string instruction_name> {
}
}
-multiclass VPatMultiplyAccVL_VV_VX<PatFrag op, string instruction_name> {
- foreach vti = AllIntegerVectors in {
- defvar suffix = vti.LMul.MX;
- let Predicates = GetVTypePredicates<vti>.Predicates in {
- def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
- (vti.Vector (op vti.RegClass:$rd,
- (riscv_mul_vl_oneuse vti.RegClass:$rs1, vti.RegClass:$rs2,
- srcvalue, (vti.Mask true_mask), VLOpFrag),
- srcvalue, (vti.Mask true_mask), VLOpFrag)),
- vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag),
- (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
- vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
- (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TU_MU)>;
- def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
- (vti.Vector (op vti.RegClass:$rd,
- (riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1), vti.RegClass:$rs2,
- srcvalue, (vti.Mask true_mask), VLOpFrag),
- srcvalue, (vti.Mask true_mask), VLOpFrag)),
- vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag),
- (!cast<Instruction>(instruction_name#"_VX_"# suffix #"_MASK")
- vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
- (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TU_MU)>;
- def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
- (vti.Vector (op vti.RegClass:$rd,
- (riscv_mul_vl_oneuse vti.RegClass:$rs1, vti.RegClass:$rs2,
- srcvalue, (vti.Mask true_mask), VLOpFrag),
- srcvalue, (vti.Mask true_mask), VLOpFrag)),
- vti.RegClass:$rd, undef, VLOpFrag),
- (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
- vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
- (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
- def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
- (vti.Vector (op vti.RegClass:$rd,
- (riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1), vti.RegClass:$rs2,
- srcvalue, (vti.Mask true_mask), VLOpFrag),
- srcvalue, (vti.Mask true_mask), VLOpFrag)),
- vti.RegClass:$rd, undef, VLOpFrag),
- (!cast<Instruction>(instruction_name#"_VX_"# suffix #"_MASK")
- vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
- (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
- }
- }
-}
-
multiclass VPatWidenMultiplyAddVL_VV_VX<SDNode vwmacc_op, string instr_name> {
foreach vtiTowti = AllWidenableIntVectors in {
defvar vti = vtiTowti.Vti;
@@ -1898,82 +1825,6 @@ multiclass VPatFPMulAddVL_VV_VF_RM<SDPatternOperator vop, string instruction_nam
}
}
-multiclass VPatFPMulAccVL_VV_VF_RM<PatFrag vop, string instruction_name> {
- foreach vti = AllFloatVectors in {
- defvar suffix = vti.LMul.MX # "_E" # vti.SEW;
- let Predicates = GetVTypePredicates<vti>.Predicates in {
- def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
- (vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2,
- vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
- vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag),
- (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
- vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
- (vti.Mask VMV0:$vm),
- // Value to indicate no rounding mode change in
- // RISCVInsertReadWriteCSR
- FRM_DYN,
- GPR:$vl, vti.Log2SEW, TU_MU)>;
- def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
- (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2,
- vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
- vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag),
- (!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK")
- vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
- (vti.Mask VMV0:$vm),
- // Value to indicate no rounding mode change in
- // RISCVInsertReadWriteCSR
- FRM_DYN,
- GPR:$vl, vti.Log2SEW, TU_MU)>;
- def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
- (vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2,
- vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
- vti.RegClass:$rd, undef, VLOpFrag),
- (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
- vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
- (vti.Mask VMV0:$vm),
- // Value to indicate no rounding mode change in
- // RISCVInsertReadWriteCSR
- FRM_DYN,
- GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
- def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
- (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2,
- vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
- vti.RegClass:$rd, undef, VLOpFrag),
- (!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK")
- vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
- (vti.Mask VMV0:$vm),
- // Value to indicate no rounding mode change in
- // RISCVInsertReadWriteCSR
- FRM_DYN,
- GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
- }
- }
-}
-
-multiclass VPatWidenFPMulAccVL_VV_VF<SDNode vop, string instruction_name> {
- foreach vtiToWti = AllWidenableFloatVectors in {
- defvar vti = vtiToWti.Vti;
- defvar wti = vtiToWti.Wti;
- let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
- GetVTypePredicates<wti>.Predicates) in {
- def : Pat<(vop (vti.Vector vti.RegClass:$rs1),
- (vti.Vector vti.RegClass:$rs2),
- (wti.Vector wti.RegClass:$rd), (vti.Mask VMV0:$vm),
- VLOpFrag),
- (!cast<Instruction>(instruction_name#"_VV_"#vti.LMul.MX #"_MASK")
- wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
- (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>;
- def : Pat<(vop (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs1)),
- (vti.Vector vti.RegClass:$rs2),
- (wti.Vector wti.RegClass:$rd), (vti.Mask VMV0:$vm),
- VLOpFrag),
- (!cast<Instruction>(instruction_name#"_V"#vti.ScalarSuffix#"_"#vti.LMul.MX #"_MASK")
- wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
- (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>;
- }
- }
-}
-
multiclass VPatWidenFPMulAccVL_VV_VF_RM<SDNode vop, string instruction_name,
list<VTypeInfoToWide> vtiToWtis =
AllWidenableFloatVectors> {
@@ -2331,8 +2182,6 @@ defm : VPatBinaryWVL_VV_VX<riscv_vwmulsu_vl, "PseudoVWMULSU">;
// 11.13 Vector Single-Width Integer Multiply-Add Instructions
defm : VPatMultiplyAddVL_VV_VX<riscv_add_vl, "PseudoVMADD">;
defm : VPatMultiplyAddVL_VV_VX<riscv_sub_vl, "PseudoVNMSUB">;
-defm : VPatMultiplyAccVL_VV_VX<riscv_add_vl_oneuse, "PseudoVMACC">;
-defm : VPatMultiplyAccVL_VV_VX<riscv_sub_vl_oneuse, "PseudoVNMSAC">;
// 11.14. Vector Widening Integer Multiply-Add Instructions
defm : VPatWidenMultiplyAddVL_VV_VX<riscv_vwmacc_vl, "PseudoVWMACC">;
@@ -2470,10 +2319,6 @@ defm : VPatFPMulAddVL_VV_VF_RM<any_riscv_vfmadd_vl, "PseudoVFMADD">;
defm : VPatFPMulAddVL_VV_VF_RM<any_riscv_vfmsub_vl, "PseudoVFMSUB">;
defm : VPatFPMulAddVL_VV_VF_RM<any_riscv_vfnmadd_vl, "PseudoVFNMADD">;
defm : VPatFPMulAddVL_VV_VF_RM<any_riscv_vfnmsub_vl, "PseudoVFNMSUB">;
-defm : VPatFPMulAccVL_VV_VF_RM<riscv_vfmadd_vl_oneuse, "PseudoVFMACC">;
-defm : VPatFPMulAccVL_VV_VF_RM<riscv_vfmsub_vl_oneuse, "PseudoVFMSAC">;
-defm : VPatFPMulAccVL_VV_VF_RM<riscv_vfnmadd_vl_oneuse, "PseudoVFNMACC">;
-defm : VPatFPMulAccVL_VV_VF_RM<riscv_vfnmsub_vl_oneuse, "PseudoVFNMSAC">;
// 13.7. Vector Widening Floating-Point Fused Multiply-Add Instructions
defm : VPatWidenFPMulAccVL_VV_VF_RM<riscv_vfwmadd_vl, "PseudoVFWMACC">;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td
index 889ea9802257..d615094329b2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td
@@ -125,10 +125,25 @@ class Mips_prefetch_ri<dag outs, dag ins, string opcodestr, string argstr>
let Inst{6-0} = OPC_CUSTOM_0.Value;
}
+// MIPS Custom Barrier Insns Format.
+let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
+class MIPSExtInst_ri<bits<6> shimm5, string opcodestr>
+ : RVInstIShift<0b00000, 0b001, OPC_OP_IMM, (outs), (ins), opcodestr, ""> {
+ let shamt = shimm5;
+ let rd = 0;
+ let rs1 = 0;
+}
+
//===----------------------------------------------------------------------===//
// MIPS extensions
//===----------------------------------------------------------------------===//
-let Predicates = [HasVendorXMIPSCBOP] ,DecoderNamespace = "Xmipscbop" in {
+let Predicates = [HasVendorXMIPSEXECTL], DecoderNamespace = "XMIPS" in {
+ def MIPS_EHB : MIPSExtInst_ri<0b000011, "mips.ehb">;
+ def MIPS_IHB : MIPSExtInst_ri<0b000001, "mips.ihb">;
+ def MIPS_PAUSE : MIPSExtInst_ri<0b000101, "mips.pause">;
+}
+
+let Predicates = [HasVendorXMIPSCBOP], DecoderNamespace = "XMIPS" in {
def MIPS_PREF : Mips_prefetch_ri<(outs), (ins GPR:$rs1, uimm9:$imm9, uimm5:$hint),
"mips.pref", "$hint, ${imm9}(${rs1})">,
Sched<[]>;
@@ -146,7 +161,7 @@ let Predicates = [HasVendorXMIPSCBOP] in {
}
let Predicates = [HasVendorXMIPSCMov], hasSideEffects = 0, mayLoad = 0, mayStore = 0,
- DecoderNamespace = "Xmipscmov" in {
+ DecoderNamespace = "XMIPS" in {
def MIPS_CCMOV : RVInstR4<0b11, 0b011, OPC_CUSTOM_0, (outs GPR:$rd),
(ins GPR:$rs1, GPR:$rs2, GPR:$rs3),
"mips.ccmov", "$rd, $rs2, $rs1, $rs3">,
@@ -166,7 +181,7 @@ def : Pat<(select (XLenVT GPR:$rs2), (XLenVT GPR:$rs1), (XLenVT GPR:$rs3)),
}
let Predicates = [HasVendorXMIPSLSP], hasSideEffects = 0,
- DecoderNamespace = "Xmipslsp" in {
+ DecoderNamespace = "XMIPS" in {
let mayLoad = 1, mayStore = 0 in {
def MIPS_LWP : LWPFormat<(outs GPR:$rd1, GPR:$rd2), (ins GPR:$rs1, uimm7_lsb00:$imm7),
"mips.lwp", "$rd1, $rd2, ${imm7}(${rs1})">,
@@ -184,4 +199,4 @@ def MIPS_SDP : SDPFormat<(outs), (ins GPR:$rs2, GPR:$rs3, GPR:$rs1, uimm7_lsb000
"mips.sdp", "$rs2, $rs3, ${imm7}(${rs1})">,
Sched<[WriteSTD, ReadStoreData, ReadStoreData, ReadMemBase]>;
} // mayLoad = 0, mayStore = 1
-} // Predicates = [HasVendorXMIPSLSP], hasSideEffects = 0, DecoderNamespace = "Xmipslsp"
+} // Predicates = [HasVendorXMIPSLSP], hasSideEffects = 0, DecoderNamespace = "XMIPS"
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 2c64b0c220fb..69796a68ecd6 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -22,6 +22,15 @@ def SDT_SetMultiple : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>,
def qc_setwmi : RVSDNode<"QC_SETWMI", SDT_SetMultiple,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def qc_insb : RVSDNode<"QC_INSB", SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisVT<0, i32>,
+ SDTCisInt<3>,
+ SDTCisInt<4>]>,
+ []>;
+
+def qc_e_li : RVSDNode<"QC_E_LI", SDTIntUnaryOp>;
+
def uimm5nonzero : RISCVOp<XLenVT>,
ImmLeaf<XLenVT, [{return (Imm != 0) && isUInt<5>(Imm);}]> {
let ParserMatchClass = UImmAsmOperand<5, "NonZero">;
@@ -1508,6 +1517,11 @@ def : Pat<(i32 (and GPRNoX0:$rs, 1023)), (QC_EXTU GPRNoX0:$rs, 10, 0)>;
def : Pat<(i32 (and GPRNoX0:$rs, 2047)), (QC_EXTU GPRNoX0:$rs, 11, 0)>;
def : Pat<(i32 (bitreverse GPRNoX0:$rs1)), (QC_BREV32 GPRNoX0:$rs1)>;
+
+def : Pat<(qc_insb GPRNoX0:$rd, simm5:$imm5, uimm5_plus1:$width, uimm5:$shamt),
+ (QC_INSBI GPRNoX0:$rd, simm5:$imm5, uimm5_plus1:$width, uimm5:$shamt)>;
+def : Pat<(qc_insb GPRNoX0:$rd, GPR:$rs1, uimm5_plus1:$width, uimm5:$shamt),
+ (QC_INSB GPRNoX0:$rd, GPR:$rs1, uimm5_plus1:$width, uimm5:$shamt)>;
} // Predicates = [HasVendorXqcibm, IsRV32]
// If Zbb is enabled sext.b/h is preferred since they are compressible
@@ -1605,6 +1619,13 @@ def : Pat<(qc_setwmi GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uim
(QC_SETWMI GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7)>;
} // Predicates = [HasVendorXqcilsm, IsRV32]
+let Predicates = [HasVendorXqcili, IsRV32] in {
+def: Pat<(qc_e_li tglobaladdr:$A), (QC_E_LI bare_simm32:$A)>;
+def: Pat<(qc_e_li tblockaddress:$A), (QC_E_LI bare_simm32:$A)>;
+def: Pat<(qc_e_li tjumptable:$A), (QC_E_LI bare_simm32:$A)>;
+def: Pat<(qc_e_li tconstpool:$A), (QC_E_LI bare_simm32:$A)>;
+} // Predicates = [HasVendorXqcili, IsRV32]
+
//===----------------------------------------------------------------------===/i
// Compress Instruction tablegen backend.
//===----------------------------------------------------------------------===//
@@ -1738,10 +1759,19 @@ def : CompressPat<(QC_E_XORAI GPRNoX0:$rd, simm12:$imm),
(XORI GPRNoX0:$rd, GPRNoX0:$rd, simm12:$imm)>;
} // let isCompressOnly = true, Predicates = [HasVendorXqcilia, IsRV32]
-let Predicates = [HasVendorXqciac, IsRV32] in {
+let isCompressOnly = true, Predicates = [HasVendorXqciac, IsRV32] in {
def : CompressPat<(QC_MULIADD GPRC:$rd, GPRC:$rs1, uimm5:$imm5),
(QC_C_MULIADD GPRC:$rd, GPRC:$rs1, uimm5:$imm5)>;
-}
+} // isCompressOnly = true, Predicates = [HasVendorXqciac, IsRV32]
+
+let isCompressOnly = true, Predicates = [HasVendorXqciac, HasStdExtZba, IsRV32] in {
+def : CompressPat<(SH1ADD GPRC:$rd, GPRC:$rs1, GPRC:$rd),
+ (QC_C_MULIADD GPRC:$rd, GPRC:$rs1, 2)>;
+def : CompressPat<(SH2ADD GPRC:$rd, GPRC:$rs1, GPRC:$rd),
+ (QC_C_MULIADD GPRC:$rd, GPRC:$rs1, 4)>;
+def : CompressPat<(SH3ADD GPRC:$rd, GPRC:$rs1, GPRC:$rd),
+ (QC_C_MULIADD GPRC:$rd, GPRC:$rs1, 8)>;
+} // isCompressOnly = true, Predicates = [HasVendorXqciac, HasStdExtZba, IsRV32]
let isCompressOnly = true, Predicates = [HasVendorXqcibi, IsRV32] in {
def : CompressPat<(QC_E_BEQI GPRNoX0:$rs1, simm5nonzero:$imm5, bare_simm13_lsb0:$imm12),
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXwch.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXwch.td
index a43cbadf6f30..bb1862cc88d6 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXwch.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXwch.td
@@ -106,6 +106,7 @@ def QK_C_LBUSP : QKStackInst<0b00, (outs GPRC:$rd_rs2),
(ins SPMem:$rs1, uimm4:$imm),
"qk.c.lbusp", "$rd_rs2, ${imm}(${rs1})">,
Sched<[WriteLDB, ReadMemBase]> {
+ bits<0> rs1;
bits<4> imm;
let Inst{10-7} = imm;
}
@@ -115,6 +116,7 @@ def QK_C_SBSP : QKStackInst<0b10, (outs),
uimm4:$imm),
"qk.c.sbsp", "$rd_rs2, ${imm}(${rs1})">,
Sched<[WriteSTB, ReadStoreData, ReadMemBase]> {
+ bits<0> rs1;
bits<4> imm;
let Inst{10-7} = imm;
}
@@ -124,6 +126,7 @@ def QK_C_LHUSP : QKStackInst<0b01, (outs GPRC:$rd_rs2),
(ins SPMem:$rs1, uimm5_lsb0:$imm),
"qk.c.lhusp", "$rd_rs2, ${imm}(${rs1})">,
Sched<[WriteLDH, ReadMemBase]> {
+ bits<0> rs1;
bits<5> imm;
let Inst{10-8} = imm{3-1};
let Inst{7} = imm{4};
@@ -133,6 +136,7 @@ def QK_C_SHSP : QKStackInst<0b11, (outs),
(ins GPRC:$rd_rs2, SPMem:$rs1, uimm5_lsb0:$imm),
"qk.c.shsp", "$rd_rs2, ${imm}(${rs1})">,
Sched<[WriteSTH, ReadStoreData, ReadMemBase]> {
+ bits<0> rs1;
bits<5> imm;
let Inst{10-8} = imm{3-1};
let Inst{7} = imm{4};
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index 2abd3e613a03..a2b4302e19ed 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -459,15 +459,15 @@ let Predicates = [HasStdExtZba, IsRV64] in {
def : InstAlias<"zext.w $rd, $rs", (ADD_UW GPR:$rd, GPR:$rs, X0)>;
} // Predicates = [HasStdExtZba, IsRV64]
-let Predicates = [HasStdExtZbb] in {
+let Predicates = [HasStdExtZbbOrZbkb] in {
def : InstAlias<"ror $rd, $rs1, $shamt",
- (RORI GPR:$rd, GPR:$rs1, uimmlog2xlen:$shamt), 0>;
-} // Predicates = [HasStdExtZbb]
+ (RORI GPR:$rd, GPR:$rs1, uimmlog2xlen:$shamt), 0>;
+} // Predicates = [HasStdExtZbbOrZbkb]
-let Predicates = [HasStdExtZbb, IsRV64] in {
+let Predicates = [HasStdExtZbbOrZbkb, IsRV64] in {
def : InstAlias<"rorw $rd, $rs1, $shamt",
- (RORIW GPR:$rd, GPR:$rs1, uimm5:$shamt), 0>;
-} // Predicates = [HasStdExtZbb, IsRV64]
+ (RORIW GPR:$rd, GPR:$rs1, uimm5:$shamt), 0>;
+} // Predicates = [HasStdExtZbbOrZbkb, IsRV64]
let Predicates = [HasStdExtZbs] in {
def : InstAlias<"bset $rd, $rs1, $shamt",
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td
index 32e7f962aa2a..76dc027ffd1d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td
@@ -22,5 +22,5 @@ class CMOPInst<bits<3> imm3, string opcodestr>
foreach n = [1, 3, 5, 7, 9, 11, 13, 15] in {
let Predicates = [HasStdExtZcmop] in
- def C_MOP # n : CMOPInst<!srl(n, 1), "c.mop." # n>, Sched<[]>;
+ def C_MOP_ # n : CMOPInst<!srl(n, 1), "c.mop." # n>, Sched<[]>;
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZicfiss.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZicfiss.td
index 49a57f86cccd..50ebaa995197 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZicfiss.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZicfiss.td
@@ -62,6 +62,21 @@ defm SSAMOSWAP_W : AMO_rr_aq_rl<0b01001, 0b010, "ssamoswap.w">;
let Predicates = [HasStdExtZicfiss, IsRV64] in
defm SSAMOSWAP_D : AMO_rr_aq_rl<0b01001, 0b011, "ssamoswap.d">;
+let Predicates = [HasStdExtZimop] in {
+let hasSideEffects = 1, mayLoad = 0, mayStore = 1 in
+def PseudoMOP_SSPUSH : Pseudo<(outs), (ins GPRX1X5:$rs2), []>,
+ PseudoInstExpansion<(MOP_RR_7 X0, X0, GPR:$rs2)>;
+let hasSideEffects = 1, mayLoad = 1, mayStore = 0 in
+def PseudoMOP_SSPOPCHK : Pseudo<(outs), (ins GPRX1X5:$rs1), []>,
+ PseudoInstExpansion<(MOP_R_28 X0, GPR:$rs1)>;
+} // Predicates = [HasStdExtZimop]
+
+let Predicates = [HasStdExtZcmop] in {
+let Uses = [X1], hasSideEffects = 1, mayLoad = 0, mayStore = 1 in
+def PseudoMOP_C_SSPUSH : Pseudo<(outs), (ins), []>,
+ PseudoInstExpansion<(C_MOP_1)>;
+} // Predicates = [HasStdExtZcmop]
+
//===----------------------------------------------------------------------===/
// Compress Instruction tablegen backend.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td
index 960f5669b488..0d08176f9799 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td
@@ -33,13 +33,13 @@ class RVInstRMoprr<bits<4> imm4, bits<3> imm3, bits<3> funct3, RISCVOpcode opcod
}
// May-Be-Operations
-def riscv_mopr : RVSDNode<"MOPR",
- SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
- SDTCisSameAs<0, 2>]>>;
-def riscv_moprr : RVSDNode<"MOPRR",
- SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
- SDTCisSameAs<0, 2>,
- SDTCisSameAs<0, 3>]>>;
+def riscv_mop_r : RVSDNode<"MOP_R",
+ SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>]>>;
+def riscv_mop_rr : RVSDNode<"MOP_RR",
+ SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>]>>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVMopr<bits<7> imm7, bits<5> imm5, bits<3> funct3,
@@ -50,31 +50,32 @@ class RVMopr<bits<7> imm7, bits<5> imm5, bits<3> funct3,
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVMoprr<bits<4> imm4, bits<3> imm3, bits<3> funct3,
RISCVOpcode opcode, string opcodestr>
- : RVInstRMoprr<imm4, imm3, funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2),
+ : RVInstRMoprr<imm4, imm3, funct3, opcode, (outs GPR:$rd),
+ (ins GPR:$rs1, GPR:$rs2),
opcodestr, "$rd, $rs1, $rs2">;
foreach i = 0...31 in {
let Predicates = [HasStdExtZimop] in
- def MOPR#i : RVMopr<0b1000111, i, 0b100, OPC_SYSTEM, "mop.r."#i>,
- Sched<[]>;
+ def MOP_R_#i : RVMopr<0b1000111, i, 0b100, OPC_SYSTEM, "mop.r."#i>,
+ Sched<[]>;
}
foreach i = 0...7 in {
let Predicates = [HasStdExtZimop] in
- def MOPRR#i : RVMoprr<0b1001, i, 0b100, OPC_SYSTEM, "mop.rr."#i>,
+ def MOP_RR_#i : RVMoprr<0b1001, i, 0b100, OPC_SYSTEM, "mop.rr."#i>,
Sched<[]>;
}
let Predicates = [HasStdExtZimop] in {
// Zimop instructions
foreach i = 0...31 in {
- def : Pat<(XLenVT (riscv_mopr GPR:$rs1, (XLenVT i))),
- (!cast<Instruction>("MOPR"#i) GPR:$rs1)>;
+ def : Pat<(XLenVT (riscv_mop_r GPR:$rs1, (XLenVT i))),
+ (!cast<Instruction>("MOP_R_"#i) GPR:$rs1)>;
}
foreach i = 0...7 in {
- def : Pat<(XLenVT (riscv_moprr GPR:$rs1, GPR:$rs2, (XLenVT i))),
- (!cast<Instruction>("MOPRR"#i) GPR:$rs1, GPR:$rs2)>;
+ def : Pat<(XLenVT (riscv_mop_rr GPR:$rs1, GPR:$rs2, (XLenVT i))),
+ (!cast<Instruction>("MOP_RR_"#i) GPR:$rs1, GPR:$rs2)>;
}
} // Predicates = [HasStdExtZimop]
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvqdotq.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvqdotq.td
index 27959eaccd90..00c4e83e18a0 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvqdotq.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvqdotq.td
@@ -17,16 +17,39 @@
// Instructions
//===----------------------------------------------------------------------===//
+class VQDOTVV<bits<6> funct6, RISCVVFormat opv, string opcodestr>
+ : RVInstVV<funct6, opv, (outs VR:$vd_wb),
+ (ins VR:$vd, VR:$vs2, VR:$vs1, VMaskOp:$vm),
+ opcodestr, "$vd, $vs2, $vs1$vm"> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let Constraints = "$vd = $vd_wb";
+}
+
+class VQDOTVX<bits<6> funct6, RISCVVFormat opv, string opcodestr>
+ : RVInstVX<funct6, opv, (outs VR:$vd_wb),
+ (ins VR:$vd, VR:$vs2, GPR:$rs1, VMaskOp:$vm),
+ opcodestr, "$vd, $vs2, $rs1$vm"> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let Constraints = "$vd = $vd_wb";
+}
+
let Predicates = [HasStdExtZvqdotq] in {
- def VQDOT_VV : VALUVV<0b101100, OPMVV, "vqdot.vv">;
- def VQDOT_VX : VALUVX<0b101100, OPMVX, "vqdot.vx">;
- def VQDOTU_VV : VALUVV<0b101000, OPMVV, "vqdotu.vv">;
- def VQDOTU_VX : VALUVX<0b101000, OPMVX, "vqdotu.vx">;
- def VQDOTSU_VV : VALUVV<0b101010, OPMVV, "vqdotsu.vv">;
- def VQDOTSU_VX : VALUVX<0b101010, OPMVX, "vqdotsu.vx">;
- def VQDOTUS_VX : VALUVX<0b101110, OPMVX, "vqdotus.vx">;
+ def VQDOT_VV : VQDOTVV<0b101100, OPMVV, "vqdot.vv">;
+ def VQDOT_VX : VQDOTVX<0b101100, OPMVX, "vqdot.vx">;
+ def VQDOTU_VV : VQDOTVV<0b101000, OPMVV, "vqdotu.vv">;
+ def VQDOTU_VX : VQDOTVX<0b101000, OPMVX, "vqdotu.vx">;
+ def VQDOTSU_VV : VQDOTVV<0b101010, OPMVV, "vqdotsu.vv">;
+ def VQDOTSU_VX : VQDOTVX<0b101010, OPMVX, "vqdotsu.vx">;
+ def VQDOTUS_VX : VQDOTVX<0b101110, OPMVX, "vqdotus.vx">;
} // Predicates = [HasStdExtZvqdotq]
+//===----------------------------------------------------------------------===//
+// Helpers to define the VL patterns.
+//===----------------------------------------------------------------------===//
let HasPassthruOp = true, HasMaskOp = true in {
def riscv_vqdot_vl : RVSDNode<"VQDOT_VL", SDT_RISCVIntBinOp_VL>;
@@ -34,6 +57,10 @@ let HasPassthruOp = true, HasMaskOp = true in {
def riscv_vqdotsu_vl : RVSDNode<"VQDOTSU_VL", SDT_RISCVIntBinOp_VL>;
} // let HasPassthruOp = true, HasMaskOp = true
+//===----------------------------------------------------------------------===//
+// Pseudo Instructions for CodeGen
+//===----------------------------------------------------------------------===//
+
multiclass VPseudoVQDOT_VV_VX {
foreach m = MxSet<32>.m in {
defm "" : VPseudoBinaryV_VV<m>,
@@ -52,10 +79,69 @@ let Predicates = [HasStdExtZvqdotq], mayLoad = 0, mayStore = 0,
defm PseudoVQDOT : VPseudoVQDOT_VV_VX;
defm PseudoVQDOTU : VPseudoVQDOT_VV_VX;
defm PseudoVQDOTSU : VPseudoVQDOT_VV_VX;
+ // VQDOTUS does not have a VV variant
+ foreach m = MxListVF4 in {
+ defm "PseudoVQDOTUS_VX" : VPseudoTernaryWithPolicy<m.vrclass, m.vrclass, GPR, m>;
+ }
}
+//===----------------------------------------------------------------------===//
+// Patterns.
+//===----------------------------------------------------------------------===//
+
defvar AllE32Vectors = [VI32MF2, VI32M1, VI32M2, VI32M4, VI32M8];
defm : VPatBinaryVL_VV_VX<riscv_vqdot_vl, "PseudoVQDOT", AllE32Vectors>;
defm : VPatBinaryVL_VV_VX<riscv_vqdotu_vl, "PseudoVQDOTU", AllE32Vectors>;
defm : VPatBinaryVL_VV_VX<riscv_vqdotsu_vl, "PseudoVQDOTSU", AllE32Vectors>;
+// These VPat definitions are for vqdot because they have a different operand
+// order with other ternary instructions (i.e. vop.vx vd, vs2, rs1)
+multiclass VPatTernaryV_VX_AABX<string intrinsic, string instruction,
+ list<VTypeInfoToWide> info_pairs> {
+ foreach pair = info_pairs in {
+ defvar VdInfo = pair.Wti;
+ defvar Vs2Info = pair.Vti;
+ let Predicates = GetVTypePredicates<VdInfo>.Predicates in
+ defm : VPatTernaryWithPolicy<intrinsic, instruction,
+ "V"#VdInfo.ScalarSuffix,
+ VdInfo.Vector, Vs2Info.Vector, Vs2Info.Scalar,
+ VdInfo.Mask, VdInfo.Log2SEW, VdInfo.LMul,
+ VdInfo.RegClass, Vs2Info.RegClass,
+ Vs2Info.ScalarRegClass>;
+ }
+}
+
+multiclass VPatTernaryV_VV_AABX<string intrinsic, string instruction,
+ list<VTypeInfoToWide> info_pairs> {
+ foreach pair = info_pairs in {
+ defvar VdInfo = pair.Wti;
+ defvar Vs2Info = pair.Vti;
+ let Predicates = GetVTypePredicates<VdInfo>.Predicates in
+ defm : VPatTernaryWithPolicy<intrinsic, instruction,
+ "VV",
+ VdInfo.Vector, Vs2Info.Vector, Vs2Info.Vector,
+ VdInfo.Mask, VdInfo.Log2SEW, VdInfo.LMul,
+ VdInfo.RegClass, Vs2Info.RegClass,
+ Vs2Info.RegClass>;
+ }
+}
+
+multiclass VPatTernaryV_VV_VX_AABX<string intrinsic, string instruction,
+ list<VTypeInfoToWide> info_pairs>
+ : VPatTernaryV_VV_AABX<intrinsic, instruction, info_pairs>,
+ VPatTernaryV_VX_AABX<intrinsic, instruction, info_pairs>;
+
+defset list<VTypeInfoToWide> VQDOTInfoPairs = {
+ def : VTypeInfoToWide<VI8MF2, VI32MF2>;
+ def : VTypeInfoToWide<VI8M1, VI32M1>;
+ def : VTypeInfoToWide<VI8M2, VI32M2>;
+ def : VTypeInfoToWide<VI8M4, VI32M4>;
+ def : VTypeInfoToWide<VI8M8, VI32M8>;
+}
+
+let Predicates = [HasStdExtZvqdotq] in {
+ defm : VPatTernaryV_VV_VX_AABX<"int_riscv_vqdot", "PseudoVQDOT", VQDOTInfoPairs>;
+ defm : VPatTernaryV_VV_VX_AABX<"int_riscv_vqdotu", "PseudoVQDOTU", VQDOTInfoPairs>;
+ defm : VPatTernaryV_VV_VX_AABX<"int_riscv_vqdotsu", "PseudoVQDOTSU", VQDOTInfoPairs>;
+ defm : VPatTernaryV_VX_AABX<"int_riscv_vqdotus", "PseudoVQDOTUS", VQDOTInfoPairs>;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
index 4abe62f4e874..06309262f1b0 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
@@ -148,6 +148,14 @@ def isNonZeroLoadImmediate
CheckNot<CheckImmOperand<2, 0>>
]>>>;
+def isLPAD
+ : TIIPredicate<"isLPAD",
+ MCReturnStatement<CheckAll<[
+ CheckOpcode<[AUIPC]>,
+ CheckIsRegOperand<0>,
+ CheckRegOperand<0, X0>,
+ ]>>>;
+
def ignoresVXRM
: TIIPredicate<"ignoresVXRM",
MCOpcodeSwitchStatement<
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index c7b96f5c3d0c..5e1063155ba0 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -81,6 +81,12 @@ static const Intrinsic::ID FixedVssegIntrIds[] = {
Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask,
Intrinsic::riscv_seg8_store_mask};
+static const Intrinsic::ID FixedVsssegIntrIds[] = {
+ Intrinsic::riscv_sseg2_store_mask, Intrinsic::riscv_sseg3_store_mask,
+ Intrinsic::riscv_sseg4_store_mask, Intrinsic::riscv_sseg5_store_mask,
+ Intrinsic::riscv_sseg6_store_mask, Intrinsic::riscv_sseg7_store_mask,
+ Intrinsic::riscv_sseg8_store_mask};
+
static const Intrinsic::ID ScalableVssegIntrIds[] = {
Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
@@ -275,7 +281,16 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
Value *LaneMask,
ShuffleVectorInst *SVI,
- unsigned Factor) const {
+ unsigned Factor,
+ const APInt &GapMask) const {
+ assert(GapMask.getBitWidth() == Factor);
+
+ // We only support cases where the skipped fields are the trailing ones.
+ // TODO: Lower to strided store if there is only a single active field.
+ unsigned MaskFactor = GapMask.popcount();
+ if (MaskFactor < 2 || !GapMask.isMask())
+ return false;
+
IRBuilder<> Builder(Store);
const DataLayout &DL = Store->getDataLayout();
auto Mask = SVI->getShuffleMask();
@@ -287,21 +302,31 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
Value *Ptr, *VL;
Align Alignment;
- if (!getMemOperands(Factor, VTy, XLenTy, Store, Ptr, LaneMask, VL, Alignment))
+ if (!getMemOperands(MaskFactor, VTy, XLenTy, Store, Ptr, LaneMask, VL,
+ Alignment))
return false;
Type *PtrTy = Ptr->getType();
unsigned AS = PtrTy->getPointerAddressSpace();
- if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
+ if (!isLegalInterleavedAccessType(VTy, MaskFactor, Alignment, AS, DL))
return false;
- Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
- Store->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});
+ Function *SegStoreFunc;
+ if (MaskFactor < Factor)
+ // Strided segmented store.
+ SegStoreFunc = Intrinsic::getOrInsertDeclaration(
+ Store->getModule(), FixedVsssegIntrIds[MaskFactor - 2],
+ {VTy, PtrTy, XLenTy, XLenTy});
+ else
+ // Normal segmented store.
+ SegStoreFunc = Intrinsic::getOrInsertDeclaration(
+ Store->getModule(), FixedVssegIntrIds[Factor - 2],
+ {VTy, PtrTy, XLenTy});
SmallVector<Value *, 10> Ops;
SmallVector<int, 16> NewShuffleMask;
- for (unsigned i = 0; i < Factor; i++) {
+ for (unsigned i = 0; i < MaskFactor; i++) {
// Collect shuffle mask for this lane.
for (unsigned j = 0; j < VTy->getNumElements(); j++)
NewShuffleMask.push_back(Mask[i + Factor * j]);
@@ -312,8 +337,14 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
NewShuffleMask.clear();
}
- Ops.append({Ptr, LaneMask, VL});
- Builder.CreateCall(VssegNFunc, Ops);
+ Ops.push_back(Ptr);
+ if (MaskFactor < Factor) {
+ // Insert the stride argument.
+ unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
+ Ops.push_back(ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes));
+ }
+ Ops.append({LaneMask, VL});
+ Builder.CreateCall(SegStoreFunc, Ops);
return true;
}
diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
index 3b19c3456ad6..d08115b72977 100644
--- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
+++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
@@ -356,6 +356,14 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI,
return false;
Worklist.emplace_back(UserMI, Bits);
break;
+ case RISCV::TH_EXT:
+ case RISCV::TH_EXTU:
+ unsigned Msb = UserMI->getOperand(2).getImm();
+ unsigned Lsb = UserMI->getOperand(3).getImm();
+ // Behavior of Msb < Lsb is not well documented.
+ if (Msb >= Lsb && Bits > Msb)
+ break;
+ return false;
}
}
}
@@ -409,6 +417,16 @@ static bool isSignExtendingOpW(const MachineInstr &MI, unsigned OpNo) {
assert(Log2SEW >= 3 && Log2SEW <= 6 && "Unexpected Log2SEW");
return Log2SEW <= 5;
}
+ case RISCV::TH_EXT: {
+ unsigned Msb = MI.getOperand(2).getImm();
+ unsigned Lsb = MI.getOperand(3).getImm();
+ return Msb >= Lsb && (Msb - Lsb + 1) <= 32;
+ }
+ case RISCV::TH_EXTU: {
+ unsigned Msb = MI.getOperand(2).getImm();
+ unsigned Lsb = MI.getOperand(3).getImm();
+ return Msb >= Lsb && (Msb - Lsb + 1) < 32;
+ }
}
return false;
@@ -519,9 +537,11 @@ static bool isSignExtendedW(Register SrcReg, const RISCVSubtarget &ST,
case RISCV::ANDI:
case RISCV::ORI:
case RISCV::XORI:
+ case RISCV::SRAI:
// |Remainder| is always <= |Dividend|. If D is 32-bit, then so is R.
// DIV doesn't work because of the edge case 0xf..f 8000 0000 / (long)-1
// Logical operations use a sign extended 12-bit immediate.
+ // Arithmetic shift right can only increase the number of sign bits.
if (!AddRegToWorkList(MI->getOperand(1).getReg()))
return false;
@@ -556,6 +576,9 @@ static bool isSignExtendedW(Register SrcReg, const RISCVSubtarget &ST,
case RISCV::PseudoCCAND:
case RISCV::PseudoCCOR:
case RISCV::PseudoCCXOR:
+ case RISCV::PseudoCCANDN:
+ case RISCV::PseudoCCORN:
+ case RISCV::PseudoCCXNOR:
case RISCV::PHI: {
// If all incoming values are sign-extended, the output of AND, OR, XOR,
// MIN, MAX, or PHI is also sign-extended.
@@ -578,6 +601,9 @@ static bool isSignExtendedW(Register SrcReg, const RISCVSubtarget &ST,
case RISCV::PseudoCCAND:
case RISCV::PseudoCCOR:
case RISCV::PseudoCCXOR:
+ case RISCV::PseudoCCANDN:
+ case RISCV::PseudoCCORN:
+ case RISCV::PseudoCCXNOR:
B = 4;
E = 7;
break;
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index f89d94f41b69..36d63ed23b92 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -121,7 +121,8 @@ def MIPS_P8700 : RISCVProcessorModel<"mips-p8700",
FeatureStdExtZicsr,
FeatureVendorXMIPSCMov,
FeatureVendorXMIPSLSP,
- FeatureVendorXMIPSCBOP],
+ FeatureVendorXMIPSCBOP,
+ FeatureVendorXMIPSEXECTL],
[TuneMIPSP8700]>;
def ROCKET_RV32 : RISCVProcessorModel<"rocket-rv32",
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index f3966a55ce7d..40b641680b2c 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -966,7 +966,9 @@ bool RISCVRegisterInfo::getRegAllocationHints(
}
}
- // Add a hint if it would allow auipc/lui+addi(w) fusion.
+ // Add a hint if it would allow auipc/lui+addi(w) fusion. We do this even
+ // without the fusions explicitly enabled as the impact is rarely negative
+ // and some cores do implement this fusion.
if ((MI.getOpcode() == RISCV::ADDIW || MI.getOpcode() == RISCV::ADDI) &&
MI.getOperand(1).isReg()) {
const MachineBasicBlock &MBB = *MI.getParent();
@@ -974,9 +976,7 @@ bool RISCVRegisterInfo::getRegAllocationHints(
// Is the previous instruction a LUI or AUIPC that can be fused?
if (I != MBB.begin()) {
I = skipDebugInstructionsBackward(std::prev(I), MBB.begin());
- if (((I->getOpcode() == RISCV::LUI && Subtarget.hasLUIADDIFusion()) ||
- (I->getOpcode() == RISCV::AUIPC &&
- Subtarget.hasAUIPCADDIFusion())) &&
+ if ((I->getOpcode() == RISCV::LUI || I->getOpcode() == RISCV::AUIPC) &&
I->getOperand(0).getReg() == MI.getOperand(1).getReg()) {
if (OpIdx == 0)
tryAddHint(MO, MI.getOperand(1), /*NeedGPRC=*/false);
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index fd57e02c25d0..50e76df56e57 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -186,6 +186,12 @@ public:
return HasStdExtZfhmin || HasStdExtZfbfmin;
}
+ bool hasBEXTILike() const { return HasStdExtZbs || HasVendorXTHeadBs; }
+
+ bool hasCZEROLike() const {
+ return HasStdExtZicond || HasVendorXVentanaCondOps;
+ }
+
bool hasConditionalMoveFusion() const {
// Do we support fusing a branch+mv or branch+c.mv as a conditional move.
return (hasConditionalCompressedMoveFusion() && hasStdExtZca()) ||
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index d70b1d0dc8d5..460bb33f2553 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -652,7 +652,8 @@ void RISCVPassConfig::addPostRegAlloc() {
void RISCVTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PB.registerLateLoopOptimizationsEPCallback([=](LoopPassManager &LPM,
OptimizationLevel Level) {
- LPM.addPass(LoopIdiomVectorizePass(LoopIdiomVectorizeStyle::Predicated));
+ if (Level != OptimizationLevel::O0)
+ LPM.addPass(LoopIdiomVectorizePass(LoopIdiomVectorizeStyle::Predicated));
});
}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index c707fb110b10..1ca513214f67 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1566,6 +1566,18 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
+InstructionCost
+RISCVTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
+ const SCEV *Ptr,
+ TTI::TargetCostKind CostKind) const {
+ // Address computations for vector indexed load/store likely require an offset
+ // and/or scaling.
+ if (ST->hasVInstructions() && PtrTy->isVectorTy())
+ return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);
+
+ return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
+}
+
InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
Type *Src,
TTI::CastContextHint CCH,
@@ -2731,6 +2743,10 @@ unsigned RISCVTTIImpl::getMinTripCountTailFoldingThreshold() const {
return RVVMinTripCount;
}
+bool RISCVTTIImpl::preferAlternateOpcodeVectorization() const {
+ return ST->enableUnalignedVectorMem();
+}
+
TTI::AddressingModeKind
RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
ScalarEvolution *SE) const {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 3236b2a35c85..6bd7d51daff6 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -132,7 +132,7 @@ public:
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override;
- bool preferAlternateOpcodeVectorization() const override { return false; }
+ bool preferAlternateOpcodeVectorization() const override;
bool preferEpilogueVectorization() const override {
// Epilogue vectorization is usually unprofitable - tail folding or
@@ -177,6 +177,10 @@ public:
getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) const override;
+ InstructionCost
+ getAddressComputationCost(Type *PTy, ScalarEvolution *SE, const SCEV *Ptr,
+ TTI::TargetCostKind CostKind) const override;
+
InstructionCost getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index 53557049ea33..29526cf5a527 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -178,8 +178,20 @@ static unsigned getIntegerExtensionOperandEEW(unsigned Factor,
return Log2EEW;
}
-static std::optional<unsigned>
-getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
+#define VSEG_CASES(Prefix, EEW) \
+ RISCV::Prefix##SEG2E##EEW##_V: \
+ case RISCV::Prefix##SEG3E##EEW##_V: \
+ case RISCV::Prefix##SEG4E##EEW##_V: \
+ case RISCV::Prefix##SEG5E##EEW##_V: \
+ case RISCV::Prefix##SEG6E##EEW##_V: \
+ case RISCV::Prefix##SEG7E##EEW##_V: \
+ case RISCV::Prefix##SEG8E##EEW##_V
+#define VSSEG_CASES(EEW) VSEG_CASES(VS, EEW)
+#define VSSSEG_CASES(EEW) VSEG_CASES(VSS, EEW)
+#define VSUXSEG_CASES(EEW) VSEG_CASES(VSUX, I##EEW)
+#define VSOXSEG_CASES(EEW) VSEG_CASES(VSOX, I##EEW)
+
+static std::optional<unsigned> getOperandLog2EEW(const MachineOperand &MO) {
const MachineInstr &MI = *MO.getParent();
const MCInstrDesc &Desc = MI.getDesc();
const RISCVVPseudosTable::PseudoInfo *RVV =
@@ -225,21 +237,29 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
case RISCV::VSE8_V:
case RISCV::VLSE8_V:
case RISCV::VSSE8_V:
+ case VSSEG_CASES(8):
+ case VSSSEG_CASES(8):
return 3;
case RISCV::VLE16_V:
case RISCV::VSE16_V:
case RISCV::VLSE16_V:
case RISCV::VSSE16_V:
+ case VSSEG_CASES(16):
+ case VSSSEG_CASES(16):
return 4;
case RISCV::VLE32_V:
case RISCV::VSE32_V:
case RISCV::VLSE32_V:
case RISCV::VSSE32_V:
+ case VSSEG_CASES(32):
+ case VSSSEG_CASES(32):
return 5;
case RISCV::VLE64_V:
case RISCV::VSE64_V:
case RISCV::VLSE64_V:
case RISCV::VSSE64_V:
+ case VSSEG_CASES(64):
+ case VSSSEG_CASES(64):
return 6;
// Vector Indexed Instructions
@@ -248,7 +268,9 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
case RISCV::VLUXEI8_V:
case RISCV::VLOXEI8_V:
case RISCV::VSUXEI8_V:
- case RISCV::VSOXEI8_V: {
+ case RISCV::VSOXEI8_V:
+ case VSUXSEG_CASES(8):
+ case VSOXSEG_CASES(8): {
if (MO.getOperandNo() == 0)
return MILog2SEW;
return 3;
@@ -256,7 +278,9 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
case RISCV::VLUXEI16_V:
case RISCV::VLOXEI16_V:
case RISCV::VSUXEI16_V:
- case RISCV::VSOXEI16_V: {
+ case RISCV::VSOXEI16_V:
+ case VSUXSEG_CASES(16):
+ case VSOXSEG_CASES(16): {
if (MO.getOperandNo() == 0)
return MILog2SEW;
return 4;
@@ -264,7 +288,9 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
case RISCV::VLUXEI32_V:
case RISCV::VLOXEI32_V:
case RISCV::VSUXEI32_V:
- case RISCV::VSOXEI32_V: {
+ case RISCV::VSOXEI32_V:
+ case VSUXSEG_CASES(32):
+ case VSOXSEG_CASES(32): {
if (MO.getOperandNo() == 0)
return MILog2SEW;
return 5;
@@ -272,7 +298,9 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
case RISCV::VLUXEI64_V:
case RISCV::VLOXEI64_V:
case RISCV::VSUXEI64_V:
- case RISCV::VSOXEI64_V: {
+ case RISCV::VSOXEI64_V:
+ case VSUXSEG_CASES(64):
+ case VSOXSEG_CASES(64): {
if (MO.getOperandNo() == 0)
return MILog2SEW;
return 6;
@@ -422,9 +450,6 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
case RISCV::VRGATHER_VI:
case RISCV::VRGATHER_VV:
case RISCV::VRGATHER_VX:
- // Vector Compress Instruction
- // EEW=SEW.
- case RISCV::VCOMPRESS_VM:
// Vector Element Index Instruction
case RISCV::VID_V:
// Vector Single-Width Floating-Point Add/Subtract Instructions
@@ -674,6 +699,12 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
return MILog2SEW;
}
+ // Vector Compress Instruction
+ // EEW=SEW, except the mask operand has EEW=1. Mask operand is not handled
+ // before this switch.
+ case RISCV::VCOMPRESS_VM:
+ return MO.getOperandNo() == 3 ? 0 : MILog2SEW;
+
// Vector Iota Instruction
// EEW=SEW, except the mask operand has EEW=1. Mask operand is not handled
// before this switch.
@@ -778,14 +809,13 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
}
}
-static std::optional<OperandInfo>
-getOperandInfo(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
+static std::optional<OperandInfo> getOperandInfo(const MachineOperand &MO) {
const MachineInstr &MI = *MO.getParent();
const RISCVVPseudosTable::PseudoInfo *RVV =
RISCVVPseudosTable::getPseudoInfo(MI.getOpcode());
assert(RVV && "Could not find MI in PseudoTable");
- std::optional<unsigned> Log2EEW = getOperandLog2EEW(MO, MRI);
+ std::optional<unsigned> Log2EEW = getOperandLog2EEW(MO);
if (!Log2EEW)
return std::nullopt;
@@ -900,13 +930,6 @@ static bool isSupportedInstr(const MachineInstr &MI) {
case RISCV::VSEXT_VF4:
case RISCV::VZEXT_VF8:
case RISCV::VSEXT_VF8:
- // Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
- // FIXME: Add support
- case RISCV::VMADC_VV:
- case RISCV::VMADC_VI:
- case RISCV::VMADC_VX:
- case RISCV::VMSBC_VV:
- case RISCV::VMSBC_VX:
// Vector Narrowing Integer Right Shift Instructions
case RISCV::VNSRL_WX:
case RISCV::VNSRL_WI:
@@ -993,6 +1016,11 @@ static bool isSupportedInstr(const MachineInstr &MI) {
case RISCV::VSBC_VXM:
case RISCV::VMSBC_VVM:
case RISCV::VMSBC_VXM:
+ case RISCV::VMADC_VV:
+ case RISCV::VMADC_VI:
+ case RISCV::VMADC_VX:
+ case RISCV::VMSBC_VV:
+ case RISCV::VMSBC_VX:
// Vector Widening Integer Multiply-Add Instructions
case RISCV::VWMACCU_VV:
case RISCV::VWMACCU_VX:
@@ -1001,10 +1029,7 @@ static bool isSupportedInstr(const MachineInstr &MI) {
case RISCV::VWMACCSU_VV:
case RISCV::VWMACCSU_VX:
case RISCV::VWMACCUS_VX:
- // Vector Integer Merge Instructions
- // FIXME: Add support
// Vector Integer Move Instructions
- // FIXME: Add support
case RISCV::VMV_V_I:
case RISCV::VMV_V_X:
case RISCV::VMV_V_V:
@@ -1306,7 +1331,8 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const {
// TODO: Use a better approach than a white-list, such as adding
// properties to instructions using something like TSFlags.
if (!isSupportedInstr(MI)) {
- LLVM_DEBUG(dbgs() << "Not a candidate due to unsupported instruction\n");
+ LLVM_DEBUG(dbgs() << "Not a candidate due to unsupported instruction: "
+ << MI);
return false;
}
@@ -1328,14 +1354,14 @@ RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const {
const MCInstrDesc &Desc = UserMI.getDesc();
if (!RISCVII::hasVLOp(Desc.TSFlags) || !RISCVII::hasSEWOp(Desc.TSFlags)) {
- LLVM_DEBUG(dbgs() << " Abort due to lack of VL, assume that"
+ LLVM_DEBUG(dbgs() << " Abort due to lack of VL, assume that"
" use VLMAX\n");
return std::nullopt;
}
if (RISCVII::readsPastVL(
TII->get(RISCV::getRVVMCOpcode(UserMI.getOpcode())).TSFlags)) {
- LLVM_DEBUG(dbgs() << " Abort because used by unsafe instruction\n");
+ LLVM_DEBUG(dbgs() << " Abort because used by unsafe instruction\n");
return std::nullopt;
}
@@ -1352,7 +1378,7 @@ RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const {
RISCVII::isFirstDefTiedToFirstUse(UserMI.getDesc()));
auto DemandedVL = DemandedVLs.lookup(&UserMI);
if (!DemandedVL || !RISCV::isVLKnownLE(*DemandedVL, VLOp)) {
- LLVM_DEBUG(dbgs() << " Abort because user is passthru in "
+ LLVM_DEBUG(dbgs() << " Abort because user is passthru in "
"instruction with demanded tail\n");
return std::nullopt;
}
@@ -1376,6 +1402,54 @@ RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const {
return VLOp;
}
+/// Return true if MI is an instruction used for assembling registers
+/// for segmented store instructions, namely, RISCVISD::TUPLE_INSERT.
+/// Currently it's lowered to INSERT_SUBREG.
+static bool isTupleInsertInstr(const MachineInstr &MI) {
+ if (!MI.isInsertSubreg())
+ return false;
+
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ const TargetRegisterClass *DstRC = MRI.getRegClass(MI.getOperand(0).getReg());
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ if (!RISCVRI::isVRegClass(DstRC->TSFlags))
+ return false;
+ unsigned NF = RISCVRI::getNF(DstRC->TSFlags);
+ if (NF < 2)
+ return false;
+
+ // Check whether INSERT_SUBREG has the correct subreg index for tuple inserts.
+ auto VLMul = RISCVRI::getLMul(DstRC->TSFlags);
+ unsigned SubRegIdx = MI.getOperand(3).getImm();
+ [[maybe_unused]] auto [LMul, IsFractional] = RISCVVType::decodeVLMUL(VLMul);
+ assert(!IsFractional && "unexpected LMUL for tuple register classes");
+ return TRI->getSubRegIdxSize(SubRegIdx) == RISCV::RVVBitsPerBlock * LMul;
+}
+
+static bool isSegmentedStoreInstr(const MachineInstr &MI) {
+ switch (RISCV::getRVVMCOpcode(MI.getOpcode())) {
+ case VSSEG_CASES(8):
+ case VSSSEG_CASES(8):
+ case VSUXSEG_CASES(8):
+ case VSOXSEG_CASES(8):
+ case VSSEG_CASES(16):
+ case VSSSEG_CASES(16):
+ case VSUXSEG_CASES(16):
+ case VSOXSEG_CASES(16):
+ case VSSEG_CASES(32):
+ case VSSSEG_CASES(32):
+ case VSUXSEG_CASES(32):
+ case VSOXSEG_CASES(32):
+ case VSSEG_CASES(64):
+ case VSSSEG_CASES(64):
+ case VSUXSEG_CASES(64):
+ case VSOXSEG_CASES(64):
+ return true;
+ default:
+ return false;
+ }
+}
+
std::optional<MachineOperand>
RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const {
std::optional<MachineOperand> CommonVL;
@@ -1396,6 +1470,23 @@ RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const {
continue;
}
+ if (isTupleInsertInstr(UserMI)) {
+ LLVM_DEBUG(dbgs().indent(4) << "Peeking through uses of INSERT_SUBREG\n");
+ for (MachineOperand &UseOp :
+ MRI->use_operands(UserMI.getOperand(0).getReg())) {
+ const MachineInstr &CandidateMI = *UseOp.getParent();
+ // We should not propagate the VL if the user is not a segmented store
+ // or another INSERT_SUBREG, since VL just works differently
+ // between segmented operations (per-field) v.s. other RVV ops (on the
+ // whole register group).
+ if (!isTupleInsertInstr(CandidateMI) &&
+ !isSegmentedStoreInstr(CandidateMI))
+ return std::nullopt;
+ Worklist.insert(&UseOp);
+ }
+ continue;
+ }
+
if (UserMI.isPHI()) {
// Don't follow PHI cycles
if (!PHISeen.insert(&UserMI).second)
@@ -1425,9 +1516,8 @@ RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const {
return std::nullopt;
}
- std::optional<OperandInfo> ConsumerInfo = getOperandInfo(UserOp, MRI);
- std::optional<OperandInfo> ProducerInfo =
- getOperandInfo(MI.getOperand(0), MRI);
+ std::optional<OperandInfo> ConsumerInfo = getOperandInfo(UserOp);
+ std::optional<OperandInfo> ProducerInfo = getOperandInfo(MI.getOperand(0));
if (!ConsumerInfo || !ProducerInfo) {
LLVM_DEBUG(dbgs() << " Abort due to unknown operand information.\n");
LLVM_DEBUG(dbgs() << " ConsumerInfo is: " << ConsumerInfo << "\n");
@@ -1449,7 +1539,7 @@ RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const {
}
bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const {
- LLVM_DEBUG(dbgs() << "Trying to reduce VL for " << MI << "\n");
+ LLVM_DEBUG(dbgs() << "Trying to reduce VL for " << MI);
unsigned VLOpNum = RISCVII::getVLOpNum(MI.getDesc());
MachineOperand &VLOp = MI.getOperand(VLOpNum);
@@ -1468,14 +1558,23 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const {
assert((CommonVL->isImm() || CommonVL->getReg().isVirtual()) &&
"Expected VL to be an Imm or virtual Reg");
+ // If the VL is defined by a vleff that doesn't dominate MI, try using the
+ // vleff's AVL. It will be greater than or equal to the output VL.
+ if (CommonVL->isReg()) {
+ const MachineInstr *VLMI = MRI->getVRegDef(CommonVL->getReg());
+ if (RISCVInstrInfo::isFaultOnlyFirstLoad(*VLMI) &&
+ !MDT->dominates(VLMI, &MI))
+ CommonVL = VLMI->getOperand(RISCVII::getVLOpNum(VLMI->getDesc()));
+ }
+
if (!RISCV::isVLKnownLE(*CommonVL, VLOp)) {
- LLVM_DEBUG(dbgs() << " Abort due to CommonVL not <= VLOp.\n");
+ LLVM_DEBUG(dbgs() << " Abort due to CommonVL not <= VLOp.\n");
return false;
}
if (CommonVL->isIdenticalTo(VLOp)) {
LLVM_DEBUG(
- dbgs() << " Abort due to CommonVL == VLOp, no point in reducing.\n");
+ dbgs() << " Abort due to CommonVL == VLOp, no point in reducing.\n");
return false;
}
@@ -1486,8 +1585,10 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const {
return true;
}
const MachineInstr *VLMI = MRI->getVRegDef(CommonVL->getReg());
- if (!MDT->dominates(VLMI, &MI))
+ if (!MDT->dominates(VLMI, &MI)) {
+ LLVM_DEBUG(dbgs() << " Abort due to VL not dominating.\n");
return false;
+ }
LLVM_DEBUG(
dbgs() << " Reduce VL from " << VLOp << " to "
<< printReg(CommonVL->getReg(), MRI->getTargetRegisterInfo())
diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index 050de3d58a2f..62651185137c 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -745,12 +745,24 @@ bool RISCVVectorPeephole::foldVMergeToMask(MachineInstr &MI) const {
if (PassthruReg && !isKnownSameDefs(PassthruReg, FalseReg))
return false;
+ std::optional<std::pair<unsigned, unsigned>> NeedsCommute;
+
// If True has a passthru operand then it needs to be the same as vmerge's
// False, since False will be used for the result's passthru operand.
Register TruePassthru = True.getOperand(True.getNumExplicitDefs()).getReg();
if (RISCVII::isFirstDefTiedToFirstUse(True.getDesc()) && TruePassthru &&
- !isKnownSameDefs(TruePassthru, FalseReg))
- return false;
+ !isKnownSameDefs(TruePassthru, FalseReg)) {
+ // If True's passthru != False, check if it uses False in another operand
+ // and try to commute it.
+ int OtherIdx = True.findRegisterUseOperandIdx(FalseReg, TRI);
+ if (OtherIdx == -1)
+ return false;
+ unsigned OpIdx1 = OtherIdx;
+ unsigned OpIdx2 = True.getNumExplicitDefs();
+ if (!TII->findCommutedOpIndices(True, OpIdx1, OpIdx2))
+ return false;
+ NeedsCommute = {OpIdx1, OpIdx2};
+ }
// Make sure it doesn't raise any observable fp exceptions, since changing the
// active elements will affect how fflags is set.
@@ -796,6 +808,14 @@ bool RISCVVectorPeephole::foldVMergeToMask(MachineInstr &MI) const {
if (!ensureDominates(MaskOp, True))
return false;
+ if (NeedsCommute) {
+ auto [OpIdx1, OpIdx2] = *NeedsCommute;
+ [[maybe_unused]] bool Commuted =
+ TII->commuteInstruction(True, /*NewMI=*/false, OpIdx1, OpIdx2);
+ assert(Commuted && "Failed to commute True?");
+ Info = RISCV::lookupMaskedIntrinsicByUnmasked(True.getOpcode());
+ }
+
True.setDesc(TII->get(Info->MaskedPseudo));
// Insert the mask operand.
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
index f658b67a4c2a..45e88fc94144 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
@@ -12,6 +12,7 @@
#include "SPIRVInstrInfo.h"
#include "SPIRV.h"
+#include "SPIRVSubtarget.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -22,7 +23,8 @@
using namespace llvm;
-SPIRVInstrInfo::SPIRVInstrInfo() : SPIRVGenInstrInfo() {}
+SPIRVInstrInfo::SPIRVInstrInfo(const SPIRVSubtarget &STI)
+ : SPIRVGenInstrInfo(STI) {}
bool SPIRVInstrInfo::isConstantInstr(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
index d58dddcd8da2..72d2243fba62 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
@@ -20,12 +20,13 @@
#include "SPIRVGenInstrInfo.inc"
namespace llvm {
+class SPIRVSubtarget;
class SPIRVInstrInfo : public SPIRVGenInstrInfo {
const SPIRVRegisterInfo RI;
public:
- SPIRVInstrInfo();
+ explicit SPIRVInstrInfo(const SPIRVSubtarget &STI);
const SPIRVRegisterInfo &getRegisterInfo() const { return RI; }
bool isHeaderInstr(const MachineInstr &MI) const;
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index f0b938d681db..8d10cd0ffb3d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -637,8 +637,8 @@ let isReturn = 1, hasDelaySlot = 0, isBarrier = 0, isTerminator = 1, isNotDuplic
def OpReturnValue: Op<254, (outs), (ins ID:$ret), "OpReturnValue $ret">;
def OpUnreachable: SimpleOp<"OpUnreachable", 255>;
}
-def OpLifetimeStart: Op<256, (outs), (ins ID:$ptr, i32imm:$sz), "OpLifetimeStart $ptr, $sz">;
-def OpLifetimeStop: Op<257, (outs), (ins ID:$ptr, i32imm:$sz), "OpLifetimeStop $ptr, $sz">;
+def OpLifetimeStart: Op<256, (outs), (ins ID:$ptr, i32imm:$sz), "OpLifetimeStart $ptr $sz">;
+def OpLifetimeStop: Op<257, (outs), (ins ID:$ptr, i32imm:$sz), "OpLifetimeStop $ptr $sz">;
def OpDemoteToHelperInvocation: SimpleOp<"OpDemoteToHelperInvocation", 5380>;
// 3.42.18 Atomic Instructions
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 98c7709acf93..3ad5528fab06 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -204,6 +204,9 @@ private:
bool selectIntegerDotExpansion(Register ResVReg, const SPIRVType *ResType,
MachineInstr &I) const;
+ bool selectOpIsInf(Register ResVReg, const SPIRVType *ResType,
+ MachineInstr &I) const;
+
template <bool Signed>
bool selectDot4AddPacked(Register ResVReg, const SPIRVType *ResType,
MachineInstr &I) const;
@@ -2042,6 +2045,17 @@ bool SPIRVInstructionSelector::selectIntegerDotExpansion(
return Result;
}
+bool SPIRVInstructionSelector::selectOpIsInf(Register ResVReg,
+ const SPIRVType *ResType,
+ MachineInstr &I) const {
+ MachineBasicBlock &BB = *I.getParent();
+ return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIsInf))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(I.getOperand(2).getReg())
+ .constrainAllUses(TII, TRI, RBI);
+}
+
template <bool Signed>
bool SPIRVInstructionSelector::selectDot4AddPacked(Register ResVReg,
const SPIRVType *ResType,
@@ -3183,6 +3197,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
return selectExtInst(ResVReg, ResType, I, GL::FaceForward);
case Intrinsic::spv_frac:
return selectExtInst(ResVReg, ResType, I, CL::fract, GL::Fract);
+ case Intrinsic::spv_isinf:
+ return selectOpIsInf(ResVReg, ResType, I);
case Intrinsic::spv_normalize:
return selectExtInst(ResVReg, ResType, I, CL::normalize, GL::Normalize);
case Intrinsic::spv_refract:
@@ -4276,9 +4292,11 @@ bool SPIRVInstructionSelector::loadHandleBeforePosition(
uint32_t Binding = foldImm(HandleDef.getOperand(3), MRI);
uint32_t ArraySize = foldImm(HandleDef.getOperand(4), MRI);
Register IndexReg = HandleDef.getOperand(5).getReg();
- bool IsNonUniform = ArraySize > 1 && foldImm(HandleDef.getOperand(6), MRI);
+ // FIXME: The IsNonUniform flag needs to be set based on resource analysis.
+ // https://github.com/llvm/llvm-project/issues/155701
+ bool IsNonUniform = false;
std::string Name =
- getStringValueFromReg(HandleDef.getOperand(7).getReg(), *MRI);
+ getStringValueFromReg(HandleDef.getOperand(6).getReg(), *MRI);
bool IsStructuredBuffer = ResType->getOpcode() == SPIRV::OpTypePointer;
MachineIRBuilder MIRBuilder(HandleDef);
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index 8039cf0c432f..b7e371d19086 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -124,7 +124,7 @@ getSymbolicOperandRequirements(SPIRV::OperandCategory::OperandCategory Category,
})) {
return {true,
{},
- ReqExts,
+ std::move(ReqExts),
VersionTuple(),
VersionTuple()}; // TODO: add versions to extensions.
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
index 55c9c4c5380b..1811492bf217 100644
--- a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
@@ -43,7 +43,7 @@ using Edge = std::pair<BasicBlock *, BasicBlock *>;
static void partialOrderVisit(BasicBlock &Start,
std::function<bool(BasicBlock *)> Op) {
PartialOrderingVisitor V(*Start.getParent());
- V.partialOrderVisit(Start, Op);
+ V.partialOrderVisit(Start, std::move(Op));
}
// Returns the exact convergence region in the tree defined by `Node` for which
diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
index 690493fb426b..5b746a1389af 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
@@ -53,9 +53,9 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const SPIRVTargetMachine &TM)
: SPIRVGenSubtargetInfo(TT, CPU, /*TuneCPU=*/CPU, FS),
- PointerSize(TM.getPointerSizeInBits(/* AS= */ 0)), InstrInfo(),
- FrameLowering(initSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
- TargetTriple(TT) {
+ PointerSize(TM.getPointerSizeInBits(/* AS= */ 0)),
+ InstrInfo(initSubtargetDependencies(CPU, FS)), FrameLowering(*this),
+ TLInfo(TM, *this), TargetTriple(TT) {
switch (TT.getSubArch()) {
case Triple::SPIRVSubArch_v10:
SPIRVVersion = VersionTuple(1, 0);
diff --git a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
index f1cd9b1ab07c..c3d60f3689e1 100644
--- a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
+++ b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
@@ -266,16 +266,47 @@ DecodeCoprocPairRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
return MCDisassembler::Success;
}
-static DecodeStatus DecodeCall(MCInst &Inst, unsigned insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeSIMM5(MCInst &Inst, unsigned insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeSIMM13(MCInst &Inst, unsigned insn, uint64_t Address,
- const MCDisassembler *Decoder);
+static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
+ uint64_t Address, uint64_t Offset,
+ uint64_t Width, MCInst &MI,
+ const MCDisassembler *Decoder) {
+ return Decoder->tryAddingSymbolicOperand(MI, Value, Address, isBranch, Offset,
+ Width, /*InstSize=*/4);
+}
+
+static DecodeStatus DecodeCall(MCInst &MI, unsigned insn, uint64_t Address,
+ const MCDisassembler *Decoder) {
+ int64_t CallOffset = SignExtend64(fieldFromInstruction(insn, 0, 30), 30) * 4;
+ if (!tryAddingSymbolicOperand(Address + CallOffset, false, Address, 0, 30, MI,
+ Decoder))
+ MI.addOperand(MCOperand::createImm(CallOffset));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSIMM5(MCInst &MI, unsigned insn, uint64_t Address,
+ const MCDisassembler *Decoder) {
+ assert(isUInt<5>(insn));
+ MI.addOperand(MCOperand::createImm(SignExtend64<5>(insn)));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSIMM13(MCInst &MI, unsigned insn, uint64_t Address,
+ const MCDisassembler *Decoder) {
+ assert(isUInt<13>(insn));
+ MI.addOperand(MCOperand::createImm(SignExtend64<13>(insn)));
+ return MCDisassembler::Success;
+}
+
template <unsigned N>
-constexpr static DecodeStatus DecodeDisp(MCInst &MI, uint32_t ImmVal,
- uint64_t Address,
- const MCDisassembler *Decoder);
+static DecodeStatus DecodeDisp(MCInst &MI, uint32_t ImmVal, uint64_t Address,
+ const MCDisassembler *Decoder) {
+ int64_t BranchOffset = SignExtend64(ImmVal, N) * 4;
+ if (!tryAddingSymbolicOperand(Address + BranchOffset, true, Address, 0, N, MI,
+ Decoder))
+ MI.addOperand(MCOperand::createImm(BranchOffset));
+ return MCDisassembler::Success;
+}
+
#include "SparcGenDisassemblerTables.inc"
/// Read four bytes from the ArrayRef and return 32 bit word.
@@ -321,45 +352,3 @@ DecodeStatus SparcDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
return Result;
}
-
-static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
- uint64_t Address, uint64_t Offset,
- uint64_t Width, MCInst &MI,
- const MCDisassembler *Decoder) {
- return Decoder->tryAddingSymbolicOperand(MI, Value, Address, isBranch, Offset,
- Width, /*InstSize=*/4);
-}
-
-static DecodeStatus DecodeCall(MCInst &MI, unsigned insn, uint64_t Address,
- const MCDisassembler *Decoder) {
- int64_t CallOffset = SignExtend64(fieldFromInstruction(insn, 0, 30), 30) * 4;
- if (!tryAddingSymbolicOperand(Address + CallOffset, false, Address, 0, 30, MI,
- Decoder))
- MI.addOperand(MCOperand::createImm(CallOffset));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeSIMM5(MCInst &MI, unsigned insn, uint64_t Address,
- const MCDisassembler *Decoder) {
- assert(isUInt<5>(insn));
- MI.addOperand(MCOperand::createImm(SignExtend64<5>(insn)));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeSIMM13(MCInst &MI, unsigned insn, uint64_t Address,
- const MCDisassembler *Decoder) {
- assert(isUInt<13>(insn));
- MI.addOperand(MCOperand::createImm(SignExtend64<13>(insn)));
- return MCDisassembler::Success;
-}
-
-template <unsigned N>
-constexpr static DecodeStatus DecodeDisp(MCInst &MI, uint32_t ImmVal,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- int64_t BranchOffset = SignExtend64(ImmVal, N) * 4;
- if (!tryAddingSymbolicOperand(Address + BranchOffset, true, Address, 0, N, MI,
- Decoder))
- MI.addOperand(MCOperand::createImm(BranchOffset));
- return MCDisassembler::Success;
-}
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
index fa07578e512b..9fa60ee5229b 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -81,8 +81,16 @@ static MCRegisterInfo *createSparcMCRegisterInfo(const Triple &TT) {
static MCSubtargetInfo *
createSparcMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
if (CPU.empty())
- CPU = (TT.getArch() == Triple::sparcv9) ? "v9" : "v8";
- return createSparcMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
+ CPU = TT.getArch() == Triple::sparcv9 ? "v9" : "v8";
+
+ MCSubtargetInfo *STI =
+ createSparcMCSubtargetInfoImpl(TT, CPU, /*TuneCPU=*/CPU, FS);
+ if (TT.isSPARC64() && !STI->hasFeature(Sparc::Feature64Bit)) {
+ FeatureBitset Features = STI->getFeatureBits();
+ STI->setFeatureBits(Features.set(Sparc::Feature64Bit));
+ }
+
+ return STI;
}
static MCTargetStreamer *
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
index a7b0538d683b..b523366e6ada 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
@@ -28,6 +28,7 @@ class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
class Target;
+class Triple;
MCCodeEmitter *createSparcMCCodeEmitter(const MCInstrInfo &MCII,
MCContext &Ctx);
diff --git a/llvm/lib/Target/Sparc/Sparc.td b/llvm/lib/Target/Sparc/Sparc.td
index cee671e34951..7137e5fbff4f 100644
--- a/llvm/lib/Target/Sparc/Sparc.td
+++ b/llvm/lib/Target/Sparc/Sparc.td
@@ -34,6 +34,9 @@ def FeatureNoFMULS
def FeatureV9
: SubtargetFeature<"v9", "IsV9", "true",
"Enable SPARC-V9 instructions">;
+def Feature64Bit : SubtargetFeature<"64bit", "Is64Bit", "true",
+ "Enable 64-bit mode", [FeatureV9]>;
+
def FeatureV8Plus
: SubtargetFeature<"v8plus", "IsV8Plus", "true",
"Enable V8+ mode, allowing use of 64-bit V9 instructions in 32-bit code">;
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index d01218f573dc..2737cca62cd2 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1907,37 +1907,37 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
// Setup Runtime library names.
if (Subtarget->is64Bit() && !Subtarget->useSoftFloat()) {
- setLibcallImpl(RTLIB::ADD_F128, RTLIB::_Qp_add);
- setLibcallImpl(RTLIB::SUB_F128, RTLIB::_Qp_sub);
- setLibcallImpl(RTLIB::MUL_F128, RTLIB::_Qp_mul);
- setLibcallImpl(RTLIB::DIV_F128, RTLIB::_Qp_div);
- setLibcallImpl(RTLIB::SQRT_F128, RTLIB::_Qp_sqrt);
- setLibcallImpl(RTLIB::FPTOSINT_F128_I32, RTLIB::_Qp_qtoi);
- setLibcallImpl(RTLIB::FPTOUINT_F128_I32, RTLIB::_Qp_qtoui);
- setLibcallImpl(RTLIB::SINTTOFP_I32_F128, RTLIB::_Qp_itoq);
- setLibcallImpl(RTLIB::UINTTOFP_I32_F128, RTLIB::_Qp_uitoq);
- setLibcallImpl(RTLIB::FPTOSINT_F128_I64, RTLIB::_Qp_qtox);
- setLibcallImpl(RTLIB::FPTOUINT_F128_I64, RTLIB::_Qp_qtoux);
- setLibcallImpl(RTLIB::SINTTOFP_I64_F128, RTLIB::_Qp_xtoq);
- setLibcallImpl(RTLIB::UINTTOFP_I64_F128, RTLIB::_Qp_uxtoq);
- setLibcallImpl(RTLIB::FPEXT_F32_F128, RTLIB::_Qp_stoq);
- setLibcallImpl(RTLIB::FPEXT_F64_F128, RTLIB::_Qp_dtoq);
- setLibcallImpl(RTLIB::FPROUND_F128_F32, RTLIB::_Qp_qtos);
- setLibcallImpl(RTLIB::FPROUND_F128_F64, RTLIB::_Qp_qtod);
+ setLibcallImpl(RTLIB::ADD_F128, RTLIB::impl__Qp_add);
+ setLibcallImpl(RTLIB::SUB_F128, RTLIB::impl__Qp_sub);
+ setLibcallImpl(RTLIB::MUL_F128, RTLIB::impl__Qp_mul);
+ setLibcallImpl(RTLIB::DIV_F128, RTLIB::impl__Qp_div);
+ setLibcallImpl(RTLIB::SQRT_F128, RTLIB::impl__Qp_sqrt);
+ setLibcallImpl(RTLIB::FPTOSINT_F128_I32, RTLIB::impl__Qp_qtoi);
+ setLibcallImpl(RTLIB::FPTOUINT_F128_I32, RTLIB::impl__Qp_qtoui);
+ setLibcallImpl(RTLIB::SINTTOFP_I32_F128, RTLIB::impl__Qp_itoq);
+ setLibcallImpl(RTLIB::UINTTOFP_I32_F128, RTLIB::impl__Qp_uitoq);
+ setLibcallImpl(RTLIB::FPTOSINT_F128_I64, RTLIB::impl__Qp_qtox);
+ setLibcallImpl(RTLIB::FPTOUINT_F128_I64, RTLIB::impl__Qp_qtoux);
+ setLibcallImpl(RTLIB::SINTTOFP_I64_F128, RTLIB::impl__Qp_xtoq);
+ setLibcallImpl(RTLIB::UINTTOFP_I64_F128, RTLIB::impl__Qp_uxtoq);
+ setLibcallImpl(RTLIB::FPEXT_F32_F128, RTLIB::impl__Qp_stoq);
+ setLibcallImpl(RTLIB::FPEXT_F64_F128, RTLIB::impl__Qp_dtoq);
+ setLibcallImpl(RTLIB::FPROUND_F128_F32, RTLIB::impl__Qp_qtos);
+ setLibcallImpl(RTLIB::FPROUND_F128_F64, RTLIB::impl__Qp_qtod);
} else if (!Subtarget->useSoftFloat()) {
- setLibcallImpl(RTLIB::ADD_F128, RTLIB::_Q_add);
- setLibcallImpl(RTLIB::SUB_F128, RTLIB::_Q_sub);
- setLibcallImpl(RTLIB::MUL_F128, RTLIB::_Q_mul);
- setLibcallImpl(RTLIB::DIV_F128, RTLIB::_Q_div);
- setLibcallImpl(RTLIB::SQRT_F128, RTLIB::_Q_sqrt);
- setLibcallImpl(RTLIB::FPTOSINT_F128_I32, RTLIB::_Q_qtoi);
- setLibcallImpl(RTLIB::FPTOUINT_F128_I32, RTLIB::_Q_qtou);
- setLibcallImpl(RTLIB::SINTTOFP_I32_F128, RTLIB::_Q_itoq);
- setLibcallImpl(RTLIB::UINTTOFP_I32_F128, RTLIB::_Q_utoq);
- setLibcallImpl(RTLIB::FPEXT_F32_F128, RTLIB::_Q_stoq);
- setLibcallImpl(RTLIB::FPEXT_F64_F128, RTLIB::_Q_dtoq);
- setLibcallImpl(RTLIB::FPROUND_F128_F32, RTLIB::_Q_qtos);
- setLibcallImpl(RTLIB::FPROUND_F128_F64, RTLIB::_Q_qtod);
+ setLibcallImpl(RTLIB::ADD_F128, RTLIB::impl__Q_add);
+ setLibcallImpl(RTLIB::SUB_F128, RTLIB::impl__Q_sub);
+ setLibcallImpl(RTLIB::MUL_F128, RTLIB::impl__Q_mul);
+ setLibcallImpl(RTLIB::DIV_F128, RTLIB::impl__Q_div);
+ setLibcallImpl(RTLIB::SQRT_F128, RTLIB::impl__Q_sqrt);
+ setLibcallImpl(RTLIB::FPTOSINT_F128_I32, RTLIB::impl__Q_qtoi);
+ setLibcallImpl(RTLIB::FPTOUINT_F128_I32, RTLIB::impl__Q_qtou);
+ setLibcallImpl(RTLIB::SINTTOFP_I32_F128, RTLIB::impl__Q_itoq);
+ setLibcallImpl(RTLIB::UINTTOFP_I32_F128, RTLIB::impl__Q_utoq);
+ setLibcallImpl(RTLIB::FPEXT_F32_F128, RTLIB::impl__Q_stoq);
+ setLibcallImpl(RTLIB::FPEXT_F64_F128, RTLIB::impl__Q_dtoq);
+ setLibcallImpl(RTLIB::FPROUND_F128_F32, RTLIB::impl__Q_qtos);
+ setLibcallImpl(RTLIB::FPROUND_F128_F64, RTLIB::impl__Q_qtod);
}
}
@@ -3510,7 +3510,7 @@ void SparcTargetLowering::ReplaceNodeResults(SDNode *N,
// Override to enable LOAD_STACK_GUARD lowering on Linux.
bool SparcTargetLowering::useLoadStackGuardNode(const Module &M) const {
- if (!Subtarget->isTargetLinux())
+ if (!Subtarget->getTargetTriple().isOSLinux())
return TargetLowering::useLoadStackGuardNode(M);
return true;
}
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
index a7fbbd4044c1..cd0f64991298 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -37,8 +37,8 @@ static cl::opt<unsigned>
// Pin the vtable to this file.
void SparcInstrInfo::anchor() {}
-SparcInstrInfo::SparcInstrInfo(SparcSubtarget &ST)
- : SparcGenInstrInfo(SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), RI(),
+SparcInstrInfo::SparcInstrInfo(const SparcSubtarget &ST)
+ : SparcGenInstrInfo(ST, SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), RI(),
Subtarget(ST) {}
/// isLoadFromStackSlot - If the specified machine instruction is a direct
@@ -643,7 +643,7 @@ unsigned SparcInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
bool SparcInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
switch (MI.getOpcode()) {
case TargetOpcode::LOAD_STACK_GUARD: {
- assert(Subtarget.isTargetLinux() &&
+ assert(Subtarget.getTargetTriple().isOSLinux() &&
"Only Linux target is expected to contain LOAD_STACK_GUARD");
// offsetof(tcbhead_t, stack_guard) from sysdeps/sparc/nptl/tls.h in glibc.
const int64_t Offset = Subtarget.is64Bit() ? 0x28 : 0x14;
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.h b/llvm/lib/Target/Sparc/SparcInstrInfo.h
index 1feb12ba2fda..01d020473494 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.h
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.h
@@ -40,7 +40,7 @@ class SparcInstrInfo : public SparcGenInstrInfo {
const SparcSubtarget& Subtarget;
virtual void anchor();
public:
- explicit SparcInstrInfo(SparcSubtarget &ST);
+ explicit SparcInstrInfo(const SparcSubtarget &ST);
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
/// such, whenever a client has an instance of instruction info, it should
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.td b/llvm/lib/Target/Sparc/SparcInstrInfo.td
index 1a32eafb0e83..53972d6c105a 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.td
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.td
@@ -1785,22 +1785,22 @@ let Predicates = [HasV9], Uses = [ASR3], Constraints = "$swap = $rd" in
// as inline assembler-supported instructions.
let Predicates = [HasUMAC_SMAC], Defs = [Y, ASR18], Uses = [Y, ASR18] in {
def SMACrr : F3_1<2, 0b111111,
- (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2, ASRRegs:$asr18),
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
"smac $rs1, $rs2, $rd",
[], IIC_smac_umac>;
def SMACri : F3_2<2, 0b111111,
- (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13, ASRRegs:$asr18),
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
"smac $rs1, $simm13, $rd",
[], IIC_smac_umac>;
def UMACrr : F3_1<2, 0b111110,
- (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2, ASRRegs:$asr18),
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
"umac $rs1, $rs2, $rd",
[], IIC_smac_umac>;
def UMACri : F3_2<2, 0b111110,
- (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13, ASRRegs:$asr18),
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
"umac $rs1, $simm13, $rd",
[], IIC_smac_umac>;
}
diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.cpp b/llvm/lib/Target/Sparc/SparcSubtarget.cpp
index e42df1d68613..005930834a0c 100644
--- a/llvm/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/llvm/lib/Target/Sparc/SparcSubtarget.cpp
@@ -28,10 +28,11 @@ void SparcSubtarget::anchor() { }
SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(
StringRef CPU, StringRef TuneCPU, StringRef FS) {
+ const Triple &TT = getTargetTriple();
// Determine default and user specified characteristics
std::string CPUName = std::string(CPU);
if (CPUName.empty())
- CPUName = (Is64Bit) ? "v9" : "v8";
+ CPUName = TT.isSPARC64() ? "v9" : "v8";
if (TuneCPU.empty())
TuneCPU = CPUName;
@@ -39,6 +40,12 @@ SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(
// Parse features string.
ParseSubtargetFeatures(CPUName, TuneCPU, FS);
+ if (!Is64Bit && TT.isSPARC64()) {
+ FeatureBitset Features = getFeatureBits();
+ setFeatureBits(Features.set(Sparc::Feature64Bit));
+ Is64Bit = true;
+ }
+
// Popc is a v9-only instruction.
if (!IsV9)
UsePopc = false;
@@ -47,11 +54,9 @@ SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(
}
SparcSubtarget::SparcSubtarget(const StringRef &CPU, const StringRef &TuneCPU,
- const StringRef &FS, const TargetMachine &TM,
- bool is64Bit)
+ const StringRef &FS, const TargetMachine &TM)
: SparcGenSubtargetInfo(TM.getTargetTriple(), CPU, TuneCPU, FS),
ReserveRegister(TM.getMCRegisterInfo()->getNumRegs()),
- TargetTriple(TM.getTargetTriple()), Is64Bit(is64Bit),
InstrInfo(initializeSubtargetDependencies(CPU, TuneCPU, FS)),
TLInfo(TM, *this), FrameLowering(*this) {
TSInfo = std::make_unique<SparcSelectionDAGInfo>();
diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.h b/llvm/lib/Target/Sparc/SparcSubtarget.h
index 5785c199f44b..b1decca0a4f0 100644
--- a/llvm/lib/Target/Sparc/SparcSubtarget.h
+++ b/llvm/lib/Target/Sparc/SparcSubtarget.h
@@ -34,11 +34,8 @@ class SparcSubtarget : public SparcGenSubtargetInfo {
// register.
BitVector ReserveRegister;
- Triple TargetTriple;
virtual void anchor();
- bool Is64Bit;
-
#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
bool ATTRIBUTE = DEFAULT;
#include "SparcGenSubtargetInfo.inc"
@@ -50,7 +47,7 @@ class SparcSubtarget : public SparcGenSubtargetInfo {
public:
SparcSubtarget(const StringRef &CPU, const StringRef &TuneCPU,
- const StringRef &FS, const TargetMachine &TM, bool is64bit);
+ const StringRef &FS, const TargetMachine &TM);
~SparcSubtarget() override;
@@ -80,8 +77,6 @@ public:
StringRef TuneCPU,
StringRef FS);
- bool is64Bit() const { return Is64Bit; }
-
/// The 64-bit ABI uses biased stack and frame pointers, so the stack frame
/// of the current function is the area from [%sp+BIAS] to [%fp+BIAS].
int64_t getStackPointerBias() const {
@@ -96,8 +91,6 @@ public:
/// returns adjusted framesize which includes space for register window
/// spills and arguments.
int getAdjustedFrameSize(int stackSize) const;
-
- bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
};
} // end namespace llvm
diff --git a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
index 52076a6b4dd2..754c8f63ca4e 100644
--- a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -38,7 +38,9 @@ static cl::opt<bool>
BranchRelaxation("sparc-enable-branch-relax", cl::Hidden, cl::init(true),
cl::desc("Relax out of range conditional branches"));
-static std::string computeDataLayout(const Triple &T, bool is64Bit) {
+static std::string computeDataLayout(const Triple &T) {
+ const bool is64Bit = T.isSPARC64();
+
// Sparc is typically big endian, but some are little.
std::string Ret = T.getArch() == Triple::sparcel ? "e" : "E";
Ret += "-m:e";
@@ -107,15 +109,14 @@ SparcTargetMachine::SparcTargetMachine(const Target &T, const Triple &TT,
const TargetOptions &Options,
std::optional<Reloc::Model> RM,
std::optional<CodeModel::Model> CM,
- CodeGenOptLevel OL, bool JIT,
- bool is64bit)
+ CodeGenOptLevel OL, bool JIT)
: CodeGenTargetMachineImpl(
- T, computeDataLayout(TT, is64bit), TT, CPU, FS, Options,
+ T, computeDataLayout(TT), TT, CPU, FS, Options,
getEffectiveRelocModel(RM),
- getEffectiveSparcCodeModel(CM, getEffectiveRelocModel(RM), is64bit,
- JIT),
+ getEffectiveSparcCodeModel(CM, getEffectiveRelocModel(RM),
+ TT.isSPARC64(), JIT),
OL),
- TLOF(std::make_unique<SparcELFTargetObjectFile>()), is64Bit(is64bit) {
+ TLOF(std::make_unique<SparcELFTargetObjectFile>()) {
initAsmInfo();
}
@@ -148,8 +149,7 @@ SparcTargetMachine::getSubtargetImpl(const Function &F) const {
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = std::make_unique<SparcSubtarget>(CPU, TuneCPU, FS, *this,
- this->is64Bit);
+ I = std::make_unique<SparcSubtarget>(CPU, TuneCPU, FS, *this);
}
return I.get();
}
@@ -212,7 +212,7 @@ SparcV8TargetMachine::SparcV8TargetMachine(const Target &T, const Triple &TT,
std::optional<Reloc::Model> RM,
std::optional<CodeModel::Model> CM,
CodeGenOptLevel OL, bool JIT)
- : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT, false) {}
+ : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT) {}
void SparcV9TargetMachine::anchor() { }
@@ -222,7 +222,7 @@ SparcV9TargetMachine::SparcV9TargetMachine(const Target &T, const Triple &TT,
std::optional<Reloc::Model> RM,
std::optional<CodeModel::Model> CM,
CodeGenOptLevel OL, bool JIT)
- : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT, true) {}
+ : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT) {}
void SparcelTargetMachine::anchor() {}
@@ -232,4 +232,4 @@ SparcelTargetMachine::SparcelTargetMachine(const Target &T, const Triple &TT,
std::optional<Reloc::Model> RM,
std::optional<CodeModel::Model> CM,
CodeGenOptLevel OL, bool JIT)
- : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT, false) {}
+ : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT) {}
diff --git a/llvm/lib/Target/Sparc/SparcTargetMachine.h b/llvm/lib/Target/Sparc/SparcTargetMachine.h
index 9a226a47671b..e7d038c5779d 100644
--- a/llvm/lib/Target/Sparc/SparcTargetMachine.h
+++ b/llvm/lib/Target/Sparc/SparcTargetMachine.h
@@ -23,7 +23,6 @@ namespace llvm {
class SparcTargetMachine : public CodeGenTargetMachineImpl {
std::unique_ptr<TargetLoweringObjectFile> TLOF;
- bool is64Bit;
mutable StringMap<std::unique_ptr<SparcSubtarget>> SubtargetMap;
public:
@@ -31,7 +30,7 @@ public:
StringRef FS, const TargetOptions &Options,
std::optional<Reloc::Model> RM,
std::optional<CodeModel::Model> CM, CodeGenOptLevel OL,
- bool JIT, bool is64bit);
+ bool JIT);
~SparcTargetMachine() override;
const SparcSubtarget *getSubtargetImpl(const Function &F) const override;
diff --git a/llvm/lib/Target/SystemZ/SystemZFeatures.td b/llvm/lib/Target/SystemZ/SystemZFeatures.td
index 2c48da8320fb..4ccc3d3079fc 100644
--- a/llvm/lib/Target/SystemZ/SystemZFeatures.td
+++ b/llvm/lib/Target/SystemZ/SystemZFeatures.td
@@ -196,7 +196,7 @@ def FeatureVector : SystemZFeature<
>;
def FeatureNoVector : SystemZMissingFeature<"Vector">;
-def NoVecHwMode : HwMode<"-vector", [FeatureNoVector]>;
+def NoVecHwMode : HwMode<[FeatureNoVector]>;
def Arch11NewFeatures : SystemZFeatureList<[
FeatureLoadAndZeroRightmostByte,
@@ -426,4 +426,3 @@ def Arch9UnsupportedFeatures
: SystemZFeatureAdd<Arch10UnsupportedFeatures.List, Arch10NewFeatures.List>;
def Arch8UnsupportedFeatures
: SystemZFeatureAdd<Arch9UnsupportedFeatures.List, Arch9NewFeatures.List>;
-
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index c73dc3021eb4..3b7d11a318dc 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -287,6 +287,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// Additional instructions available with z17.
if (Subtarget.hasVectorEnhancements3()) {
setOperationAction(ISD::ABS, MVT::i128, Legal);
+
+ setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX},
+ MVT::i128, Legal);
}
}
@@ -492,6 +495,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// Map SETCCs onto one of VCE, VCH or VCHL, swapping the operands
// and inverting the result as necessary.
setOperationAction(ISD::SETCC, VT, Custom);
+
+ setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, VT,
+ Legal);
}
}
@@ -6719,6 +6725,14 @@ SDValue SystemZTargetLowering::lowerFSHL(SDValue Op, SelectionDAG &DAG) const {
if ((ShiftAmt & 7) == 0 || Subtarget.hasVectorEnhancements2()) {
SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
+ if (ShiftAmt > 120) {
+ // For N in 121..128, fshl N == fshr (128 - N), and for 1 <= N < 8
+ // SHR_DOUBLE_BIT emits fewer instructions.
+ SDValue Val =
+ DAG.getNode(SystemZISD::SHR_DOUBLE_BIT, DL, MVT::v16i8, Op0, Op1,
+ DAG.getTargetConstant(128 - ShiftAmt, DL, MVT::i32));
+ return DAG.getBitcast(MVT::i128, Val);
+ }
SmallVector<int, 16> Mask(16);
for (unsigned Elt = 0; Elt < 16; Elt++)
Mask[Elt] = (ShiftAmt >> 3) + Elt;
@@ -6742,13 +6756,21 @@ SDValue SystemZTargetLowering::lowerFSHR(SDValue Op, SelectionDAG &DAG) const {
// i128 FSHR with a constant amount that is a multiple of 8 can be
// implemented via VECTOR_SHUFFLE. If we have the vector-enhancements-2
// facility, FSHR with a constant amount less than 8 can be implemented
- // via SHL_DOUBLE_BIT, and FSHR with other constant amounts by a
+ // via SHR_DOUBLE_BIT, and FSHR with other constant amounts by a
// combination of the two.
if (auto *ShiftAmtNode = dyn_cast<ConstantSDNode>(Op.getOperand(2))) {
uint64_t ShiftAmt = ShiftAmtNode->getZExtValue() & 127;
if ((ShiftAmt & 7) == 0 || Subtarget.hasVectorEnhancements2()) {
SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
+ if (ShiftAmt > 120) {
+ // For N in 121..128, fshr N == fshl (128 - N), and for 1 <= N < 8
+ // SHL_DOUBLE_BIT emits fewer instructions.
+ SDValue Val =
+ DAG.getNode(SystemZISD::SHL_DOUBLE_BIT, DL, MVT::v16i8, Op0, Op1,
+ DAG.getTargetConstant(128 - ShiftAmt, DL, MVT::i32));
+ return DAG.getBitcast(MVT::i128, Val);
+ }
SmallVector<int, 16> Mask(16);
for (unsigned Elt = 0; Elt < 16; Elt++)
Mask[Elt] = 16 - (ShiftAmt >> 3) + Elt;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
index d0a549518cc4..82415f412509 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -2646,28 +2646,24 @@ class BranchUnaryRI<string mnemonic, bits<12> opcode, RegisterOperand cls>
: InstRIb<opcode, (outs cls:$R1), (ins cls:$R1src, brtarget16:$RI2),
mnemonic#"\t$R1, $RI2", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
class BranchUnaryRIL<string mnemonic, bits<12> opcode, RegisterOperand cls>
: InstRILb<opcode, (outs cls:$R1), (ins cls:$R1src, brtarget32:$RI2),
mnemonic#"\t$R1, $RI2", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
class BranchUnaryRR<string mnemonic, bits<8> opcode, RegisterOperand cls>
: InstRR<opcode, (outs cls:$R1), (ins cls:$R1src, GR64:$R2),
mnemonic#"\t$R1, $R2", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
class BranchUnaryRRE<string mnemonic, bits<16> opcode, RegisterOperand cls>
: InstRRE<opcode, (outs cls:$R1), (ins cls:$R1src, GR64:$R2),
mnemonic#"\t$R1, $R2", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
class BranchUnaryRX<string mnemonic, bits<8> opcode, RegisterOperand cls>
@@ -2675,7 +2671,6 @@ class BranchUnaryRX<string mnemonic, bits<8> opcode, RegisterOperand cls>
(ins cls:$R1src, (bdxaddr12only $B2, $D2, $X2):$XBD2),
mnemonic#"\t$R1, $XBD2", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
class BranchUnaryRXY<string mnemonic, bits<16> opcode, RegisterOperand cls>
@@ -2683,14 +2678,12 @@ class BranchUnaryRXY<string mnemonic, bits<16> opcode, RegisterOperand cls>
(ins cls:$R1src, (bdxaddr20only $B2, $D2, $X2):$XBD2),
mnemonic#"\t$R1, $XBD2", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
class BranchBinaryRSI<string mnemonic, bits<8> opcode, RegisterOperand cls>
: InstRSI<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, brtarget16:$RI2),
mnemonic#"\t$R1, $R3, $RI2", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
class BranchBinaryRIEe<string mnemonic, bits<16> opcode, RegisterOperand cls>
@@ -2698,7 +2691,6 @@ class BranchBinaryRIEe<string mnemonic, bits<16> opcode, RegisterOperand cls>
(ins cls:$R1src, cls:$R3, brtarget16:$RI2),
mnemonic#"\t$R1, $R3, $RI2", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
class BranchBinaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls>
@@ -2706,7 +2698,6 @@ class BranchBinaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls>
(ins cls:$R1src, cls:$R3, (bdaddr12only $B2, $D2):$BD2),
mnemonic#"\t$R1, $R3, $BD2", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
class BranchBinaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls>
@@ -2715,7 +2706,6 @@ class BranchBinaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls>
(ins cls:$R1src, cls:$R3, (bdaddr20only $B2, $D2):$BD2),
mnemonic#"\t$R1, $R3, $BD2", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
class LoadMultipleRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
@@ -3116,7 +3106,6 @@ class UnaryTiedRRE<string mnemonic, bits<16> opcode, RegisterOperand cls>
: InstRRE<opcode, (outs cls:$R1), (ins cls:$R1src),
mnemonic#"\t$R1", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
let R2 = 0;
}
@@ -3125,7 +3114,6 @@ class UnaryMemRRFc<string mnemonic, bits<16> opcode,
: InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src),
mnemonic#"\t$R1, $R2", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
let M3 = 0;
}
@@ -3163,7 +3151,6 @@ class CondUnaryRSY<string mnemonic, bits<16> opcode,
(z_select_ccmask (operator bdaddr20only:$BD2), cls:$R1src,
cond4:$valid, cond4:$M3))]> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
let mayLoad = 1;
let AccessBytes = bytes;
let CCMaskLast = 1;
@@ -3184,7 +3171,6 @@ class AsmCondUnaryRSY<string mnemonic, bits<16> opcode,
let mayLoad = 1;
let AccessBytes = bytes;
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
// Like CondUnaryRSY, but with a fixed CC mask.
@@ -3194,7 +3180,6 @@ class FixedCondUnaryRSY<CondVariant V, string mnemonic, bits<16> opcode,
: InstRSYb<opcode, (outs cls:$R1), (ins cls:$R1src, (mode $B2, $D2):$BD2),
mnemonic#V.suffix#"\t$R1, $BD2", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
let mayLoad = 1;
let AccessBytes = bytes;
let isAsmParserOnly = V.alternate;
@@ -3439,7 +3424,6 @@ class SideEffectBinaryMemMemRR<string mnemonic, bits<8> opcode,
: InstRR<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src),
mnemonic#"\t$R1, $R2", []> {
let Constraints = "$R1 = $R1src, $R2 = $R2src";
- let DisableEncoding = "$R1src, $R2src";
}
class SideEffectBinaryMemRRE<string mnemonic, bits<16> opcode,
@@ -3447,7 +3431,6 @@ class SideEffectBinaryMemRRE<string mnemonic, bits<16> opcode,
: InstRRE<opcode, (outs cls2:$R2), (ins cls1:$R1, cls2:$R2src),
mnemonic#"\t$R1, $R2", []> {
let Constraints = "$R2 = $R2src";
- let DisableEncoding = "$R2src";
}
class SideEffectBinaryMemMemRRE<string mnemonic, bits<16> opcode,
@@ -3455,7 +3438,6 @@ class SideEffectBinaryMemMemRRE<string mnemonic, bits<16> opcode,
: InstRRE<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src),
mnemonic#"\t$R1, $R2", []> {
let Constraints = "$R1 = $R1src, $R2 = $R2src";
- let DisableEncoding = "$R1src, $R2src";
}
class SideEffectBinaryMemMemRRFc<string mnemonic, bits<16> opcode,
@@ -3463,7 +3445,6 @@ class SideEffectBinaryMemMemRRFc<string mnemonic, bits<16> opcode,
: InstRRFc<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src),
mnemonic#"\t$R1, $R2", []> {
let Constraints = "$R1 = $R1src, $R2 = $R2src";
- let DisableEncoding = "$R1src, $R2src";
let M3 = 0;
}
@@ -3475,7 +3456,6 @@ class BinaryRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
let OpKey = mnemonic#cls1;
let OpType = "reg";
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
class BinaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
@@ -3486,7 +3466,6 @@ class BinaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let OpKey = mnemonic#cls1;
let OpType = "reg";
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
class BinaryRRD<string mnemonic, bits<16> opcode, SDPatternOperator operator,
@@ -3565,7 +3544,6 @@ class BinaryMemRRFc<string mnemonic, bits<16> opcode,
: InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src, imm:$M3),
mnemonic#"\t$R1, $R2, $M3", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
multiclass BinaryMemRRFcOpt<string mnemonic, bits<16> opcode,
@@ -3594,7 +3572,6 @@ class CondBinaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
[(set cls1:$R1, (z_select_ccmask cls2:$R2, cls1:$R1src,
cond4:$valid, cond4:$M3))]> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
let CCMaskLast = 1;
let NumOpsKey = !subst("loc", "sel", mnemonic);
let NumOpsValue = "2";
@@ -3610,7 +3587,6 @@ class AsmCondBinaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
(ins cls1:$R1src, cls2:$R2, imm32zx4:$M3),
mnemonic#"\t$R1, $R2, $M3", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
// Like CondBinaryRRF, but with a fixed CC mask.
@@ -3619,7 +3595,6 @@ class FixedCondBinaryRRF<CondVariant V, string mnemonic, bits<16> opcode,
: InstRRFc<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
mnemonic#V.suffix#"\t$R1, $R2", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
let isAsmParserOnly = V.alternate;
let AsmVariantName = V.asmvariant;
let M3 = V.ccmask;
@@ -3678,7 +3653,6 @@ class BinaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
mnemonic#"\t$R1, $I2",
[(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
class BinaryRIE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
@@ -3707,7 +3681,6 @@ class CondBinaryRIE<string mnemonic, bits<16> opcode, RegisterOperand cls,
[(set cls:$R1, (z_select_ccmask imm:$I2, cls:$R1src,
cond4:$valid, cond4:$M3))]> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
let CCMaskLast = 1;
}
@@ -3719,7 +3692,6 @@ class AsmCondBinaryRIE<string mnemonic, bits<16> opcode, RegisterOperand cls,
(ins cls:$R1src, imm:$I2, imm32zx4:$M3),
mnemonic#"\t$R1, $I2, $M3", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
// Like CondBinaryRIE, but with a fixed CC mask.
@@ -3728,7 +3700,6 @@ class FixedCondBinaryRIE<CondVariant V, string mnemonic, bits<16> opcode,
: InstRIEg<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
mnemonic#V.suffix#"\t$R1, $I2", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
let isAsmParserOnly = V.alternate;
let AsmVariantName = V.asmvariant;
let M3 = V.ccmask;
@@ -3747,7 +3718,6 @@ class BinaryRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
mnemonic#"\t$R1, $I2",
[(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
class BinaryRS<string mnemonic, bits<8> opcode, SDPatternOperator operator,
@@ -3758,7 +3728,6 @@ class BinaryRS<string mnemonic, bits<8> opcode, SDPatternOperator operator,
[(set cls:$R1, (operator cls:$R1src, shift12only:$BD2))]> {
let R3 = 0;
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
class BinaryRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
@@ -3794,7 +3763,6 @@ class BinaryRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
let OpKey = mnemonic#"r"#cls;
let OpType = "mem";
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
let mayLoad = 1;
let AccessBytes = bytes;
}
@@ -3809,7 +3777,6 @@ class BinaryRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let OpKey = mnemonic#"r"#cls;
let OpType = "mem";
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
let mayLoad = 1;
let AccessBytes = bytes;
let M3 = 0;
@@ -3838,7 +3805,6 @@ class BinaryRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let OpKey = mnemonic#"r"#cls;
let OpType = "mem";
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
let mayLoad = 1;
let AccessBytes = bytes;
}
@@ -4500,7 +4466,6 @@ class SideEffectTernaryMemMemRRFa<string mnemonic, bits<16> opcode,
(ins cls1:$R1src, cls2:$R2src, cls3:$R3),
mnemonic#"\t$R1, $R2, $R3", []> {
let Constraints = "$R1 = $R1src, $R2 = $R2src";
- let DisableEncoding = "$R1src, $R2src";
let M4 = 0;
}
@@ -4520,7 +4485,6 @@ class SideEffectTernaryMemMemMemRRFb<string mnemonic, bits<16> opcode,
(ins cls1:$R1src, cls2:$R2src, cls3:$R3src),
mnemonic#"\t$R1, $R3, $R2", []> {
let Constraints = "$R1 = $R1src, $R2 = $R2src, $R3 = $R3src";
- let DisableEncoding = "$R1src, $R2src, $R3src";
let M4 = 0;
}
@@ -4544,7 +4508,6 @@ class SideEffectTernaryMemMemRRFc<string mnemonic, bits<16> opcode,
(ins cls1:$R1src, cls2:$R2src, imm:$M3),
mnemonic#"\t$R1, $R2, $M3", []> {
let Constraints = "$R1 = $R1src, $R2 = $R2src";
- let DisableEncoding = "$R1src, $R2src";
}
multiclass SideEffectTernaryMemMemRRFcOpt<string mnemonic, bits<16> opcode,
@@ -4574,7 +4537,6 @@ class TernaryRRFb<string mnemonic, bits<16> opcode,
(ins cls1:$R1src, cls2:$R2, imm32zx4:$M4),
mnemonic#"\t$R1, $R3, $R2, $M4", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
class TernaryRRFe<string mnemonic, bits<16> opcode, RegisterOperand cls1,
@@ -4591,7 +4553,6 @@ class TernaryRRD<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let OpKey = mnemonic#cls;
let OpType = "reg";
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
class TernaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
@@ -4601,7 +4562,6 @@ class TernaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
mnemonic#"\t$R1, $M3, $BD2", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
let mayLoad = 1;
let AccessBytes = bytes;
}
@@ -4613,7 +4573,6 @@ class TernaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
mnemonic#"\t$R1, $M3, $BD2", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
let mayLoad = 1;
let AccessBytes = bytes;
}
@@ -4646,7 +4605,6 @@ class SideEffectTernaryMemMemRS<string mnemonic, bits<8> opcode,
(ins cls1:$R1src, cls2:$R3src, (shift12only $B2, $D2):$BD2),
mnemonic#"\t$R1, $R3, $BD2", []> {
let Constraints = "$R1 = $R1src, $R3 = $R3src";
- let DisableEncoding = "$R1src, $R3src";
}
class SideEffectTernaryMemMemRSY<string mnemonic, bits<16> opcode,
@@ -4655,7 +4613,6 @@ class SideEffectTernaryMemMemRSY<string mnemonic, bits<16> opcode,
(ins cls1:$R1src, cls2:$R3src, (shift20only $B2, $D2):$BD2),
mnemonic#"\t$R1, $R3, $BD2", []> {
let Constraints = "$R1 = $R1src, $R3 = $R3src";
- let DisableEncoding = "$R1src, $R3src";
}
class TernaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
@@ -4669,7 +4626,6 @@ class TernaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let OpKey = mnemonic#"r"#cls;
let OpType = "mem";
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
let mayLoad = 1;
let AccessBytes = bytes;
}
@@ -4681,7 +4637,6 @@ class TernaryVRIa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
[(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V1src),
imm:$I2, index:$M3))]> {
let Constraints = "$V1 = $V1src";
- let DisableEncoding = "$V1src";
}
class TernaryVRId<string mnemonic, bits<16> opcode, SDPatternOperator operator,
@@ -4893,7 +4848,6 @@ class TernaryVRSb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
cls:$R3,
shift12only:$BD2))]> {
let Constraints = "$V1 = $V1src";
- let DisableEncoding = "$V1src";
let M4 = type;
}
@@ -4913,7 +4867,6 @@ class TernaryVRSbGeneric<string mnemonic, bits<16> opcode>
imm32zx4:$M4),
mnemonic#"\t$V1, $R3, $BD2, $M4", []> {
let Constraints = "$V1 = $V1src";
- let DisableEncoding = "$V1src";
}
class TernaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes,
@@ -4922,7 +4875,6 @@ class TernaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes,
(ins VR128:$V1src, (bdvaddr12only $B2, $D2, $V2):$VBD2, index:$M3),
mnemonic#"\t$V1, $VBD2, $M3", []> {
let Constraints = "$V1 = $V1src";
- let DisableEncoding = "$V1src";
let mayLoad = 1;
let AccessBytes = bytes;
}
@@ -4936,7 +4888,6 @@ class TernaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
bdxaddr12only:$XBD2,
index:$M3))]> {
let Constraints = "$V1 = $V1src";
- let DisableEncoding = "$V1src";
let mayLoad = 1;
let AccessBytes = bytes;
}
@@ -4951,7 +4902,6 @@ class QuaternaryVRId<string mnemonic, bits<16> opcode, SDPatternOperator operato
(tr2.vt tr2.op:$V3),
imm32zx8_timm:$I4))]> {
let Constraints = "$V1 = $V1src";
- let DisableEncoding = "$V1src";
let M5 = type;
}
@@ -4961,7 +4911,6 @@ class QuaternaryVRIdGeneric<string mnemonic, bits<16> opcode>
imm32zx8:$I4, imm32zx4:$M5),
mnemonic#"\t$V1, $V2, $V3, $I4, $M5", []> {
let Constraints = "$V1 = $V1src";
- let DisableEncoding = "$V1src";
}
class QuaternaryVRIf<string mnemonic, bits<16> opcode>
@@ -5087,7 +5036,6 @@ class CmpSwapRRE<string mnemonic, bits<16> opcode,
: InstRRE<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
mnemonic#"\t$R1, $R2", []> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
let mayLoad = 1;
let mayStore = 1;
}
@@ -5099,7 +5047,6 @@ class CmpSwapRS<string mnemonic, bits<8> opcode, SDPatternOperator operator,
mnemonic#"\t$R1, $R3, $BD2",
[(set cls:$R1, (operator mode:$BD2, cls:$R1src, cls:$R3))]> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
let mayLoad = 1;
let mayStore = 1;
}
@@ -5111,7 +5058,6 @@ class CmpSwapRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
mnemonic#"\t$R1, $R3, $BD2",
[(set cls:$R1, (operator mode:$BD2, cls:$R1src, cls:$R3))]> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
let mayLoad = 1;
let mayStore = 1;
}
@@ -5128,7 +5074,7 @@ multiclass CmpSwapRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
multiclass RotateSelectRIEf<string mnemonic, bits<16> opcode, RegisterOperand cls1,
RegisterOperand cls2, bits<8> I3Or = 0, bits<8> I4Or = 0> {
- let Constraints = "$R1 = $R1src", DisableEncoding = "$R1src" in {
+ let Constraints = "$R1 = $R1src" in {
def "" : InstRIEf<opcode, (outs cls1:$R1),
(ins cls1:$R1src, cls2:$R2, imm32zx8:$I3, imm32zx8:$I4,
imm32zx8:$I5),
@@ -5328,7 +5274,6 @@ class CondBinaryRRFPseudo<string mnemonic, RegisterOperand cls1,
[(set cls1:$R1, (z_select_ccmask cls2:$R2, cls1:$R1src,
cond4:$valid, cond4:$M3))]> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
let CCMaskLast = 1;
let NumOpsKey = !subst("loc", "sel", mnemonic);
let NumOpsValue = "2";
@@ -5359,7 +5304,6 @@ class CondBinaryRIEPseudo<RegisterOperand cls, ImmOpWithPattern imm>
[(set cls:$R1, (z_select_ccmask imm:$I2, cls:$R1src,
cond4:$valid, cond4:$M3))]> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
let CCMaskLast = 1;
}
@@ -5374,7 +5318,6 @@ class CondUnaryRSYPseudo<string mnemonic, SDPatternOperator operator,
(z_select_ccmask (operator mode:$BD2), cls:$R1src,
cond4:$valid, cond4:$R3))]> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
let mayLoad = 1;
let AccessBytes = bytes;
let CCMaskLast = 1;
@@ -5414,7 +5357,6 @@ class RotateSelectRIEfPseudo<RegisterOperand cls1, RegisterOperand cls2>
imm32zx8:$I5),
[]> {
let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
}
// Implements "$dst = $cc & (8 >> CC) ? $src1 : $src2", where CC is
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 783f86aecce4..2e21f27c9032 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -59,8 +59,8 @@ static uint64_t allOnes(unsigned int Count) {
// Pin the vtable to this file.
void SystemZInstrInfo::anchor() {}
-SystemZInstrInfo::SystemZInstrInfo(SystemZSubtarget &sti)
- : SystemZGenInstrInfo(-1, -1),
+SystemZInstrInfo::SystemZInstrInfo(const SystemZSubtarget &sti)
+ : SystemZGenInstrInfo(sti, -1, -1),
RI(sti.getSpecialRegisters()->getReturnFunctionAddressRegister(),
sti.getHwMode()),
STI(sti) {}
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
index 8b82af61e669..7b9ad7b87a14 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -184,7 +184,7 @@ MachineBasicBlock *splitBlockBefore(MachineBasicBlock::iterator MI,
class SystemZInstrInfo : public SystemZGenInstrInfo {
const SystemZRegisterInfo RI;
- SystemZSubtarget &STI;
+ const SystemZSubtarget &STI;
void splitMove(MachineBasicBlock::iterator MI, unsigned NewOpcode) const;
void splitAdjDynAlloc(MachineBasicBlock::iterator MI) const;
@@ -225,7 +225,7 @@ protected:
unsigned CommuteOpIdx2) const override;
public:
- explicit SystemZInstrInfo(SystemZSubtarget &STI);
+ explicit SystemZInstrInfo(const SystemZSubtarget &STI);
// Override TargetInstrInfo.
Register isLoadFromStackSlot(const MachineInstr &MI,
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index 10de8b05cf45..479bab5ce62b 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -680,41 +680,41 @@ let Predicates = [FeatureVector] in {
let isCommutable = 1 in {
// Maximum.
def VMX : BinaryVRRcGeneric<"vmx", 0xE7FF>;
- def VMXB : BinaryVRRc<"vmxb", 0xE7FF, null_frag, v128b, v128b, 0>;
- def VMXH : BinaryVRRc<"vmxh", 0xE7FF, null_frag, v128h, v128h, 1>;
- def VMXF : BinaryVRRc<"vmxf", 0xE7FF, null_frag, v128f, v128f, 2>;
- def VMXG : BinaryVRRc<"vmxg", 0xE7FF, null_frag, v128g, v128g, 3>;
+ def VMXB : BinaryVRRc<"vmxb", 0xE7FF, smax, v128b, v128b, 0>;
+ def VMXH : BinaryVRRc<"vmxh", 0xE7FF, smax, v128h, v128h, 1>;
+ def VMXF : BinaryVRRc<"vmxf", 0xE7FF, smax, v128f, v128f, 2>;
+ def VMXG : BinaryVRRc<"vmxg", 0xE7FF, smax, v128g, v128g, 3>;
let Predicates = [FeatureVectorEnhancements3] in
- def VMXQ : BinaryVRRc<"vmxq", 0xE7FF, null_frag, v128q, v128q, 4>;
+ def VMXQ : BinaryVRRc<"vmxq", 0xE7FF, smax, v128q, v128q, 4>;
// Maximum logical.
def VMXL : BinaryVRRcGeneric<"vmxl", 0xE7FD>;
- def VMXLB : BinaryVRRc<"vmxlb", 0xE7FD, null_frag, v128b, v128b, 0>;
- def VMXLH : BinaryVRRc<"vmxlh", 0xE7FD, null_frag, v128h, v128h, 1>;
- def VMXLF : BinaryVRRc<"vmxlf", 0xE7FD, null_frag, v128f, v128f, 2>;
- def VMXLG : BinaryVRRc<"vmxlg", 0xE7FD, null_frag, v128g, v128g, 3>;
+ def VMXLB : BinaryVRRc<"vmxlb", 0xE7FD, umax, v128b, v128b, 0>;
+ def VMXLH : BinaryVRRc<"vmxlh", 0xE7FD, umax, v128h, v128h, 1>;
+ def VMXLF : BinaryVRRc<"vmxlf", 0xE7FD, umax, v128f, v128f, 2>;
+ def VMXLG : BinaryVRRc<"vmxlg", 0xE7FD, umax, v128g, v128g, 3>;
let Predicates = [FeatureVectorEnhancements3] in
- def VMXLQ : BinaryVRRc<"vmxlq", 0xE7FD, null_frag, v128q, v128q, 4>;
+ def VMXLQ : BinaryVRRc<"vmxlq", 0xE7FD, umax, v128q, v128q, 4>;
}
let isCommutable = 1 in {
// Minimum.
def VMN : BinaryVRRcGeneric<"vmn", 0xE7FE>;
- def VMNB : BinaryVRRc<"vmnb", 0xE7FE, null_frag, v128b, v128b, 0>;
- def VMNH : BinaryVRRc<"vmnh", 0xE7FE, null_frag, v128h, v128h, 1>;
- def VMNF : BinaryVRRc<"vmnf", 0xE7FE, null_frag, v128f, v128f, 2>;
- def VMNG : BinaryVRRc<"vmng", 0xE7FE, null_frag, v128g, v128g, 3>;
+ def VMNB : BinaryVRRc<"vmnb", 0xE7FE, smin, v128b, v128b, 0>;
+ def VMNH : BinaryVRRc<"vmnh", 0xE7FE, smin, v128h, v128h, 1>;
+ def VMNF : BinaryVRRc<"vmnf", 0xE7FE, smin, v128f, v128f, 2>;
+ def VMNG : BinaryVRRc<"vmng", 0xE7FE, smin, v128g, v128g, 3>;
let Predicates = [FeatureVectorEnhancements3] in
- def VMNQ : BinaryVRRc<"vmnq", 0xE7FE, null_frag, v128q, v128q, 4>;
+ def VMNQ : BinaryVRRc<"vmnq", 0xE7FE, smin, v128q, v128q, 4>;
// Minimum logical.
def VMNL : BinaryVRRcGeneric<"vmnl", 0xE7FC>;
- def VMNLB : BinaryVRRc<"vmnlb", 0xE7FC, null_frag, v128b, v128b, 0>;
- def VMNLH : BinaryVRRc<"vmnlh", 0xE7FC, null_frag, v128h, v128h, 1>;
- def VMNLF : BinaryVRRc<"vmnlf", 0xE7FC, null_frag, v128f, v128f, 2>;
- def VMNLG : BinaryVRRc<"vmnlg", 0xE7FC, null_frag, v128g, v128g, 3>;
+ def VMNLB : BinaryVRRc<"vmnlb", 0xE7FC, umin, v128b, v128b, 0>;
+ def VMNLH : BinaryVRRc<"vmnlh", 0xE7FC, umin, v128h, v128h, 1>;
+ def VMNLF : BinaryVRRc<"vmnlf", 0xE7FC, umin, v128f, v128f, 2>;
+ def VMNLG : BinaryVRRc<"vmnlg", 0xE7FC, umin, v128g, v128g, 3>;
let Predicates = [FeatureVectorEnhancements3] in
- def VMNLQ : BinaryVRRc<"vmnlq", 0xE7FC, null_frag, v128q, v128q, 4>;
+ def VMNLQ : BinaryVRRc<"vmnlq", 0xE7FC, umin, v128q, v128q, 4>;
}
let isCommutable = 1 in {
@@ -1250,54 +1250,45 @@ defm : IntegerAbsoluteVectorOps<v8i16, VLCH, VLPH, 15>;
defm : IntegerAbsoluteVectorOps<v4i32, VLCF, VLPF, 31>;
defm : IntegerAbsoluteVectorOps<v2i64, VLCG, VLPG, 63>;
-// Instantiate minimum- and maximum-related patterns for TYPE. CMPH is the
-// signed or unsigned "set if greater than" comparison instruction and
-// MIN and MAX are the associated minimum and maximum instructions.
-multiclass IntegerMinMaxVectorOps<ValueType type, SDPatternOperator cmph,
- Instruction min, Instruction max> {
- let Predicates = [FeatureVector] in {
- def : Pat<(type (vselect (cmph VR128:$x, VR128:$y), VR128:$x, VR128:$y)),
- (max VR128:$x, VR128:$y)>;
- def : Pat<(type (vselect (cmph VR128:$x, VR128:$y), VR128:$y, VR128:$x)),
- (min VR128:$x, VR128:$y)>;
- def : Pat<(type (vselect (z_vnot (cmph VR128:$x, VR128:$y)),
- VR128:$x, VR128:$y)),
- (min VR128:$x, VR128:$y)>;
- def : Pat<(type (vselect (z_vnot (cmph VR128:$x, VR128:$y)),
- VR128:$y, VR128:$x)),
- (max VR128:$x, VR128:$y)>;
- }
+// Instantiate packs/packu: recognize a saturating truncation and convert
+// into the corresponding packs/packu instruction.
+multiclass SignedSaturatingTruncate<ValueType input, ValueType output,
+ Instruction packs> {
+ def : Pat<
+ (output (z_pack
+ (smin (smax (input VR128:$a), ssat_trunc_min_vec), ssat_trunc_max_vec),
+ (smin (smax (input VR128:$b), ssat_trunc_min_vec), ssat_trunc_max_vec)
+ )),
+ (packs VR128:$a, VR128:$b)
+ >;
+
+ def : Pat<
+ (output (z_pack
+ (smax (smin (input VR128:$a), ssat_trunc_max_vec), ssat_trunc_min_vec),
+ (smax (smin (input VR128:$b), ssat_trunc_max_vec), ssat_trunc_min_vec)
+ )),
+ (packs VR128:$a, VR128:$b)
+ >;
}
-// Signed min/max.
-defm : IntegerMinMaxVectorOps<v16i8, z_vicmph, VMNB, VMXB>;
-defm : IntegerMinMaxVectorOps<v8i16, z_vicmph, VMNH, VMXH>;
-defm : IntegerMinMaxVectorOps<v4i32, z_vicmph, VMNF, VMXF>;
-defm : IntegerMinMaxVectorOps<v2i64, z_vicmph, VMNG, VMXG>;
-
-let Predicates = [FeatureVectorEnhancements3] in {
- def : Pat<(i128 (or (and VR128:$x, (z_vicmph VR128:$x, VR128:$y)),
- (and VR128:$y, (not (z_vicmph VR128:$x, VR128:$y))))),
- (VMXQ VR128:$x, VR128:$y)>;
- def : Pat<(i128 (or (and VR128:$y, (z_vicmph VR128:$x, VR128:$y)),
- (and VR128:$x, (not (z_vicmph VR128:$x, VR128:$y))))),
- (VMNQ VR128:$x, VR128:$y)>;
+defm : SignedSaturatingTruncate<v8i16, v16i8, VPKSH>;
+defm : SignedSaturatingTruncate<v4i32, v8i16, VPKSF>;
+defm : SignedSaturatingTruncate<v2i64, v4i32, VPKSG>;
+
+multiclass UnsignedSaturatingTruncate<ValueType input, ValueType output,
+ Instruction packu> {
+ def : Pat<
+ (output (z_pack
+ (umin (input VR128:$a), usat_trunc_max_vec),
+ (umin (input VR128:$b), usat_trunc_max_vec)
+ )),
+ (packu VR128:$a, VR128:$b)
+ >;
}
-// Unsigned min/max.
-defm : IntegerMinMaxVectorOps<v16i8, z_vicmphl, VMNLB, VMXLB>;
-defm : IntegerMinMaxVectorOps<v8i16, z_vicmphl, VMNLH, VMXLH>;
-defm : IntegerMinMaxVectorOps<v4i32, z_vicmphl, VMNLF, VMXLF>;
-defm : IntegerMinMaxVectorOps<v2i64, z_vicmphl, VMNLG, VMXLG>;
-
-let Predicates = [FeatureVectorEnhancements3] in {
- def : Pat<(i128 (or (and VR128:$x, (z_vicmphl VR128:$x, VR128:$y)),
- (and VR128:$y, (not (z_vicmphl VR128:$x, VR128:$y))))),
- (VMXLQ VR128:$x, VR128:$y)>;
- def : Pat<(i128 (or (and VR128:$y, (z_vicmphl VR128:$x, VR128:$y)),
- (and VR128:$x, (not (z_vicmphl VR128:$x, VR128:$y))))),
- (VMNLQ VR128:$x, VR128:$y)>;
-}
+defm : UnsignedSaturatingTruncate<v8i16, v16i8, VPKLSH>;
+defm : UnsignedSaturatingTruncate<v4i32, v8i16, VPKLSF>;
+defm : UnsignedSaturatingTruncate<v2i64, v4i32, VPKLSG>;
// Instantiate comparison patterns to recognize VACC/VSCBI for TYPE.
multiclass IntegerComputeCarryOrBorrow<ValueType type,
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
index 39e216b993b1..547d3dcf9280 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -1067,6 +1067,31 @@ def vsplat_imm_eq_1 : PatFrag<(ops), (build_vector), [{
}]>;
def z_vzext1 : PatFrag<(ops node:$x), (and node:$x, vsplat_imm_eq_1)>;
+// Vector constants for saturating truncation, containing the minimum and
+// maximum value for the integer type that is half of the element width.
+def ssat_trunc_min_vec: PatFrag<(ops), (build_vector), [{
+ APInt Imm;
+ EVT EltTy = N->getValueType(0).getVectorElementType();
+ unsigned SizeInBits = EltTy.getSizeInBits();
+ APInt min = APInt::getSignedMinValue(SizeInBits / 2).sext(SizeInBits);
+ return ISD::isConstantSplatVector(N, Imm) && APInt::isSameValue(Imm, min);
+}]>;
+def ssat_trunc_max_vec: PatFrag<(ops), (build_vector), [{
+ APInt Imm;
+ EVT EltTy = N->getValueType(0).getVectorElementType();
+ unsigned SizeInBits = EltTy.getSizeInBits();
+ APInt max = APInt::getSignedMaxValue(SizeInBits / 2).sext(SizeInBits);
+ return ISD::isConstantSplatVector(N, Imm) && APInt::isSameValue(Imm, max);
+}]>;
+
+def usat_trunc_max_vec: PatFrag<(ops), (build_vector), [{
+ APInt Imm;
+ EVT EltTy = N->getValueType(0).getVectorElementType();
+ unsigned SizeInBits = EltTy.getSizeInBits();
+ APInt max = APInt::getMaxValue(SizeInBits / 2).zext(SizeInBits);
+ return ISD::isConstantSplatVector(N, Imm) && APInt::isSameValue(Imm, max);
+}]>;
+
// Signed "integer greater than zero" on vectors.
def z_vicmph_zero : PatFrag<(ops node:$x), (z_vicmph node:$x, immAllZerosV)>;
diff --git a/llvm/lib/Target/TargetLoweringObjectFile.cpp b/llvm/lib/Target/TargetLoweringObjectFile.cpp
index 28495e7c5719..343bcce80e3a 100644
--- a/llvm/lib/Target/TargetLoweringObjectFile.cpp
+++ b/llvm/lib/Target/TargetLoweringObjectFile.cpp
@@ -191,8 +191,9 @@ void TargetLoweringObjectFile::emitCGProfileMetadata(MCStreamer &Streamer,
}
}
-void TargetLoweringObjectFile::emitPseudoProbeDescMetadata(MCStreamer &Streamer,
- Module &M) const {
+void TargetLoweringObjectFile::emitPseudoProbeDescMetadata(
+ MCStreamer &Streamer, Module &M,
+ std::function<void(MCStreamer &Streamer)> COMDATSymEmitter) const {
NamedMDNode *FuncInfo = M.getNamedMetadata(PseudoProbeDescMetadataName);
if (!FuncInfo)
return;
@@ -213,6 +214,11 @@ void TargetLoweringObjectFile::emitPseudoProbeDescMetadata(MCStreamer &Streamer,
TM->getFunctionSections() ? Name->getString() : StringRef());
Streamer.switchSection(S);
+
+ // Emit the COFF COMDAT symbol, if the caller supplied an emitter.
+ if (COMDATSymEmitter)
+ COMDATSymEmitter(Streamer);
+
Streamer.emitInt64(GUID->getZExtValue());
Streamer.emitInt64(Hash->getZExtValue());
Streamer.emitULEB128IntValue(Name->getString().size());
diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index 69b6e26e602f..ad7e503cb155 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -162,7 +162,6 @@ void TargetMachine::resetTargetOptions(const Function &F) const {
RESET_OPTION(NoInfsFPMath, "no-infs-fp-math");
RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math");
RESET_OPTION(NoSignedZerosFPMath, "no-signed-zeros-fp-math");
- RESET_OPTION(ApproxFuncFPMath, "approx-func-fp-math");
}
/// Returns the code generation relocation model. The choices are static, PIC,
diff --git a/llvm/lib/Target/TargetMachineC.cpp b/llvm/lib/Target/TargetMachineC.cpp
index da6d35c8c8b4..aba6ea436e76 100644
--- a/llvm/lib/Target/TargetMachineC.cpp
+++ b/llvm/lib/Target/TargetMachineC.cpp
@@ -83,7 +83,8 @@ LLVMBool LLVMGetTargetFromTriple(const char* TripleStr, LLVMTargetRef *T,
char **ErrorMessage) {
std::string Error;
- *T = wrap(TargetRegistry::lookupTarget(TripleStr, Error));
+ Triple TT(TripleStr);
+ *T = wrap(TargetRegistry::lookupTarget(TT, Error));
if (!*T) {
if (ErrorMessage)
diff --git a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
index d7e1666a7417..aad826b5f285 100644
--- a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
+++ b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
@@ -57,6 +57,7 @@ LLVMInitializeVEDisassembler() {
createVEDisassembler);
}
+// clang-format off
static const unsigned I32RegDecoderTable[] = {
VE::SW0, VE::SW1, VE::SW2, VE::SW3, VE::SW4, VE::SW5, VE::SW6,
VE::SW7, VE::SW8, VE::SW9, VE::SW10, VE::SW11, VE::SW12, VE::SW13,
@@ -127,6 +128,7 @@ static const unsigned MiscRegDecoderTable[] = {
VE::PMC4, VE::PMC5, VE::PMC6, VE::PMC7,
VE::PMC8, VE::PMC9, VE::PMC10, VE::PMC11,
VE::PMC12, VE::PMC13, VE::PMC14};
+// clang-format on
static DecodeStatus DecodeI32RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
@@ -214,106 +216,6 @@ static DecodeStatus DecodeMISCRegisterClass(MCInst &Inst, unsigned RegNo,
return MCDisassembler::Success;
}
-static DecodeStatus DecodeASX(MCInst &Inst, uint64_t insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeLoadI32(MCInst &Inst, uint64_t insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeStoreI32(MCInst &Inst, uint64_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeLoadI64(MCInst &Inst, uint64_t insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeStoreI64(MCInst &Inst, uint64_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeLoadF32(MCInst &Inst, uint64_t insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeStoreF32(MCInst &Inst, uint64_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeLoadASI64(MCInst &Inst, uint64_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeStoreASI64(MCInst &Inst, uint64_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeTS1AMI64(MCInst &Inst, uint64_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeTS1AMI32(MCInst &Inst, uint64_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeCASI64(MCInst &Inst, uint64_t insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeCASI32(MCInst &Inst, uint64_t insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeCall(MCInst &Inst, uint64_t insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeSIMM7(MCInst &Inst, uint64_t insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeSIMM32(MCInst &Inst, uint64_t insn, uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeCCOperand(MCInst &Inst, uint64_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeRDOperand(MCInst &Inst, uint64_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeBranchCondition(MCInst &Inst, uint64_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-static DecodeStatus DecodeBranchConditionAlways(MCInst &Inst, uint64_t insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-#include "VEGenDisassemblerTables.inc"
-
-/// Read four bytes from the ArrayRef and return 32 bit word.
-static DecodeStatus readInstruction64(ArrayRef<uint8_t> Bytes, uint64_t Address,
- uint64_t &Size, uint64_t &Insn,
- bool IsLittleEndian) {
- // We want to read exactly 8 Bytes of data.
- if (Bytes.size() < 8) {
- Size = 0;
- return MCDisassembler::Fail;
- }
-
- Insn = IsLittleEndian
- ? ((uint64_t)Bytes[0] << 0) | ((uint64_t)Bytes[1] << 8) |
- ((uint64_t)Bytes[2] << 16) | ((uint64_t)Bytes[3] << 24) |
- ((uint64_t)Bytes[4] << 32) | ((uint64_t)Bytes[5] << 40) |
- ((uint64_t)Bytes[6] << 48) | ((uint64_t)Bytes[7] << 56)
- : ((uint64_t)Bytes[7] << 0) | ((uint64_t)Bytes[6] << 8) |
- ((uint64_t)Bytes[5] << 16) | ((uint64_t)Bytes[4] << 24) |
- ((uint64_t)Bytes[3] << 32) | ((uint64_t)Bytes[2] << 40) |
- ((uint64_t)Bytes[1] << 48) | ((uint64_t)Bytes[0] << 56);
-
- return MCDisassembler::Success;
-}
-
-DecodeStatus VEDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
- ArrayRef<uint8_t> Bytes,
- uint64_t Address,
- raw_ostream &CStream) const {
- uint64_t Insn;
- bool isLittleEndian = getContext().getAsmInfo()->isLittleEndian();
- DecodeStatus Result =
- readInstruction64(Bytes, Address, Size, Insn, isLittleEndian);
- if (Result == MCDisassembler::Fail)
- return MCDisassembler::Fail;
-
- // Calling the auto-generated decoder function.
-
- Result = decodeInstruction(DecoderTableVE64, Instr, Insn, Address, this, STI);
-
- if (Result != MCDisassembler::Fail) {
- Size = 8;
- return Result;
- }
-
- return MCDisassembler::Fail;
-}
-
typedef DecodeStatus (*DecodeFunc)(MCInst &MI, unsigned RegNo, uint64_t Address,
const MCDisassembler *Decoder);
@@ -629,3 +531,51 @@ static DecodeStatus DecodeBranchConditionAlways(MCInst &MI, uint64_t insn,
// Decode MEMri.
return DecodeAS(MI, insn, Address, Decoder);
}
+
+#include "VEGenDisassemblerTables.inc"
+
+/// Read eight bytes from the ArrayRef and return a 64-bit word.
+static DecodeStatus readInstruction64(ArrayRef<uint8_t> Bytes, uint64_t Address,
+ uint64_t &Size, uint64_t &Insn,
+ bool IsLittleEndian) {
+ // We want to read exactly 8 Bytes of data.
+ if (Bytes.size() < 8) {
+ Size = 0;
+ return MCDisassembler::Fail;
+ }
+
+ Insn = IsLittleEndian
+ ? ((uint64_t)Bytes[0] << 0) | ((uint64_t)Bytes[1] << 8) |
+ ((uint64_t)Bytes[2] << 16) | ((uint64_t)Bytes[3] << 24) |
+ ((uint64_t)Bytes[4] << 32) | ((uint64_t)Bytes[5] << 40) |
+ ((uint64_t)Bytes[6] << 48) | ((uint64_t)Bytes[7] << 56)
+ : ((uint64_t)Bytes[7] << 0) | ((uint64_t)Bytes[6] << 8) |
+ ((uint64_t)Bytes[5] << 16) | ((uint64_t)Bytes[4] << 24) |
+ ((uint64_t)Bytes[3] << 32) | ((uint64_t)Bytes[2] << 40) |
+ ((uint64_t)Bytes[1] << 48) | ((uint64_t)Bytes[0] << 56);
+
+ return MCDisassembler::Success;
+}
+
+DecodeStatus VEDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &CStream) const {
+ uint64_t Insn;
+ bool isLittleEndian = getContext().getAsmInfo()->isLittleEndian();
+ DecodeStatus Result =
+ readInstruction64(Bytes, Address, Size, Insn, isLittleEndian);
+ if (Result == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
+
+ // Calling the auto-generated decoder function.
+
+ Result = decodeInstruction(DecoderTableVE64, Instr, Insn, Address, this, STI);
+
+ if (Result != MCDisassembler::Fail) {
+ Size = 8;
+ return Result;
+ }
+
+ return MCDisassembler::Fail;
+}
diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp
index 98e4b452a8a5..d5e804afd27f 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.cpp
+++ b/llvm/lib/Target/VE/VEInstrInfo.cpp
@@ -34,8 +34,8 @@ using namespace llvm;
// Pin the vtable to this file.
void VEInstrInfo::anchor() {}
-VEInstrInfo::VEInstrInfo(VESubtarget &ST)
- : VEGenInstrInfo(VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI() {}
+VEInstrInfo::VEInstrInfo(const VESubtarget &ST)
+ : VEGenInstrInfo(ST, VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI() {}
static bool IsIntegerCC(unsigned CC) { return (CC < VECC::CC_AF); }
diff --git a/llvm/lib/Target/VE/VEInstrInfo.h b/llvm/lib/Target/VE/VEInstrInfo.h
index 49dcba503462..408d3ab9e05f 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.h
+++ b/llvm/lib/Target/VE/VEInstrInfo.h
@@ -53,7 +53,7 @@ class VEInstrInfo : public VEGenInstrInfo {
virtual void anchor();
public:
- explicit VEInstrInfo(VESubtarget &ST);
+ explicit VEInstrInfo(const VESubtarget &ST);
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
/// such, whenever a client has an instance of instruction info, it should
diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td
index 7e3f29b3bd82..9869f95ae566 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.td
+++ b/llvm/lib/Target/VE/VEInstrInfo.td
@@ -39,6 +39,8 @@ include "VEInstrFormats.td"
// e.g. 0.0 (0x00000000) or -2.0 (0xC0000000=(2)1).
//===----------------------------------------------------------------------===//
+defvar ve_ptr_rc = I64;
+
def ULO7 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() & 0x7f,
SDLoc(N), MVT::i32);
@@ -325,17 +327,17 @@ def VEMEMziiAsmOperand : AsmOperandClass {
// ASX format uses single assembly instruction format.
def MEMrri : Operand<iPTR> {
let PrintMethod = "printMemASXOperand";
- let MIOperandInfo = (ops ptr_rc, ptr_rc, i64imm);
+ let MIOperandInfo = (ops ve_ptr_rc, ve_ptr_rc, i64imm);
let ParserMatchClass = VEMEMrriAsmOperand;
}
def MEMrii : Operand<iPTR> {
let PrintMethod = "printMemASXOperand";
- let MIOperandInfo = (ops ptr_rc, i32imm, i64imm);
+ let MIOperandInfo = (ops ve_ptr_rc, i32imm, i64imm);
let ParserMatchClass = VEMEMriiAsmOperand;
}
def MEMzri : Operand<iPTR> {
let PrintMethod = "printMemASXOperand";
- let MIOperandInfo = (ops i32imm /* = 0 */, ptr_rc, i64imm);
+ let MIOperandInfo = (ops i32imm /* = 0 */, ve_ptr_rc, i64imm);
let ParserMatchClass = VEMEMzriAsmOperand;
}
def MEMzii : Operand<iPTR> {
@@ -358,7 +360,7 @@ def VEMEMziAsmOperand : AsmOperandClass {
// 1. AS generic assembly instruction format:
def MEMriASX : Operand<iPTR> {
let PrintMethod = "printMemASOperandASX";
- let MIOperandInfo = (ops ptr_rc, i32imm);
+ let MIOperandInfo = (ops ve_ptr_rc, i32imm);
let ParserMatchClass = VEMEMriAsmOperand;
}
def MEMziASX : Operand<iPTR> {
@@ -370,7 +372,7 @@ def MEMziASX : Operand<iPTR> {
// 2. AS RRM style assembly instruction format:
def MEMriRRM : Operand<iPTR> {
let PrintMethod = "printMemASOperandRRM";
- let MIOperandInfo = (ops ptr_rc, i32imm);
+ let MIOperandInfo = (ops ve_ptr_rc, i32imm);
let ParserMatchClass = VEMEMriAsmOperand;
}
def MEMziRRM : Operand<iPTR> {
@@ -382,7 +384,7 @@ def MEMziRRM : Operand<iPTR> {
// 3. AS HM style assembly instruction format:
def MEMriHM : Operand<iPTR> {
let PrintMethod = "printMemASOperandHM";
- let MIOperandInfo = (ops ptr_rc, i32imm);
+ let MIOperandInfo = (ops ve_ptr_rc, i32imm);
let ParserMatchClass = VEMEMriAsmOperand;
}
def MEMziHM : Operand<iPTR> {
@@ -642,7 +644,7 @@ multiclass RRIm<string opcStr, bits<8>opc,
// Special RR multiclass for 128 bits shift left instruction.
// e.g. SLD
-let Constraints = "$hi = $sx", DisableEncoding = "$hi", hasSideEffects = 0 in
+let Constraints = "$hi = $sx", hasSideEffects = 0 in
multiclass RRILDm<string opcStr, bits<8>opc, RegisterClass RC> {
def rrr : RR<opc, (outs RC:$sx), (ins RC:$hi, RC:$sz, I32:$sy),
!strconcat(opcStr, " $sx, $sz, $sy")>;
@@ -659,7 +661,7 @@ multiclass RRILDm<string opcStr, bits<8>opc, RegisterClass RC> {
// Special RR multiclass for 128 bits shift right instruction.
// e.g. SRD
-let Constraints = "$low = $sx", DisableEncoding = "$low", hasSideEffects = 0 in
+let Constraints = "$low = $sx", hasSideEffects = 0 in
multiclass RRIRDm<string opcStr, bits<8>opc, RegisterClass RC> {
def rrr : RR<opc, (outs RC:$sx), (ins RC:$sz, RC:$low, I32:$sy),
!strconcat(opcStr, " $sx, $sz, $sy")>;
@@ -689,7 +691,7 @@ multiclass RRI1m<string opcStr, bits<8>opc, RegisterClass RC, ValueType Ty,
// Special RR multiclass for MRG instruction.
// e.g. MRG
-let Constraints = "$sx = $sd", DisableEncoding = "$sd", hasSideEffects = 0 in
+let Constraints = "$sx = $sd", hasSideEffects = 0 in
multiclass RRMRGm<string opcStr, bits<8>opc, RegisterClass RC> {
def rr : RR<opc, (outs RC:$sx), (ins RC:$sy, RC:$sz, RC:$sd),
!strconcat(opcStr, " $sx, $sy, $sz")>;
@@ -722,7 +724,7 @@ multiclass RRSWPm<string opcStr, bits<8>opc,
// Multiclass for CMOV instructions.
// e.g. CMOVL, CMOVW, CMOVD, and etc.
-let Constraints = "$sx = $sd", DisableEncoding = "$sd", hasSideEffects = 0,
+let Constraints = "$sx = $sd", hasSideEffects = 0,
cfw = ? in
multiclass RRCMOVm<string opcStr, bits<8>opc, RegisterClass RC, ValueType Ty,
SDPatternOperator OpNode = null_frag,
@@ -805,7 +807,7 @@ multiclass PFCHm<string opcStr, bits<8>opc> {
// Multiclass for CAS instructions.
// e.g. TS1AML, TS1AMW, TS2AM, and etc.
-let Constraints = "$sx = $sd", DisableEncoding = "$sd",
+let Constraints = "$sx = $sd",
mayStore=1, mayLoad = 1, hasSideEffects = 0 in
multiclass RRCAStgm<string opcStr, bits<8>opc, RegisterClass RC, ValueType Ty,
Operand immOp, Operand MEM, ComplexPattern ADDR,
@@ -920,7 +922,7 @@ multiclass STORECRm<string opcStr, bits<8>opc, RegisterClass RC> {
!strconcat(opcStr, " $sx, $sy, $sz")>;
}
-let hasSideEffects = 1, Constraints = "$sx = $sx_in", DisableEncoding = "$sx_in" in
+let hasSideEffects = 1, Constraints = "$sx = $sx_in" in
multiclass TSCRm<string opcStr, bits<8>opc, RegisterClass RC> {
def rrr : RR<opc, (outs RC:$sx), (ins RC:$sy, RC:$sz, RC:$sx_in),
!strconcat(opcStr, " $sx, $sy, $sz")>;
diff --git a/llvm/lib/Target/VE/VEInstrVec.td b/llvm/lib/Target/VE/VEInstrVec.td
index 327ad9ceacc5..e0989bf6ad23 100644
--- a/llvm/lib/Target/VE/VEInstrVec.td
+++ b/llvm/lib/Target/VE/VEInstrVec.td
@@ -35,7 +35,7 @@ def STVM512rii : Pseudo<
// LVM/SVM instructions using VM512
let hasSideEffects = 0, isCodeGenOnly = 1 in {
- let Constraints = "$vx = $vd", DisableEncoding = "$vd" in {
+ let Constraints = "$vx = $vd" in {
def LVMyir_y : Pseudo<(outs VM512:$vx), (ins uimm3:$sy, I64:$sz, VM512:$vd),
"# pseudo LVM $vx, $sy, $sz, $vd">;
def LVMyim_y : Pseudo<(outs VM512:$vx),
@@ -51,7 +51,7 @@ let hasSideEffects = 0, isCodeGenOnly = 1 in {
}
// VFMK/VFMKW/VFMKS instructions using VM512
-let hasSideEffects = 0, isCodeGenOnly = 1, DisableEncoding = "$vl" in {
+let hasSideEffects = 0, isCodeGenOnly = 1 in {
def VFMKyal : Pseudo<(outs VM512:$vmx), (ins I32:$vl),
"# pseudo-vfmk.at $vmx">;
def VFMKynal : Pseudo<(outs VM512:$vmx), (ins I32:$vl),
@@ -126,21 +126,18 @@ let hasSideEffects = 0, isCodeGenOnly = 1 in {
// Multiclass for VLD instructions
let mayLoad = 1, hasSideEffects = 0, Uses = [VL] in
-multiclass VLDbm<string opcStr, bits<8>opc, RegisterClass RC, dag dag_in,
- string disEnc = ""> {
- let DisableEncoding = disEnc in
+multiclass VLDbm<string opcStr, bits<8>opc, RegisterClass RC, dag dag_in> {
def "" : RVM<opc, (outs RC:$vx), dag_in,
!strconcat(opcStr, " $vx, $sy, $sz")>;
- let Constraints = "$vx = $base", DisableEncoding = disEnc#"$base",
- isCodeGenOnly = 1 in
+ let Constraints = "$vx = $base", isCodeGenOnly = 1 in
def _v : RVM<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)),
!strconcat(opcStr, " $vx, $sy, $sz")>;
}
multiclass VLDlm<string opcStr, bits<8>opc, RegisterClass RC, dag dag_in> {
defm "" : VLDbm<opcStr, opc, RC, dag_in>;
let isCodeGenOnly = 1, VE_VLInUse = 1 in {
- defm l : VLDbm<opcStr, opc, RC, !con(dag_in, (ins I32:$vl)), "$vl,">;
- defm L : VLDbm<opcStr, opc, RC, !con(dag_in, (ins VLS:$vl)), "$vl,">;
+ defm l : VLDbm<opcStr, opc, RC, !con(dag_in, (ins I32:$vl))>;
+ defm L : VLDbm<opcStr, opc, RC, !con(dag_in, (ins VLS:$vl))>;
}
}
let VE_VLIndex = 3 in
@@ -182,7 +179,7 @@ let cx = 1 in defm VLDL2DZX : VLDm<"vldl2d.zx", 0xc3, V64>;
let mayStore = 1, hasSideEffects = 0, Uses = [VL] in
multiclass VSTbm<string opcStr, string argStr, bits<8>opc, dag dag_in> {
def "" : RVM<opc, (outs), dag_in, !strconcat(opcStr, argStr)>;
- let DisableEncoding = "$vl", isCodeGenOnly = 1, VE_VLInUse = 1 in {
+ let isCodeGenOnly = 1, VE_VLInUse = 1 in {
def l : RVM<opc, (outs), !con(dag_in, (ins I32:$vl)),
!strconcat(opcStr, argStr)>;
def L : RVM<opc, (outs), !con(dag_in, (ins VLS:$vl)),
@@ -232,12 +229,10 @@ defm VSTL2D : VSTm<"vstl2d", 0xd3, V64>;
// Multiclass for VGT instructions
let mayLoad = 1, hasSideEffects = 0, Uses = [VL] in
multiclass VGTbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
- dag dag_in, string disEnc = ""> {
- let DisableEncoding = disEnc in
+ dag dag_in> {
def "" : RVM<opc, (outs RC:$vx), dag_in,
!strconcat(opcStr, " $vx, ", argStr)>;
- let Constraints = "$vx = $base", DisableEncoding = disEnc#"$base",
- isCodeGenOnly = 1 in
+ let Constraints = "$vx = $base", isCodeGenOnly = 1 in
def _v : RVM<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)),
!strconcat(opcStr, " $vx, ", argStr)>;
}
@@ -245,10 +240,8 @@ multiclass VGTlm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
dag dag_in> {
defm "" : VGTbm<opcStr, argStr, opc, RC, dag_in>;
let isCodeGenOnly = 1, VE_VLInUse = 1 in {
- defm l : VGTbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl)),
- "$vl,">;
- defm L : VGTbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl)),
- "$vl,">;
+ defm l : VGTbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl))>;
+ defm L : VGTbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl))>;
}
}
multiclass VGTmm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
@@ -297,7 +290,7 @@ def : MnemonicAlias<"vgtl.nc", "vgtl.zx.nc">;
let mayStore = 1, hasSideEffects = 0, Uses = [VL] in
multiclass VSCbm<string opcStr, string argStr, bits<8>opc, dag dag_in> {
def "" : RVM<opc, (outs), dag_in, !strconcat(opcStr, argStr)>;
- let DisableEncoding = "$vl", isCodeGenOnly = 1, VE_VLInUse = 1 in {
+ let isCodeGenOnly = 1, VE_VLInUse = 1 in {
def l : RVM<opc, (outs), !con(dag_in, (ins I32:$vl)),
!strconcat(opcStr, argStr)>;
def L : RVM<opc, (outs), !con(dag_in, (ins VLS:$vl)),
@@ -348,7 +341,7 @@ defm VSCL : VSCm<"vscl", 0xb3, V64>;
let Uses = [VL] in
multiclass PFCHVbm<string opcStr, string argStr, bits<8>opc, dag dag_in> {
def "" : RVM<opc, (outs), dag_in, !strconcat(opcStr, argStr)>;
- let DisableEncoding = "$vl", isCodeGenOnly = 1, VE_VLInUse = 1 in {
+ let isCodeGenOnly = 1, VE_VLInUse = 1 in {
def l : RVM<opc, (outs), !con(dag_in, (ins I32:$vl)),
!strconcat(opcStr, argStr)>;
def L : RVM<opc, (outs), !con(dag_in, (ins VLS:$vl)),
@@ -373,8 +366,7 @@ let sx = 0, vx = ?, hasSideEffects = 0 in
multiclass LSVbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
dag dag_in> {
def "" : RR<opc, (outs RC:$vx), dag_in, !strconcat(opcStr, " ${vx}", argStr)>;
- let Constraints = "$vx = $base", DisableEncoding = "$base",
- isCodeGenOnly = 1 in
+ let Constraints = "$vx = $base", isCodeGenOnly = 1 in
def _v : RR<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)),
!strconcat(opcStr, " ${vx}", argStr)>;
}
@@ -406,8 +398,7 @@ multiclass LVMbm<string opcStr, string argStr, bits<8>opc, RegisterClass RCM,
dag dag_in> {
def "" : RR<opc, (outs RCM:$vx), dag_in,
!strconcat(opcStr, " $vx, ", argStr)>;
- let Constraints = "$vx = $base", DisableEncoding = "$base",
- isCodeGenOnly = 1 in {
+ let Constraints = "$vx = $base", isCodeGenOnly = 1 in {
def _m : RR<opc, (outs RCM:$vx), !con(dag_in, (ins RCM:$base)),
!strconcat(opcStr, " $vx, ", argStr)>;
}
@@ -440,11 +431,10 @@ defm SVM : SVMm<"svm", 0xa7, VM>;
// Section 8.9.24 - VBRD (Vector Broadcast)
let vx = ?, hasSideEffects = 0, Uses = [VL] in
multiclass VBRDbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
- dag dag_in, string disEnc = ""> {
- let DisableEncoding = disEnc in
+ dag dag_in> {
def "" : RV<opc, (outs RC:$vx), dag_in,
!strconcat(opcStr, " $vx, ", argStr)>;
- let Constraints = "$vx = $base", DisableEncoding = disEnc#"$base",
+ let Constraints = "$vx = $base",
isCodeGenOnly = 1 in
def _v : RV<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)),
!strconcat(opcStr, " $vx, ", argStr)>;
@@ -453,10 +443,8 @@ multiclass VBRDlm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
dag dag_in> {
defm "" : VBRDbm<opcStr, argStr, opc, RC, dag_in>;
let isCodeGenOnly = 1, VE_VLInUse = 1 in {
- defm l : VBRDbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl)),
- "$vl,">;
- defm L : VBRDbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl)),
- "$vl,">;
+ defm l : VBRDbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl))>;
+ defm L : VBRDbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl))>;
}
}
multiclass VBRDmm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
@@ -484,11 +472,10 @@ defm PVBRD : VBRDm<"pvbrd", 0x8c, V64, I64, VM512>;
// Section 8.9.25 - VMV (Vector Move)
let vx = ?, vz = ?, hasSideEffects = 0, Uses = [VL] in
multiclass VMVbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
- dag dag_in, string disEnc = ""> {
- let DisableEncoding = disEnc in
+ dag dag_in> {
def "" : RV<opc, (outs RC:$vx), dag_in,
!strconcat(opcStr, " $vx, ", argStr)>;
- let Constraints = "$vx = $base", DisableEncoding = disEnc#"$base",
+ let Constraints = "$vx = $base",
isCodeGenOnly = 1 in
def _v : RV<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)),
!strconcat(opcStr, " $vx, ", argStr)>;
@@ -497,10 +484,8 @@ multiclass VMVlm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
dag dag_in> {
defm "" : VMVbm<opcStr, argStr, opc, RC, dag_in>;
let isCodeGenOnly = 1, VE_VLInUse = 1 in {
- defm l : VMVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl)),
- "$vl,">;
- defm L : VMVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl)),
- "$vl,">;
+ defm l : VMVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl))>;
+ defm L : VMVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl))>;
}
}
multiclass VMVmm<string opcStr, bits<8>opc, RegisterClass RC,
@@ -525,12 +510,10 @@ defm VMV : VMVm<"vmv", 0x9c, V64, VM>;
// Multiclass for generic vector calculation
let vx = ?, hasSideEffects = 0, Uses = [VL] in
multiclass RVbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
- dag dag_in, string disEnc = ""> {
- let DisableEncoding = disEnc in
+ dag dag_in> {
def "" : RV<opc, (outs RC:$vx), dag_in,
!strconcat(opcStr, " $vx", argStr)>;
- let Constraints = "$vx = $base", DisableEncoding = disEnc#"$base",
- isCodeGenOnly = 1 in
+ let Constraints = "$vx = $base", isCodeGenOnly = 1 in
def _v : RV<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)),
!strconcat(opcStr, " $vx", argStr)>;
}
@@ -538,10 +521,8 @@ multiclass RVlm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
dag dag_in> {
defm "" : RVbm<opcStr, argStr, opc, RC, dag_in>;
let isCodeGenOnly = 1, VE_VLInUse = 1 in {
- defm l : RVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl)),
- "$vl,">;
- defm L : RVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl)),
- "$vl,">;
+ defm l : RVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl))>;
+ defm L : RVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl))>;
}
}
multiclass RVmm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
@@ -688,11 +669,10 @@ multiclass RVFIXm<string opcStr, bits<8> opc, RegisterClass RC,
// Multiclass for generic iterative vector calculation
let vx = ?, hasSideEffects = 0, Uses = [VL] in
multiclass RVIbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
- dag dag_in, string disEnc = ""> {
- let DisableEncoding = disEnc in
+ dag dag_in> {
def "" : RV<opc, (outs RC:$vx), dag_in,
!strconcat(opcStr, " $vx", argStr)>;
- let isCodeGenOnly = 1, Constraints = "$vx = $base", DisableEncoding = disEnc#"$base" in
+ let isCodeGenOnly = 1, Constraints = "$vx = $base" in
def _v : RV<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)),
!strconcat(opcStr, " $vx", argStr)>;
}
@@ -700,10 +680,8 @@ multiclass RVIlm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
dag dag_in> {
defm "" : RVIbm<opcStr, argStr, opc, RC, dag_in>;
let isCodeGenOnly = 1, VE_VLInUse = 1 in {
- defm l : RVIbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl)),
- "$vl,">;
- defm L : RVIbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl)),
- "$vl,">;
+ defm l : RVIbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl))>;
+ defm L : RVIbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl))>;
}
}
// Generic RV multiclass for iterative operation with 2 argument.
@@ -743,7 +721,7 @@ let vx = ?, hasSideEffects = 0, Uses = [VL] in
multiclass RVMKbm<string opcStr, string argStr, bits<8>opc, dag dag_out,
dag dag_in> {
def "" : RV<opc, dag_out, dag_in, !strconcat(opcStr, argStr)>;
- let DisableEncoding = "$vl", isCodeGenOnly = 1, VE_VLInUse = 1 in {
+ let isCodeGenOnly = 1, VE_VLInUse = 1 in {
def l : RV<opc, dag_out, !con(dag_in, (ins I32:$vl)),
!strconcat(opcStr, argStr)>;
def L : RV<opc, dag_out, !con(dag_in, (ins VLS:$vl)),
@@ -796,7 +774,7 @@ multiclass RVMSbm<string opcStr, string argStr, bits<8>opc, dag dag_in> {
bits<7> sx;
let Inst{54-48} = sx;
}
- let DisableEncoding = "$vl", isCodeGenOnly = 1, VE_VLInUse = 1 in {
+ let isCodeGenOnly = 1, VE_VLInUse = 1 in {
def l : RV<opc, (outs I64:$sx), !con(dag_in, (ins I32:$vl)),
!strconcat(opcStr, " $sx,", argStr)> {
bits<7> sx;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
index 378ef2c8f250..1eae3586d16b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -27,6 +27,7 @@ HANDLE_NODETYPE(WrapperREL)
HANDLE_NODETYPE(BR_IF)
HANDLE_NODETYPE(BR_TABLE)
HANDLE_NODETYPE(DOT)
+HANDLE_NODETYPE(EXT_ADD_PAIRWISE_U)
HANDLE_NODETYPE(SHUFFLE)
HANDLE_NODETYPE(SWIZZLE)
HANDLE_NODETYPE(VEC_SHL)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index c6c2d0cfccb6..fe100dab427e 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -2183,13 +2183,10 @@ SDValue performLowerPartialReduction(SDNode *N, SelectionDAG &DAG) {
SDValue MulLow = DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS);
SDValue MulHigh = DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS);
- SDValue LowLow = DAG.getNode(LowOpc, DL, MVT::v4i32, MulLow);
- SDValue LowHigh = DAG.getNode(LowOpc, DL, MVT::v4i32, MulHigh);
- SDValue HighLow = DAG.getNode(HighOpc, DL, MVT::v4i32, MulLow);
- SDValue HighHigh = DAG.getNode(HighOpc, DL, MVT::v4i32, MulHigh);
-
- SDValue AddLow = DAG.getNode(ISD::ADD, DL, MVT::v4i32, LowLow, HighLow);
- SDValue AddHigh = DAG.getNode(ISD::ADD, DL, MVT::v4i32, LowHigh, HighHigh);
+ SDValue AddLow =
+ DAG.getNode(WebAssemblyISD::EXT_ADD_PAIRWISE_U, DL, MVT::v4i32, MulLow);
+ SDValue AddHigh = DAG.getNode(WebAssemblyISD::EXT_ADD_PAIRWISE_U, DL,
+ MVT::v4i32, MulHigh);
SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::v4i32, AddLow, AddHigh);
return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add);
}
@@ -3588,34 +3585,53 @@ static SDValue performMulCombine(SDNode *N,
if (auto Res = TryWideExtMulCombine(N, DCI.DAG))
return Res;
- // We don't natively support v16i8 mul, but we do support v8i16 so split the
- // inputs and extend them to v8i16. Only do this before legalization in case
- // a narrow vector is widened and may be simplified later.
- if (!DCI.isBeforeLegalize() || VT != MVT::v16i8)
+ // We don't natively support v16i8 or v8i8 mul, but we do support v8i16. So,
+ // extend them to v8i16. Only do this before legalization in case a narrow
+ // vector is widened and may be simplified later.
+ if (!DCI.isBeforeLegalize() || (VT != MVT::v8i8 && VT != MVT::v16i8))
return SDValue();
SDLoc DL(N);
SelectionDAG &DAG = DCI.DAG;
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
- SDValue LowLHS =
- DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, LHS);
- SDValue HighLHS =
- DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, LHS);
- SDValue LowRHS =
- DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, RHS);
- SDValue HighRHS =
- DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, RHS);
-
- SDValue MulLow =
- DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS));
- SDValue MulHigh = DAG.getBitcast(
- VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS));
-
- // Take the low byte of each lane.
- return DAG.getVectorShuffle(
- VT, DL, MulLow, MulHigh,
- {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
+ EVT MulVT = MVT::v8i16;
+
+ if (VT == MVT::v8i8) {
+ SDValue PromotedLHS = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, LHS,
+ DAG.getUNDEF(MVT::v8i8));
+ SDValue PromotedRHS = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, RHS,
+ DAG.getUNDEF(MVT::v8i8));
+ SDValue LowLHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MulVT, PromotedLHS);
+ SDValue LowRHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MulVT, PromotedRHS);
+ SDValue MulLow = DAG.getBitcast(
+ MVT::v16i8, DAG.getNode(ISD::MUL, DL, MulVT, LowLHS, LowRHS));
+ // Take the low byte of each lane.
+ SDValue Shuffle = DAG.getVectorShuffle(
+ MVT::v16i8, DL, MulLow, DAG.getUNDEF(MVT::v16i8),
+ {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1});
+ return extractSubVector(Shuffle, 0, DAG, DL, 64);
+ } else {
+ assert(VT == MVT::v16i8 && "Expected v16i8");
+ SDValue LowLHS = DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MulVT, LHS);
+ SDValue LowRHS = DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MulVT, RHS);
+ SDValue HighLHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MulVT, LHS);
+ SDValue HighRHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MulVT, RHS);
+
+ SDValue MulLow =
+ DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MulVT, LowLHS, LowRHS));
+ SDValue MulHigh =
+ DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MulVT, HighLHS, HighRHS));
+
+ // Take the low byte of each lane.
+ return DAG.getVectorShuffle(
+ VT, DL, MulLow, MulHigh,
+ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
+ }
}
SDValue
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
index a934853ff9f4..feac04a17068 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -34,7 +34,7 @@ using namespace llvm;
#include "WebAssemblyGenInstrInfo.inc"
WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI)
- : WebAssemblyGenInstrInfo(WebAssembly::ADJCALLSTACKDOWN,
+ : WebAssemblyGenInstrInfo(STI, WebAssembly::ADJCALLSTACKDOWN,
WebAssembly::ADJCALLSTACKUP,
WebAssembly::CATCHRET),
RI(STI.getTargetTriple()) {}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index f06f8d5174e3..3c26b453c448 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1453,15 +1453,22 @@ if !ne(t1, t2) then
def : Pat<(t1.vt (bitconvert (t2.vt V128:$v))), (t1.vt V128:$v)>;
// Extended pairwise addition
+def extadd_pairwise_u : SDNode<"WebAssemblyISD::EXT_ADD_PAIRWISE_U", extend_t>;
+
defm "" : SIMDConvert<I16x8, I8x16, int_wasm_extadd_pairwise_signed,
"extadd_pairwise_i8x16_s", 0x7c>;
-defm "" : SIMDConvert<I16x8, I8x16, int_wasm_extadd_pairwise_unsigned,
+defm "" : SIMDConvert<I16x8, I8x16, extadd_pairwise_u,
"extadd_pairwise_i8x16_u", 0x7d>;
defm "" : SIMDConvert<I32x4, I16x8, int_wasm_extadd_pairwise_signed,
"extadd_pairwise_i16x8_s", 0x7e>;
-defm "" : SIMDConvert<I32x4, I16x8, int_wasm_extadd_pairwise_unsigned,
+defm "" : SIMDConvert<I32x4, I16x8, extadd_pairwise_u,
"extadd_pairwise_i16x8_u", 0x7f>;
+def : Pat<(v4i32 (int_wasm_extadd_pairwise_unsigned (v8i16 V128:$in))),
+ (extadd_pairwise_u_I32x4 V128:$in)>;
+def : Pat<(v8i16 (int_wasm_extadd_pairwise_unsigned (v16i8 V128:$in))),
+ (extadd_pairwise_u_I16x8 V128:$in)>;
+
// f64x2 <-> f32x4 conversions
def demote_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
def demote_zero : SDNode<"WebAssemblyISD::DEMOTE_ZERO", demote_t>;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index bc91c6424b63..08ca20b5eef6 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -247,7 +247,8 @@ static void query(const MachineInstr &MI, bool &Read, bool &Write,
// Check for writes to __stack_pointer global.
if ((MI.getOpcode() == WebAssembly::GLOBAL_SET_I32 ||
MI.getOpcode() == WebAssembly::GLOBAL_SET_I64) &&
- strcmp(MI.getOperand(0).getSymbolName(), "__stack_pointer") == 0)
+ MI.getOperand(0).isSymbol() &&
+ !strcmp(MI.getOperand(0).getSymbolName(), "__stack_pointer"))
StackPointer = true;
// Analyze calls.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 08fb7586d215..0eefd3e2b350 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -166,12 +166,6 @@ InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
CostKind);
}
- int ISD = TLI->InstructionOpcodeToISD(Opcode);
- if (ISD != ISD::LOAD) {
- return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
- CostKind);
- }
-
EVT VT = TLI->getValueType(DL, Ty, true);
// Type legalization can't handle structs
if (VT == MVT::Other)
@@ -182,22 +176,121 @@ InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
if (!LT.first.isValid())
return InstructionCost::getInvalid();
- // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads can
- // be lowered to load32_zero and load64_zero respectively. Assume SIMD loads
- // are twice as expensive as scalar.
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
unsigned width = VT.getSizeInBits();
- switch (width) {
- default:
- break;
- case 32:
- case 64:
- case 128:
- return 2;
+ if (ISD == ISD::LOAD) {
+ // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads
+ // can be lowered to load32_zero and load64_zero respectively. Assume SIMD
+ // loads are twice as expensive as scalar.
+ switch (width) {
+ default:
+ break;
+ case 32:
+ case 64:
+ case 128:
+ return 2;
+ }
+ } else if (ISD == ISD::STORE) {
+ // For stores, we can use store lane operations.
+ switch (width) {
+ default:
+ break;
+ case 8:
+ case 16:
+ case 32:
+ case 64:
+ case 128:
+ return 2;
+ }
}
return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, CostKind);
}
+InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+ bool UseMaskForCond, bool UseMaskForGaps) const {
+ assert(Factor >= 2 && "Invalid interleave factor");
+
+ auto *VecTy = cast<VectorType>(Ty);
+ if (!ST->hasSIMD128() || !isa<FixedVectorType>(VecTy)) {
+ return InstructionCost::getInvalid();
+ }
+
+ if (UseMaskForCond || UseMaskForGaps)
+ return BaseT::getInterleavedMemoryOpCost(Opcode, Ty, Factor, Indices,
+ Alignment, AddressSpace, CostKind,
+ UseMaskForCond, UseMaskForGaps);
+
+ constexpr unsigned MaxInterleaveFactor = 4;
+ if (Factor <= MaxInterleaveFactor) {
+ unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
+ // Ensure there are at least two elements, evenly divisible by Factor.
+ if (MinElts < 2 || MinElts % Factor != 0)
+ return InstructionCost::getInvalid();
+
+ unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+ // Ensure the element type is legal.
+ if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
+ return InstructionCost::getInvalid();
+
+ auto *SubVecTy =
+ VectorType::get(VecTy->getElementType(),
+ VecTy->getElementCount().divideCoefficientBy(Factor));
+ InstructionCost MemCost =
+ getMemoryOpCost(Opcode, SubVecTy, Alignment, AddressSpace, CostKind);
+
+ unsigned VecSize = DL.getTypeSizeInBits(SubVecTy);
+ unsigned MaxVecSize = 128;
+ unsigned NumAccesses =
+ std::max<unsigned>(1, (MinElts * ElSize + MaxVecSize - 1) / VecSize);
+
+ // A stride of two is commonly supported via dedicated instructions, so it
+ // should be relatively cheap for all element sizes. A stride of four is
+ // more expensive as it will likely require more shuffles. Using two
+ // simd128 inputs is considered more expensive and we mainly account for
+ // shuffling two inputs (32 bytes), but we do model 4 x v4i32 to enable
+ // arithmetic kernels.
+ static const CostTblEntry ShuffleCostTbl[] = {
+ // One reg.
+ {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8
+ {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8
+ {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8
+ {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16
+ {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16
+ {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32
+
+ // Two regs.
+ {2, MVT::v16i8, 2}, // interleave 2 x 16i8 into 32i8
+ {2, MVT::v8i16, 2}, // interleave 2 x 8i16 into 16i16
+ {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32
+
+ // One reg.
+ {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8
+ {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8
+ {4, MVT::v2i16, 4}, // interleave 4 x 2i16 into 8i16
+
+ // Two regs.
+ {4, MVT::v8i8, 16}, // interleave 4 x 8i8 into 32i8
+ {4, MVT::v4i16, 8}, // interleave 4 x 4i16 into 16i16
+ {4, MVT::v2i32, 4}, // interleave 4 x 2i32 into 8i32
+
+ // Four regs.
+ {4, MVT::v4i32, 16}, // interleave 4 x 4i32 into 16i32
+ };
+
+ EVT ETy = TLI->getValueType(DL, SubVecTy);
+ if (const auto *Entry =
+ CostTableLookup(ShuffleCostTbl, Factor, ETy.getSimpleVT()))
+ return Entry->Cost + (NumAccesses * MemCost);
+ }
+
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace, CostKind,
+ UseMaskForCond, UseMaskForGaps);
+}
+
InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
const Value *Op0, const Value *Op1) const {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index c915eeb07d4f..2573066cd5d6 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -82,6 +82,10 @@ public:
TTI::TargetCostKind CostKind,
TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
const Instruction *I = nullptr) const override;
+ InstructionCost getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+ bool UseMaskForCond, bool UseMaskForGaps) const override;
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index d7671ed19589..ce5e92135f70 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -15,10 +15,12 @@
#include "MCTargetDesc/X86TargetStreamer.h"
#include "TargetInfo/X86TargetInfo.h"
#include "X86Operand.h"
+#include "X86RegisterInfo.h"
#include "llvm-c/Visibility.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCContext.h"
@@ -29,6 +31,7 @@
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegister.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
@@ -40,6 +43,7 @@
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <cstdint>
#include <memory>
using namespace llvm;
@@ -1172,7 +1176,7 @@ private:
X86::CondCode ParseConditionCode(StringRef CCode);
- bool ParseIntelMemoryOperandSize(unsigned &Size);
+ bool ParseIntelMemoryOperandSize(unsigned &Size, StringRef *SizeStr);
bool CreateMemForMSInlineAsm(MCRegister SegReg, const MCExpr *Disp,
MCRegister BaseReg, MCRegister IndexReg,
unsigned Scale, bool NonAbsMem, SMLoc Start,
@@ -2574,7 +2578,8 @@ bool X86AsmParser::ParseMasmOperator(unsigned OpKind, int64_t &Val) {
return false;
}
-bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) {
+bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size,
+ StringRef *SizeStr) {
Size = StringSwitch<unsigned>(getTok().getString())
.Cases("BYTE", "byte", 8)
.Cases("WORD", "word", 16)
@@ -2592,6 +2597,8 @@ bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) {
.Cases("ZMMWORD", "zmmword", 512)
.Default(0);
if (Size) {
+ if (SizeStr)
+ *SizeStr = getTok().getString();
const AsmToken &Tok = Lex(); // Eat operand size (e.g., byte, word).
if (!(Tok.getString() == "PTR" || Tok.getString() == "ptr"))
return Error(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!");
@@ -2600,6 +2607,19 @@ bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) {
return false;
}
+uint16_t RegSizeInBits(const MCRegisterInfo &MRI, MCRegister RegNo) { // Bit width of RegNo by general-purpose register class; MRI is not read in this body.
+ if (X86MCRegisterClasses[X86::GR8RegClassID].contains(RegNo))
+ return 8;
+ if (X86MCRegisterClasses[X86::GR16RegClassID].contains(RegNo))
+ return 16;
+ if (X86MCRegisterClasses[X86::GR32RegClassID].contains(RegNo))
+ return 32;
+ if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo))
+ return 64;
+ // Unknown register size: RegNo is in none of the GR8/GR16/GR32/GR64 classes checked above.
+ return 0;
+}
+
bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
@@ -2607,7 +2627,8 @@ bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) {
// Parse optional Size directive.
unsigned Size;
- if (ParseIntelMemoryOperandSize(Size))
+ StringRef SizeStr;
+ if (ParseIntelMemoryOperandSize(Size, &SizeStr))
return true;
bool PtrInOperand = bool(Size);
@@ -2624,9 +2645,29 @@ bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) {
return Error(Start, "rip can only be used as a base register");
// A Register followed by ':' is considered a segment override
if (Tok.isNot(AsmToken::Colon)) {
- if (PtrInOperand)
- return Error(Start, "expected memory operand after 'ptr', "
- "found register operand instead");
+ if (PtrInOperand) {
+ if (!Parser.isParsingMasm())
+ return Error(Start, "expected memory operand after 'ptr', "
+ "found register operand instead");
+
+ // If we are parsing MASM, we are allowed to cast registers to their own
+ // sizes, but not to other types.
+ uint16_t RegSize =
+ RegSizeInBits(*getContext().getRegisterInfo(), RegNo);
+ if (RegSize == 0)
+ return Error(
+ Start,
+ "cannot cast register '" +
+ StringRef(getContext().getRegisterInfo()->getName(RegNo)) +
+ "'; its size is not easily defined.");
+ if (RegSize != Size)
+ return Error(
+ Start,
+ std::to_string(RegSize) + "-bit register '" +
+ StringRef(getContext().getRegisterInfo()->getName(RegNo)) +
+ "' cannot be used as a " + std::to_string(Size) + "-bit " +
+ SizeStr.upper());
+ }
Operands.push_back(X86Operand::CreateReg(RegNo, Start, End));
return false;
}
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 56a4cc3d65c2..865fc0ce8101 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -485,7 +485,16 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
if (!CanPadInst)
return;
- if (PendingBA && PendingBA->getNext() == OS.getCurrentFragment()) {
+ if (PendingBA) {
+ auto *NextFragment = PendingBA->getNext();
+ assert(NextFragment && "NextFragment should not be null");
+ if (NextFragment == OS.getCurrentFragment())
+ return;
+ // We eagerly create an empty fragment when inserting a fragment
+ // with a variable-size tail.
+ if (NextFragment->getNext() == OS.getCurrentFragment())
+ return;
+
// Macro fusion actually happens and there is no other fragment inserted
// after the previous instruction.
//
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
index 547745fdba9d..76731437931a 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
@@ -1668,6 +1668,13 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DestName = getRegName(MI->getOperand(0).getReg());
break;
+ case X86::VMOVSHZrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DecodeScalarMoveMask(8, false, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
case X86::MOVPQI2QIrr:
case X86::MOVZPQILo2PQIrr:
case X86::VMOVPQI2QIrr:
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index a15930c1433f..cfe5b1094811 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -1047,9 +1047,6 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
Prefix.setL(TSFlags & X86II::VEX_L);
Prefix.setL2(TSFlags & X86II::EVEX_L2);
- if ((TSFlags & X86II::EVEX_L2) && STI.hasFeature(X86::FeatureAVX512) &&
- !STI.hasFeature(X86::FeatureEVEX512))
- report_fatal_error("ZMM registers are not supported without EVEX512");
switch (TSFlags & X86II::OpPrefixMask) {
case X86II::PD:
Prefix.setPP(0x1); // 66
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index cc7bcd678cb3..bb1e716c33ed 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -397,18 +397,6 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT,
if (CPU.empty())
CPU = "generic";
- size_t posNoEVEX512 = FS.rfind("-evex512");
- // Make sure we won't be cheated by "-avx512fp16".
- size_t posNoAVX512F =
- FS.ends_with("-avx512f") ? FS.size() - 8 : FS.rfind("-avx512f,");
- size_t posEVEX512 = FS.rfind("+evex512");
- size_t posAVX512F = FS.rfind("+avx512"); // Any AVX512XXX will enable AVX512F.
-
- if (posAVX512F != StringRef::npos &&
- (posNoAVX512F == StringRef::npos || posNoAVX512F < posAVX512F))
- if (posEVEX512 == StringRef::npos && posNoEVEX512 == StringRef::npos)
- ArchFS += ",+evex512";
-
return createX86MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, ArchFS);
}
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 9cfe081b8710..7c9e821c02fd 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -113,6 +113,7 @@ def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
"Support 16-bit floating point conversion instructions",
[FeatureAVX]>;
+// Deprecated feature. Keep it here to suppress warnings in old IRs.
def FeatureEVEX512 : SubtargetFeature<"evex512", "HasEVEX512", "true",
"Support ZMM and 64-bit mask instructions">;
def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512",
@@ -329,20 +330,22 @@ def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true",
"Support movdiri instruction (direct store integer)">;
def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true",
"Support movdir64b instruction (direct store 64 bytes)">;
-def FeatureAVX10_1 : SubtargetFeature<"avx10.1-256", "HasAVX10_1", "true",
- "Support AVX10.1 up to 256-bit instruction",
+def FeatureAVX10_1 : SubtargetFeature<"avx10.1", "HasAVX10_1", "true",
+ "Support AVX10.1 instruction",
[FeatureCDI, FeatureVBMI, FeatureIFMA, FeatureVNNI,
FeatureBF16, FeatureVPOPCNTDQ, FeatureVBMI2, FeatureBITALG,
FeatureFP16, FeatureVLX, FeatureDQI]>;
+// Deprecated feature. Keep it here to suppress warnings in old IRs.
def FeatureAVX10_1_512 : SubtargetFeature<"avx10.1-512", "HasAVX10_1_512", "true",
- "Support AVX10.1 up to 512-bit instruction",
- [FeatureAVX10_1, FeatureEVEX512]>;
-def FeatureAVX10_2 : SubtargetFeature<"avx10.2-256", "HasAVX10_2", "true",
- "Support AVX10.2 up to 256-bit instruction",
+ "Support AVX10.1 instruction",
+ [FeatureAVX10_1]>;
+def FeatureAVX10_2 : SubtargetFeature<"avx10.2", "HasAVX10_2", "true",
+ "Support AVX10.2 instruction",
[FeatureAVX10_1]>;
+// Deprecated feature. Keep it here to suppress warnings in old IRs.
def FeatureAVX10_2_512 : SubtargetFeature<"avx10.2-512", "HasAVX10_2_512", "true",
- "Support AVX10.2 up to 512-bit instruction",
- [FeatureAVX10_2, FeatureAVX10_1_512]>;
+ "Support AVX10.2 instruction",
+ [FeatureAVX10_2]>;
def FeatureEGPR : SubtargetFeature<"egpr", "HasEGPR", "true",
"Support extended general purpose register">;
def FeaturePush2Pop2 : SubtargetFeature<"push2pop2", "HasPush2Pop2", "true",
@@ -871,7 +874,6 @@ def ProcessorFeatures {
];
list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [
- FeatureEVEX512,
FeatureBWI,
FeatureCDI,
FeatureDQI,
@@ -996,7 +998,6 @@ def ProcessorFeatures {
FeatureXSAVES,
FeatureCLFLUSHOPT,
FeatureAVX512,
- FeatureEVEX512,
FeatureCDI,
FeatureDQI,
FeatureBWI,
@@ -1039,7 +1040,6 @@ def ProcessorFeatures {
// Cannonlake
list<SubtargetFeature> CNLAdditionalFeatures = [FeatureAVX512,
- FeatureEVEX512,
FeatureCDI,
FeatureDQI,
FeatureBWI,
@@ -1155,7 +1155,7 @@ def ProcessorFeatures {
!listconcat(GNRFeatures, GNRDAdditionalFeatures);
// Diamond Rapids
- list<SubtargetFeature> DMRAdditionalFeatures = [FeatureAVX10_2_512,
+ list<SubtargetFeature> DMRAdditionalFeatures = [FeatureAVX10_2,
FeatureSM4,
FeatureCMPCCXADD,
FeatureAVXIFMA,
@@ -1368,7 +1368,6 @@ def ProcessorFeatures {
FeatureF16C,
FeatureFSGSBase,
FeatureAVX512,
- FeatureEVEX512,
FeatureCDI,
FeatureADX,
FeatureRDSEED,
@@ -1586,7 +1585,6 @@ def ProcessorFeatures {
list<SubtargetFeature> ZN4Tuning =
!listconcat(ZN3Tuning, ZN4AdditionalTuning);
list<SubtargetFeature> ZN4AdditionalFeatures = [FeatureAVX512,
- FeatureEVEX512,
FeatureCDI,
FeatureDQI,
FeatureBWI,
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp
index d406277e440b..ff22ee8c86fa 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -476,7 +476,8 @@ static bool isIndirectBranchOrTailCall(const MachineInstr &MI) {
return MI.getDesc().isIndirectBranch() /*Make below code in a good shape*/ ||
Opc == X86::TAILJMPr || Opc == X86::TAILJMPm ||
Opc == X86::TAILJMPr64 || Opc == X86::TAILJMPm64 ||
- Opc == X86::TCRETURNri || Opc == X86::TCRETURNmi ||
+ Opc == X86::TCRETURNri || Opc == X86::TCRETURN_WIN64ri ||
+ Opc == X86::TCRETURN_HIPE32ri || Opc == X86::TCRETURNmi ||
Opc == X86::TCRETURNri64 || Opc == X86::TCRETURNmi64 ||
Opc == X86::TCRETURNri64_ImpCall || Opc == X86::TAILJMPr64_REX ||
Opc == X86::TAILJMPm64_REX;
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 0e6b4dffec3a..9457e718de69 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -269,6 +269,8 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case X86::TCRETURNdi:
case X86::TCRETURNdicc:
case X86::TCRETURNri:
+ case X86::TCRETURN_WIN64ri:
+ case X86::TCRETURN_HIPE32ri:
case X86::TCRETURNmi:
case X86::TCRETURNdi64:
case X86::TCRETURNdi64cc:
@@ -346,8 +348,9 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
for (unsigned i = 0; i != X86::AddrNumOperands; ++i)
MIB.add(MBBI->getOperand(i));
- } else if ((Opcode == X86::TCRETURNri64) ||
- (Opcode == X86::TCRETURNri64_ImpCall)) {
+ } else if (Opcode == X86::TCRETURNri64 ||
+ Opcode == X86::TCRETURNri64_ImpCall ||
+ Opcode == X86::TCRETURN_WIN64ri) {
JumpTarget.setIsKill();
BuildMI(MBB, MBBI, DL,
TII->get(IsX64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
index d3c239250943..787b71d425cb 100644
--- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
@@ -564,8 +564,17 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
MachineBasicBlock::iterator I;
if (LastShapeMI && dominates(MBB, MI, LastShapeMI))
I = ++LastShapeMI->getIterator();
- else
- I = ++MI.getIterator();
+ else {
+ // A call can overwrite registers like rax, so ensure the tile config
+ // instruction is sunk closer to the first instruction that uses a tile.
+ auto UseIt = MI.getIterator();
+ while (UseIt != MBB.end()) {
+ if (HasTileOperand(MRI, *UseIt))
+ break;
+ ++UseIt;
+ }
+ I = UseIt;
+ }
Config(*I);
HasUnconfigTile = false;
continue;
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index cba7843d53e3..a293b4c87cfe 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -2398,7 +2398,8 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
}
static bool isTailCallOpcode(unsigned Opc) {
- return Opc == X86::TCRETURNri || Opc == X86::TCRETURNdi ||
+ return Opc == X86::TCRETURNri || Opc == X86::TCRETURN_WIN64ri ||
+ Opc == X86::TCRETURN_HIPE32ri || Opc == X86::TCRETURNdi ||
Opc == X86::TCRETURNmi || Opc == X86::TCRETURNri64 ||
Opc == X86::TCRETURNri64_ImpCall || Opc == X86::TCRETURNdi64 ||
Opc == X86::TCRETURNmi64;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 19131fbd4102..3631016b0f5c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -326,15 +326,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasAVX10_2()) {
setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v2i32, Custom);
setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v2i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v8i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v8i64, Legal);
for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
MVT::v4i64}) {
setOperationAction(ISD::FP_TO_UINT_SAT, VT, Legal);
setOperationAction(ISD::FP_TO_SINT_SAT, VT, Legal);
}
- if (Subtarget.hasAVX10_2_512()) {
- setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v8i64, Legal);
- setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v8i64, Legal);
- }
if (Subtarget.is64Bit()) {
setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Legal);
setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Legal);
@@ -2457,6 +2455,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
+ setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
+ setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
+ setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
+ setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
+ setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
+ setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
+ setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
+ setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom);
for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
setOperationAction(ISD::FADD, VT, Legal);
setOperationAction(ISD::FSUB, VT, Legal);
@@ -2470,19 +2479,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
}
- if (Subtarget.hasAVX10_2_512()) {
- setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
- setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
- setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
- setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
- setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
- setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
- setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
- setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
- setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
- setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom);
- setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom);
- }
for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
setCondCodeAction(ISD::SETOEQ, VT, Custom);
setCondCodeAction(ISD::SETUNE, VT, Custom);
@@ -21252,7 +21248,7 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
// the truncation then we can use PACKSS by converting the srl to a sra.
// SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
if (In.getOpcode() == ISD::SRL && In->hasOneUse())
- if (std::optional<uint64_t> ShAmt = DAG.getValidShiftAmount(In)) {
+ if (std::optional<unsigned> ShAmt = DAG.getValidShiftAmount(In)) {
if (*ShAmt == MinSignBits) {
PackOpcode = X86ISD::PACKSS;
return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
@@ -26269,10 +26265,9 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
-
- if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
- if (MaskConst->getZExtValue() & 0x1)
- return Op;
+ auto *MaskConst = dyn_cast<ConstantSDNode>(Mask);
+ if (MaskConst && (MaskConst->getZExtValue() & 0x1))
+ return Op;
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
@@ -26288,6 +26283,17 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
if (PreservedSrc.isUndef())
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+
+ if (MaskConst) {
+ assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask");
+ // Discard op and blend passthrough with scalar op src/dst.
+ SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
+ std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
+ ShuffleMask[0] = VT.getVectorNumElements();
+ return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc,
+ ShuffleMask);
+ }
+
return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
}
@@ -31404,9 +31410,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
return R;
// AVX512 implicitly uses modulo rotation amounts.
- if ((Subtarget.hasVLX() ||
- (Subtarget.hasAVX512() && Subtarget.hasEVEX512())) &&
- 32 <= EltSizeInBits) {
+ if ((Subtarget.hasVLX() || Subtarget.hasAVX512()) && 32 <= EltSizeInBits) {
// Attempt to rotate by immediate.
if (IsCstSplat) {
unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
@@ -38676,13 +38680,11 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
if (Opc == X86ISD::VSHLI) {
- Known.Zero <<= ShAmt;
- Known.One <<= ShAmt;
+ Known <<= ShAmt;
// Low bits are known zero.
Known.Zero.setLowBits(ShAmt);
} else if (Opc == X86ISD::VSRLI) {
- Known.Zero.lshrInPlace(ShAmt);
- Known.One.lshrInPlace(ShAmt);
+ Known >>= ShAmt;
// High bits are known zero.
Known.Zero.setHighBits(ShAmt);
} else {
@@ -44518,8 +44520,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
TLO, Depth + 1))
return true;
- Known.Zero <<= ShAmt;
- Known.One <<= ShAmt;
+ Known <<= ShAmt;
// Low bits known zero.
Known.Zero.setLowBits(ShAmt);
@@ -44549,8 +44550,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
TLO, Depth + 1))
return true;
- Known.Zero.lshrInPlace(ShAmt);
- Known.One.lshrInPlace(ShAmt);
+ Known >>= ShAmt;
// High bits known zero.
Known.Zero.setHighBits(ShAmt);
@@ -44598,8 +44598,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
TLO, Depth + 1))
return true;
- Known.Zero.lshrInPlace(ShAmt);
- Known.One.lshrInPlace(ShAmt);
+ Known >>= ShAmt;
// If the input sign bit is known to be zero, or if none of the top bits
// are demanded, turn this into an unsigned shift right.
@@ -44957,6 +44956,44 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
Known.Zero.setLowBits(Known2.countMinTrailingZeros());
return false;
}
+ case X86ISD::VPMADD52L:
+ case X86ISD::VPMADD52H: {
+ KnownBits KnownOp0, KnownOp1, KnownOp2;
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op2 = Op.getOperand(2);
+ // Only demand the lower 52-bits of operands 0 / 1 (and all 64-bits of
+ // operand 2).
+ APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52);
+ if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0,
+ TLO, Depth + 1))
+ return true;
+
+ if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1,
+ TLO, Depth + 1))
+ return true;
+
+ if (SimplifyDemandedBits(Op2, APInt::getAllOnes(64), OriginalDemandedElts,
+ KnownOp2, TLO, Depth + 1))
+ return true;
+
+ KnownBits KnownMul;
+ KnownOp0 = KnownOp0.trunc(52);
+ KnownOp1 = KnownOp1.trunc(52);
+ KnownMul = Opc == X86ISD::VPMADD52L ? KnownBits::mul(KnownOp0, KnownOp1)
+ : KnownBits::mulhu(KnownOp0, KnownOp1);
+ KnownMul = KnownMul.zext(64);
+
+ // lo/hi(X * Y) + Z --> C + Z
+ if (KnownMul.isConstant()) {
+ SDLoc DL(Op);
+ SDValue C = TLO.DAG.getConstant(KnownMul.getConstant(), DL, VT);
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ADD, DL, VT, C, Op2));
+ }
+
+ Known = KnownBits::add(KnownMul, KnownOp2);
+ return false;
+ }
}
return TargetLowering::SimplifyDemandedBitsForTargetNode(
@@ -45132,6 +45169,14 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
switch (Op.getOpcode()) {
+ // SSE bit logic.
+ case X86ISD::FAND:
+ case X86ISD::FOR:
+ case X86ISD::FXOR:
+ case X86ISD::FANDN:
+ case X86ISD::ANDNP:
+ case X86ISD::VPTERNLOG:
+ return false;
// SSE vector insert/extracts use modulo indices.
case X86ISD::PINSRB:
case X86ISD::PINSRW:
@@ -45167,6 +45212,11 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
// SSE signbit extraction.
case X86ISD::MOVMSK:
return false;
+ // GFNI instructions.
+ case X86ISD::GF2P8AFFINEINVQB:
+ case X86ISD::GF2P8AFFINEQB:
+ case X86ISD::GF2P8MULB:
+ return false;
case ISD::INTRINSIC_WO_CHAIN:
switch (Op->getConstantOperandVal(0)) {
case Intrinsic::x86_sse2_pmadd_wd:
@@ -48349,7 +48399,7 @@ static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC,
// If Src came from a SHL (probably from an expanded SIGN_EXTEND_INREG), then
// peek through and adjust the TEST bit.
if (Src.getOpcode() == ISD::SHL) {
- if (std::optional<uint64_t> ShiftAmt = DAG.getValidShiftAmount(Src)) {
+ if (std::optional<unsigned> ShiftAmt = DAG.getValidShiftAmount(Src)) {
Src = Src.getOperand(0);
BitMask.lshrInPlace(*ShiftAmt);
}
@@ -50886,10 +50936,12 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
// Given a target type \p VT, we generate
// or (and x, y), (xor z, zext(build_vector (constants)))
// given x, y and z are of type \p VT. We can do so, if operands are either
-// truncates from VT types, the second operand is a vector of constants or can
-// be recursively promoted.
+// truncates from VT types, the second operand is a vector of constants, can
+// be recursively promoted or is an existing extension we can extend further.
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
- SelectionDAG &DAG, unsigned Depth) {
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ unsigned Depth) {
// Limit recursion to avoid excessive compile times.
if (Depth >= SelectionDAG::MaxRecursionDepth)
return SDValue();
@@ -50904,28 +50956,32 @@ static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
return SDValue();
- if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1))
+ if (SDValue NN0 =
+ PromoteMaskArithmetic(N0, DL, VT, DAG, Subtarget, Depth + 1))
N0 = NN0;
else {
- // The left side has to be a trunc.
- if (N0.getOpcode() != ISD::TRUNCATE)
- return SDValue();
-
- // The type of the truncated inputs.
- if (N0.getOperand(0).getValueType() != VT)
+ // The left side has to be a 'trunc'.
+ bool LHSTrunc = N0.getOpcode() == ISD::TRUNCATE &&
+ N0.getOperand(0).getValueType() == VT;
+ if (LHSTrunc)
+ N0 = N0.getOperand(0);
+ else
return SDValue();
-
- N0 = N0.getOperand(0);
}
- if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1))
+ if (SDValue NN1 =
+ PromoteMaskArithmetic(N1, DL, VT, DAG, Subtarget, Depth + 1))
N1 = NN1;
else {
- // The right side has to be a 'trunc' or a (foldable) constant.
+ // The right side has to be a 'trunc', a (foldable) constant or an
+ // existing extension we can extend further.
bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
N1.getOperand(0).getValueType() == VT;
if (RHSTrunc)
N1 = N1.getOperand(0);
+ else if (ISD::isExtVecInRegOpcode(N1.getOpcode()) && VT.is256BitVector() &&
+ Subtarget.hasInt256() && N1.hasOneUse())
+ N1 = DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0));
else if (SDValue Cst =
DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
N1 = Cst;
@@ -50955,7 +51011,7 @@ static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
EVT NarrowVT = Narrow.getValueType();
// Generate the wide operation.
- SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, 0);
+ SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, Subtarget, 0);
if (!Op)
return SDValue();
switch (N.getOpcode()) {
@@ -51804,6 +51860,8 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
SDValue X, Y;
EVT CondVT = VT.changeVectorElementType(MVT::i1);
if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
+ (VT.is512BitVector() || Subtarget.hasVLX()) &&
+ (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
sd_match(N, m_And(m_Value(X),
m_OneUse(m_SExt(m_AllOf(
m_Value(Y), m_SpecificVT(CondVT),
@@ -54135,10 +54193,10 @@ static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG,
const SDLoc &DL) {
assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode");
- std::optional<uint64_t> ValidSrlConst = DAG.getValidShiftAmount(N);
+ std::optional<unsigned> ValidSrlConst = DAG.getValidShiftAmount(N);
if (!ValidSrlConst)
return SDValue();
- uint64_t SrlConstVal = *ValidSrlConst;
+ unsigned SrlConstVal = *ValidSrlConst;
SDValue Op = N.getOperand(0);
unsigned Opcode = Op.getOpcode();
@@ -55368,6 +55426,8 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
SDValue Src = N0.getOperand(0);
EVT SrcVT = Src.getValueType();
if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
+ (VT.is512BitVector() || Subtarget.hasVLX()) &&
+ (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
getZeroVector(VT, Subtarget, DAG, DL));
@@ -56247,7 +56307,13 @@ static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
SDValue Masked = BroadcastOp;
if (N != 0) {
- APInt Mask = APInt::getLowBitsSet(BroadcastOpVT.getSizeInBits(), Len);
+ unsigned BroadcastOpBitWidth = BroadcastOpVT.getSizeInBits();
+ unsigned NumDefinedElts = UndefElts.countTrailingZeros();
+
+ if (NumDefinedElts > BroadcastOpBitWidth)
+ return SDValue();
+
+ APInt Mask = APInt::getLowBitsSet(BroadcastOpBitWidth, NumDefinedElts);
SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp,
DAG.getConstant(N, DL, BroadcastOpVT));
Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue,
@@ -57904,6 +57970,51 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL,
Cmov.getOperand(3));
}
+// Attempt to turn ADD(MUL(x, y), acc) -> VPMADD52L(x, y, acc) on vectors of 64-bit elements.
+// Only fires when the upper 12 bits of x, y and MUL(x, y) are known to be 0; otherwise returns an empty SDValue.
+static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
+ EVT VT, const X86Subtarget &Subtarget) {
+ using namespace SDPatternMatch;
+ if (!VT.isVector() || VT.getScalarSizeInBits() != 64 ||
+ (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()))
+ return SDValue();
+
+ // Without AVX-IFMA, sub-512-bit vectors (XMM/YMM) need the AVX-512VL vector length extensions.
+ if (!Subtarget.hasAVXIFMA() && !Subtarget.hasVLX() &&
+ VT.getSizeInBits() < 512)
+ return SDValue();
+
+ const auto TotalSize = VT.getSizeInBits();
+ if (TotalSize < 128 || !isPowerOf2_64(TotalSize)) // reject sub-128-bit and non-power-of-2 total widths
+ return SDValue();
+
+ SDValue X, Y, Acc;
+ if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc))))
+ return SDValue();
+
+ KnownBits KnownX = DAG.computeKnownBits(X);
+ if (KnownX.countMinLeadingZeros() < 12) // x must have at least 12 known-zero top bits
+ return SDValue();
+ KnownBits KnownY = DAG.computeKnownBits(Y);
+ if (KnownY.countMinLeadingZeros() < 12) // same requirement for y
+ return SDValue();
+ KnownBits KnownMul = KnownBits::mul(KnownX, KnownY); // known bits of the product x * y
+ if (KnownMul.countMinLeadingZeros() < 12) // the product itself must also have 12 known-zero top bits
+ return SDValue();
+
+ auto VPMADD52Builder = [](SelectionDAG &G, SDLoc DL, // SubOps arrive in the order {Acc, X, Y}
+ ArrayRef<SDValue> SubOps) {
+ EVT SubVT = SubOps[0].getValueType();
+ assert(SubVT.getScalarSizeInBits() == 64 &&
+ "Unexpected element size, only supports 64bit size");
+ return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[1] /*X*/,
+ SubOps[2] /*Y*/, SubOps[0] /*Acc*/);
+ };
+
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder, // presumably splits the operands to legal widths and invokes the builder per piece — TODO confirm
+ /*CheckBWI*/ false);
+}
+
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -58007,6 +58118,9 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
Op0.getOperand(0), Op0.getOperand(2));
}
+ if (SDValue IFMA52 = matchVPMADD52(N, DAG, DL, VT, Subtarget))
+ return IFMA52;
+
return combineAddOrSubToADCOrSBB(N, DL, DAG);
}
@@ -60068,6 +60182,19 @@ static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Simplify VPMADD52L/VPMADD52H operations by running SimplifyDemandedBits on the node's result.
+static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ MVT VT = N->getSimpleValueType(0);
+ unsigned NumEltBits = VT.getScalarSizeInBits();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits), // every bit of each result element is demanded
+ DCI))
+ return SDValue(N, 0); // simplification happened: hand N back to the combiner
+
+ return SDValue(); // nothing changed
+}
+
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -60705,6 +60832,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
case X86ISD::VPMADDUBSW:
case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
+ case X86ISD::VPMADD52L:
+ case X86ISD::VPMADD52H: return combineVPMADD52LH(N, DAG, DCI);
case X86ISD::KSHIFTL:
case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
@@ -60932,117 +61061,6 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
// X86 Inline Assembly Support
//===----------------------------------------------------------------------===//
-// Helper to match a string separated by whitespace.
-static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
- S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
-
- for (StringRef Piece : Pieces) {
- if (!S.starts_with(Piece)) // Check if the piece matches.
- return false;
-
- S = S.substr(Piece.size());
- StringRef::size_type Pos = S.find_first_not_of(" \t");
- if (Pos == 0) // We matched a prefix.
- return false;
-
- S = S.substr(Pos);
- }
-
- return S.empty();
-}
-
-static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
-
- if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
- if (llvm::is_contained(AsmPieces, "~{cc}") &&
- llvm::is_contained(AsmPieces, "~{flags}") &&
- llvm::is_contained(AsmPieces, "~{fpsr}")) {
-
- if (AsmPieces.size() == 3)
- return true;
- else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
- return true;
- }
- }
- return false;
-}
-
-bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
- InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
-
- StringRef AsmStr = IA->getAsmString();
-
- IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
- if (!Ty || Ty->getBitWidth() % 16 != 0)
- return false;
-
- // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
- SmallVector<StringRef, 4> AsmPieces;
- SplitString(AsmStr, AsmPieces, ";\n");
-
- switch (AsmPieces.size()) {
- default: return false;
- case 1:
- // FIXME: this should verify that we are targeting a 486 or better. If not,
- // we will turn this bswap into something that will be lowered to logical
- // ops instead of emitting the bswap asm. For now, we don't support 486 or
- // lower so don't worry about this.
- // bswap $0
- if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
- matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
- matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
- matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
- matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
- matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
- // No need to check constraints, nothing other than the equivalent of
- // "=r,0" would be valid here.
- return IntrinsicLowering::LowerToByteSwap(CI);
- }
-
- // rorw $$8, ${0:w} --> llvm.bswap.i16
- if (CI->getType()->isIntegerTy(16) &&
- IA->getConstraintString().starts_with("=r,0,") &&
- (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
- matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
- AsmPieces.clear();
- StringRef ConstraintsStr = IA->getConstraintString();
- SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
- array_pod_sort(AsmPieces.begin(), AsmPieces.end());
- if (clobbersFlagRegisters(AsmPieces))
- return IntrinsicLowering::LowerToByteSwap(CI);
- }
- break;
- case 3:
- if (CI->getType()->isIntegerTy(32) &&
- IA->getConstraintString().starts_with("=r,0,") &&
- matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
- matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
- matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
- AsmPieces.clear();
- StringRef ConstraintsStr = IA->getConstraintString();
- SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
- array_pod_sort(AsmPieces.begin(), AsmPieces.end());
- if (clobbersFlagRegisters(AsmPieces))
- return IntrinsicLowering::LowerToByteSwap(CI);
- }
-
- if (CI->getType()->isIntegerTy(64)) {
- InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
- if (Constraints.size() >= 2 &&
- Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
- Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
- // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
- if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
- matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
- matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
- return IntrinsicLowering::LowerToByteSwap(CI);
- }
- }
- break;
- }
- return false;
-}
-
static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
.Case("{@cca}", X86::COND_A)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 97d3b6e2420d..0c9ba591b03e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1364,8 +1364,6 @@ namespace llvm {
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
- bool ExpandInlineAsm(CallInst *CI) const override;
-
ConstraintType getConstraintType(StringRef Constraint) const override;
/// Examine constraint string and operand type and determine a weight value.
@@ -1668,8 +1666,8 @@ namespace llvm {
/// Lower interleaved store(s) into target specific
/// instructions/intrinsics.
bool lowerInterleavedStore(Instruction *Store, Value *Mask,
- ShuffleVectorInst *SVI,
- unsigned Factor) const override;
+ ShuffleVectorInst *SVI, unsigned Factor,
+ const APInt &GapMask) const override;
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
int JTI, SelectionDAG &DAG) const override;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 1c745a338a61..3bc46af4d130 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -302,7 +302,7 @@ EVT X86TargetLowering::getOptimalMemOpType(
if (Op.size() >= 16 &&
(!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
// FIXME: Check if unaligned 64-byte accesses are slow.
- if (Op.size() >= 64 && Subtarget.hasAVX512() && Subtarget.hasEVEX512() &&
+ if (Op.size() >= 64 && Subtarget.hasAVX512() &&
(Subtarget.getPreferVectorWidth() >= 512)) {
return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
}
@@ -416,7 +416,7 @@ bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
return true;
return false;
case 512:
- if (Subtarget.hasAVX512() && Subtarget.hasEVEX512())
+ if (Subtarget.hasAVX512())
return true;
return false;
default:
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index 1beaaafb159e..69a5115201ef 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -550,7 +550,7 @@ let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in {
} // HasAMXMOVRS, In64BitMode
multiclass m_tcvtrowd2ps {
- let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
+ let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in {
let SchedRW = [WriteSystem] in {
def rri : Ii8<0x7, MRMSrcReg, (outs VR512:$dst),
(ins TILE:$src1, i32u8imm:$src2),
@@ -561,12 +561,12 @@ multiclass m_tcvtrowd2ps {
"tcvtrowd2ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[]>, T8,XS, EVEX, VVVV, EVEX_V512;
}
- } // HasAMXAVX512, HasAVX10_2_512, In64BitMode
+ } // HasAMXAVX512, HasAVX10_2, In64BitMode
}
defm TCVTROWD2PS : m_tcvtrowd2ps;
-let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
+let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in {
let SchedRW = [WriteSystem] in {
let usesCustomInserter = 1 in {
def PTCVTROWD2PSrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2),
@@ -630,7 +630,7 @@ let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
multiclass AMXAVX512_BASE<bits<8> Opcode1, bits<8> Opcode2, string Opstr,
Prefix P1, Prefix P2> {
- let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode], SchedRW = [WriteSystem] in {
+ let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode], SchedRW = [WriteSystem] in {
let OpPrefix = P1 in
def rre : I<Opcode1, MRMSrcReg4VOp3, (outs VR512:$dst),
(ins TILE:$src1, GR32:$src2),
@@ -658,7 +658,7 @@ defm TCVTROWPS2BF16H : AMXAVX512_BASE<0x6d, 0x07, "tcvtrowps2bf16h", XD, XD>;
defm TCVTROWPS2BF16L : AMXAVX512_BASE<0x6d, 0x77, "tcvtrowps2bf16l", XS, XS>;
multiclass m_tilemovrow {
- let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
+ let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in {
let SchedRW = [WriteSystem] in {
def rri : Ii8<0x7, MRMSrcReg, (outs VR512:$dst),
(ins TILE:$src1, u8imm:$src2),
@@ -669,12 +669,12 @@ multiclass m_tilemovrow {
"tilemovrow\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[]>, T8,PD, EVEX, VVVV, EVEX_V512;
}
- } // HasAMXAVX512, HasAVX10_2_512, In64BitMode
+ } // HasAMXAVX512, HasAVX10_2, In64BitMode
}
defm TILEMOVROW : m_tilemovrow;
-let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
+let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in {
let SchedRW = [WriteSystem] in {
let usesCustomInserter = 1 in {
def PTILEMOVROWrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2),
diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td
index 2d2bf1f6c725..764ff998bb56 100644
--- a/llvm/lib/Target/X86/X86InstrAVX10.td
+++ b/llvm/lib/Target/X86/X86InstrAVX10.td
@@ -15,36 +15,36 @@
// VNNI FP16
let ExeDomain = SSEPackedSingle in
defm VDPPHPS : avx512_dpf16ps_sizes<0x52, "vdpphps", X86dpfp16ps, avx512vl_f16_info,
- [HasAVX10_2], [HasAVX10_2_512]>,
+ [HasAVX10_2], [HasAVX10_2]>,
T8, PS, EVEX_CD8<32, CD8VF>;
// VNNI INT8
defm VPDPBSSD : VNNI_common<0x50, "vpdpbssd", X86vpdpbssd, SchedWriteVecIMul, 1,
- [HasAVX10_2], [HasAVX10_2_512]>, XD;
+ [HasAVX10_2], [HasAVX10_2]>, XD;
defm VPDPBSSDS : VNNI_common<0x51, "vpdpbssds", X86vpdpbssds, SchedWriteVecIMul, 1,
- [HasAVX10_2], [HasAVX10_2_512]>, XD;
+ [HasAVX10_2], [HasAVX10_2]>, XD;
defm VPDPBSUD : VNNI_common<0x50, "vpdpbsud", X86vpdpbsud, SchedWriteVecIMul, 0,
- [HasAVX10_2], [HasAVX10_2_512]>, XS;
+ [HasAVX10_2], [HasAVX10_2]>, XS;
defm VPDPBSUDS : VNNI_common<0x51, "vpdpbsuds", X86vpdpbsuds, SchedWriteVecIMul, 0,
- [HasAVX10_2], [HasAVX10_2_512]>, XS;
+ [HasAVX10_2], [HasAVX10_2]>, XS;
defm VPDPBUUD : VNNI_common<0x50, "vpdpbuud", X86vpdpbuud, SchedWriteVecIMul, 1,
- [HasAVX10_2], [HasAVX10_2_512]>, PS;
+ [HasAVX10_2], [HasAVX10_2]>, PS;
defm VPDPBUUDS : VNNI_common<0x51, "vpdpbuuds", X86vpdpbuuds, SchedWriteVecIMul, 1,
- [HasAVX10_2], [HasAVX10_2_512]>, PS;
+ [HasAVX10_2], [HasAVX10_2]>, PS;
// VNNI INT16
defm VPDPWSUD : VNNI_common<0xd2, "vpdpwsud", X86vpdpwsud, SchedWriteVecIMul, 0,
- [HasAVX10_2], [HasAVX10_2_512]>, XS;
+ [HasAVX10_2], [HasAVX10_2]>, XS;
defm VPDPWSUDS : VNNI_common<0xd3, "vpdpwsuds", X86vpdpwsuds, SchedWriteVecIMul, 0,
- [HasAVX10_2], [HasAVX10_2_512]>, XS;
+ [HasAVX10_2], [HasAVX10_2]>, XS;
defm VPDPWUSD : VNNI_common<0xd2, "vpdpwusd", X86vpdpwusd, SchedWriteVecIMul, 0,
- [HasAVX10_2], [HasAVX10_2_512]>, PD;
+ [HasAVX10_2], [HasAVX10_2]>, PD;
defm VPDPWUSDS : VNNI_common<0xd3, "vpdpwusds", X86vpdpwusds, SchedWriteVecIMul, 0,
- [HasAVX10_2], [HasAVX10_2_512]>, PD;
+ [HasAVX10_2], [HasAVX10_2]>, PD;
defm VPDPWUUD : VNNI_common<0xd2, "vpdpwuud", X86vpdpwuud, SchedWriteVecIMul, 1,
- [HasAVX10_2], [HasAVX10_2_512]>, PS;
+ [HasAVX10_2], [HasAVX10_2]>, PS;
defm VPDPWUUDS : VNNI_common<0xd3, "vpdpwuuds", X86vpdpwuuds, SchedWriteVecIMul, 1,
- [HasAVX10_2], [HasAVX10_2_512]>, PS;
+ [HasAVX10_2], [HasAVX10_2]>, PS;
// VMPSADBW
defm VMPSADBW : avx512_common_3Op_rm_imm8<0x42, X86Vmpsadbw, "vmpsadbw", SchedWritePSADBW,
@@ -94,9 +94,8 @@ multiclass avx10_minmax_packed_sae<string OpStr, AVX512VLVectorVTInfo VTI, SDNod
}
multiclass avx10_minmax_packed<string OpStr, AVX512VLVectorVTInfo VTI, SDNode OpNode> {
- let Predicates = [HasAVX10_2_512] in
- defm Z : avx10_minmax_packed_base<OpStr, VTI.info512, OpNode>, EVEX_V512;
let Predicates = [HasAVX10_2] in {
+ defm Z : avx10_minmax_packed_base<OpStr, VTI.info512, OpNode>, EVEX_V512;
defm Z256 : avx10_minmax_packed_base<OpStr, VTI.info256, OpNode>, EVEX_V256;
defm Z128 : avx10_minmax_packed_base<OpStr, VTI.info128, OpNode>, EVEX_V128;
}
@@ -201,7 +200,7 @@ multiclass avx10_sat_cvt_rmb<bits<8> Opc, string OpStr, X86FoldableSchedWrite sc
multiclass avx10_sat_cvt_rc<bits<8> Opc, string OpStr, X86SchedWriteWidths sched,
AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo,
SDNode MaskNode> {
- let Predicates = [HasAVX10_2_512], Uses = [MXCSR] in
+ let Predicates = [HasAVX10_2], Uses = [MXCSR] in
defm Zrrb : AVX512_maskable<Opc, MRMSrcReg, DestInfo.info512,
(outs DestInfo.info512.RC:$dst),
(ins SrcInfo.info512.RC:$src, AVX512RC:$rc),
@@ -216,7 +215,7 @@ multiclass avx10_sat_cvt_rc<bits<8> Opc, string OpStr, X86SchedWriteWidths sched
multiclass avx10_sat_cvt_sae<bits<8> Opc, string OpStr, X86SchedWriteWidths sched,
AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo,
SDNode Node> {
- let Predicates = [HasAVX10_2_512], Uses = [MXCSR] in
+ let Predicates = [HasAVX10_2], Uses = [MXCSR] in
defm Zrrb : AVX512_maskable<Opc, MRMSrcReg, DestInfo.info512,
(outs DestInfo.info512.RC:$dst),
(ins SrcInfo.info512.RC:$src),
@@ -229,12 +228,11 @@ multiclass avx10_sat_cvt_sae<bits<8> Opc, string OpStr, X86SchedWriteWidths sche
multiclass avx10_sat_cvt_base<bits<8> Opc, string OpStr, X86SchedWriteWidths sched,
SDNode MaskNode, AVX512VLVectorVTInfo DestInfo,
AVX512VLVectorVTInfo SrcInfo> {
- let Predicates = [HasAVX10_2_512] in
- defm Z : avx10_sat_cvt_rmb<Opc, OpStr, sched.ZMM,
- DestInfo.info512, SrcInfo.info512,
- MaskNode>,
- EVEX, EVEX_V512;
let Predicates = [HasAVX10_2] in {
+ defm Z : avx10_sat_cvt_rmb<Opc, OpStr, sched.ZMM,
+ DestInfo.info512, SrcInfo.info512,
+ MaskNode>,
+ EVEX, EVEX_V512;
defm Z256
: avx10_sat_cvt_rmb<Opc, OpStr, sched.YMM,
DestInfo.info256, SrcInfo.info256,
@@ -334,13 +332,11 @@ defm VCVTTPS2IUBS : avx10_sat_cvt_base<0x6a, "vcvttps2iubs", SchedWriteVecIMul,
multiclass avx10_cvttpd2dqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeSAE,
X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in {
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode,
MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info,
OpNodeSAE, sched.ZMM>, EVEX_V512;
- }
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info,
null_frag, null_frag, sched.XMM, "{1to2}", "{x}",
f128mem, VK2WM>, EVEX_V128;
@@ -410,13 +406,11 @@ multiclass avx10_cvttpd2dqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpN
multiclass avx10_cvttpd2qqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd,
X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in {
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode,
MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
- }
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode,
MaskOpNode, sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode,
@@ -432,13 +426,11 @@ multiclass avx10_cvttpd2qqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpN
multiclass avx10_cvttps2qqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd,
X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in {
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode,
MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
- }
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
MaskOpNode, sched.XMM, "{1to2}", "", f64mem, VK2WM,
(v2i64 (OpNode (bc_v4f32 (v2f64
@@ -460,14 +452,11 @@ multiclass avx10_cvttps2qqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpN
multiclass avx10_cvttps2dqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode,
SDNode OpNodeSAE, X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in {
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode,
MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info,
OpNodeSAE, sched.ZMM>, EVEX_V512;
- }
-
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode,
MaskOpNode, sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode,
@@ -719,7 +708,7 @@ multiclass avx10_cvt2ps2ph<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _SrcVTInfo,
AVX512VLVectorVTInfo _DstVTInfo,
SDNode OpNode, SDNode OpNodeRnd> {
- let Predicates = [HasAVX10_2_512], Uses = [MXCSR] in {
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_binop_rm2<opc, OpcodeStr, sched.ZMM, OpNode,
_SrcVTInfo.info512, _DstVTInfo.info512,
_SrcVTInfo.info512>,
@@ -727,8 +716,6 @@ multiclass avx10_cvt2ps2ph<bits<8> opc, string OpcodeStr,
_SrcVTInfo.info512, _DstVTInfo.info512,
OpNodeRnd>,
EVEX_V512, EVEX_CD8<32, CD8VF>;
- }
- let Predicates = [HasAVX10_2] in {
defm Z256 : avx512_binop_rm2<opc, OpcodeStr, sched.YMM, OpNode,
_SrcVTInfo.info256, _DstVTInfo.info256,
_SrcVTInfo.info256>,
@@ -747,19 +734,19 @@ defm VCVT2PS2PHX : avx10_cvt2ps2ph<0x67, "vcvt2ps2phx",
defm VCVT2PH2BF8 : avx512_binop_all<0x74, "vcvt2ph2bf8", SchedWriteCvtPD2PS,
avx512vl_f16_info, avx512vl_i8_info,
- X86vcvt2ph2bf8, [HasAVX10_2_512], [HasAVX10_2]>,
+ X86vcvt2ph2bf8, [HasAVX10_2], [HasAVX10_2]>,
EVEX_CD8<16, CD8VF>, T8, XD;
defm VCVT2PH2BF8S : avx512_binop_all<0x74, "vcvt2ph2bf8s", SchedWriteCvtPD2PS,
avx512vl_f16_info, avx512vl_i8_info,
- X86vcvt2ph2bf8s, [HasAVX10_2_512], [HasAVX10_2]>,
+ X86vcvt2ph2bf8s, [HasAVX10_2], [HasAVX10_2]>,
EVEX_CD8<16, CD8VF>, T_MAP5, XD;
defm VCVT2PH2HF8 : avx512_binop_all<0x18, "vcvt2ph2hf8", SchedWriteCvtPD2PS,
avx512vl_f16_info, avx512vl_i8_info,
- X86vcvt2ph2hf8, [HasAVX10_2_512], [HasAVX10_2]>,
+ X86vcvt2ph2hf8, [HasAVX10_2], [HasAVX10_2]>,
EVEX_CD8<16, CD8VF>, T_MAP5, XD;
defm VCVT2PH2HF8S : avx512_binop_all<0x1b, "vcvt2ph2hf8s", SchedWriteCvtPD2PS,
avx512vl_f16_info, avx512vl_i8_info,
- X86vcvt2ph2hf8s, [HasAVX10_2_512], [HasAVX10_2]>,
+ X86vcvt2ph2hf8s, [HasAVX10_2], [HasAVX10_2]>,
EVEX_CD8<16, CD8VF>, T_MAP5, XD;
//TODO: Merge into avx512_vcvt_fp, difference is one more source register here.
@@ -836,11 +823,10 @@ multiclass avx10_convert_3op<bits<8> OpCode, string OpcodeStr,
PatFrag bcast128 = vt_src.info128.BroadcastLdFrag,
PatFrag loadVT128 = vt_src.info128.LdFrag,
RegisterClass maskRC128 = vt_src.info128.KRCWM> {
- let Predicates = [HasAVX10_2_512] in
+ let Predicates = [HasAVX10_2] in {
defm Z : avx10_convert_3op_packed<OpCode, OpcodeStr, vt_dst.info256,
vt_dst.info512, vt_src.info512, OpNode, OpNode, sched.ZMM>,
EVEX_V512, EVEX_CD8<16, CD8VF>;
- let Predicates = [HasAVX10_2] in {
defm Z256 : avx10_convert_3op_packed<OpCode, OpcodeStr, vt_dst.info128,
vt_dst.info256, vt_src.info256, OpNode, OpNode, sched.YMM>,
EVEX_V256, EVEX_CD8<16, CD8VF>;
@@ -920,25 +906,25 @@ defm VCVTBIASPH2HF8S : avx10_convert_3op<0x1b, "vcvtbiasph2hf8s",
defm VCVTPH2BF8 : avx512_cvt_trunc_ne<0x74, "vcvtph2bf8", avx512vl_i8_info,
avx512vl_f16_info, SchedWriteCvtPD2PS,
X86vcvtph2bf8, X86vmcvtph2bf8,
- [HasAVX10_2], [HasAVX10_2_512]>,
+ [HasAVX10_2], [HasAVX10_2]>,
T8, XS, EVEX_CD8<16, CD8VF>;
defm VCVTPH2BF8S : avx512_cvt_trunc_ne<0x74, "vcvtph2bf8s", avx512vl_i8_info,
avx512vl_f16_info, SchedWriteCvtPD2PS,
X86vcvtph2bf8s, X86vmcvtph2bf8s,
- [HasAVX10_2], [HasAVX10_2_512]>,
+ [HasAVX10_2], [HasAVX10_2]>,
T_MAP5, XS, EVEX_CD8<16, CD8VF>;
defm VCVTPH2HF8 : avx512_cvt_trunc_ne<0x18, "vcvtph2hf8", avx512vl_i8_info,
avx512vl_f16_info, SchedWriteCvtPD2PS,
X86vcvtph2hf8, X86vmcvtph2hf8,
- [HasAVX10_2], [HasAVX10_2_512]>,
+ [HasAVX10_2], [HasAVX10_2]>,
T_MAP5, XS, EVEX_CD8<16, CD8VF>;
defm VCVTPH2HF8S : avx512_cvt_trunc_ne<0x1b, "vcvtph2hf8s", avx512vl_i8_info,
avx512vl_f16_info, SchedWriteCvtPD2PS,
X86vcvtph2hf8s, X86vmcvtph2hf8s,
- [HasAVX10_2], [HasAVX10_2_512]>,
+ [HasAVX10_2], [HasAVX10_2]>,
T_MAP5, XS, EVEX_CD8<16, CD8VF>;
multiclass avx10_convert_2op_nomb_packed<bits<8> opc, string OpcodeStr,
@@ -962,10 +948,9 @@ multiclass avx10_convert_2op_nomb_packed<bits<8> opc, string OpcodeStr,
multiclass avx10_convert_2op_nomb<string OpcodeStr, AVX512VLVectorVTInfo _dest,
AVX512VLVectorVTInfo _src, bits<8> opc, SDNode OpNode> {
- let Predicates = [HasAVX10_2_512] in
+ let Predicates = [HasAVX10_2] in {
defm Z : avx10_convert_2op_nomb_packed<opc, OpcodeStr, _dest.info512, _src.info256,
OpNode, f256mem, WriteCvtPH2PSZ>, EVEX_V512;
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx10_convert_2op_nomb_packed<opc, OpcodeStr, _dest.info128, _src.info128,
OpNode, f64mem, WriteCvtPH2PSZ>, EVEX_V128;
defm Z256 : avx10_convert_2op_nomb_packed<opc, OpcodeStr, _dest.info256, _src.info128,
@@ -985,13 +970,12 @@ defm VCVTHF82PH : avx10_convert_2op_nomb<"vcvthf82ph", avx512vl_f16_info,
multiclass avx10_fp_binop_int_bf16<bits<8> opc, string OpcodeStr,
X86SchedWriteSizes sched,
bit IsCommutable = 0> {
- let Predicates = [HasAVX10_2_512] in
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_fp_packed<opc, OpcodeStr,
!cast<Intrinsic>("int_x86_avx10_"#OpcodeStr#"bf16512"),
!cast<Intrinsic>("int_x86_avx10_"#OpcodeStr#"bf16512"),
v32bf16_info, sched.PH.ZMM, IsCommutable>, EVEX_V512,
T_MAP5, PD, EVEX_CD8<16, CD8VF>;
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_fp_packed<opc, OpcodeStr,
!cast<Intrinsic>("int_x86_avx10_"#OpcodeStr#"bf16128"),
!cast<Intrinsic>("int_x86_avx10_"#OpcodeStr#"bf16128"),
@@ -1009,11 +993,10 @@ multiclass avx10_fp_binop_bf16<bits<8> opc, string OpcodeStr, SDPatternOperator
X86SchedWriteSizes sched,
bit IsCommutable = 0,
SDPatternOperator MaskOpNode = OpNode> {
- let Predicates = [HasAVX10_2_512] in
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode,
v32bf16_info, sched.PH.ZMM, IsCommutable>, EVEX_V512,
T_MAP5, PD, EVEX_CD8<16, CD8VF>;
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode,
v8bf16x_info, sched.PH.XMM, IsCommutable>, EVEX_V128,
T_MAP5, PD, EVEX_CD8<16, CD8VF>;
@@ -1086,9 +1069,8 @@ multiclass avx10_vcmp_common_bf16<X86FoldableSchedWrite sched, X86VectorVTInfo _
}
multiclass avx10_vcmp_bf16<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
- let Predicates = [HasAVX10_2_512] in
- defm Z : avx10_vcmp_common_bf16<sched.ZMM, _.info512>, EVEX_V512;
let Predicates = [HasAVX10_2] in {
+ defm Z : avx10_vcmp_common_bf16<sched.ZMM, _.info512>, EVEX_V512;
defm Z128 : avx10_vcmp_common_bf16<sched.XMM, _.info128>, EVEX_V128;
defm Z256 : avx10_vcmp_common_bf16<sched.YMM, _.info256>, EVEX_V256;
}
@@ -1102,11 +1084,10 @@ defm VCMPBF16 : avx10_vcmp_bf16<SchedWriteFCmp, avx512vl_bf16_info>,
// VSQRTBF16
multiclass avx10_sqrt_packed_bf16<bits<8> opc, string OpcodeStr,
X86SchedWriteSizes sched> {
- let Predicates = [HasAVX10_2_512] in
- defm Z : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "bf16"),
- sched.PH.ZMM, v32bf16_info>,
- EVEX_V512, PD, T_MAP5, EVEX_CD8<16, CD8VF>;
let Predicates = [HasAVX10_2] in {
+ defm Z : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "bf16"),
+ sched.PH.ZMM, v32bf16_info>,
+ EVEX_V512, PD, T_MAP5, EVEX_CD8<16, CD8VF>;
defm Z128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "bf16"),
sched.PH.XMM, v8bf16x_info>,
EVEX_V128, PD, T_MAP5, EVEX_CD8<16, CD8VF>;
@@ -1122,11 +1103,10 @@ defm VSQRTBF16 : avx10_sqrt_packed_bf16<0x51, "vsqrt", SchedWriteFSqrtSizes>;
// VRSQRTBF16, VRCPBF16, VSQRTBF16, VGETEXPBF16
multiclass avx10_fp14_bf16<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in
- defm BF16Z : avx512_fp14_p<opc, !strconcat(OpcodeStr, "bf16"),
- OpNode, sched.ZMM, v32bf16_info>,
- EVEX_V512;
let Predicates = [HasAVX10_2] in {
+ defm BF16Z : avx512_fp14_p<opc, !strconcat(OpcodeStr, "bf16"),
+ OpNode, sched.ZMM, v32bf16_info>,
+ EVEX_V512;
defm BF16Z128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "bf16"),
OpNode, sched.XMM, v8bf16x_info>,
EVEX_V128;
@@ -1146,10 +1126,9 @@ defm VGETEXP : avx10_fp14_bf16<0x42, "vgetexp", X86fgetexp, SchedWriteFRnd>,
// VSCALEFBF16
multiclass avx10_fp_scalef_bf16<bits<8> opc, string OpcodeStr,
X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v32bf16_info>,
EVEX_V512, T_MAP6, PS, EVEX_CD8<16, CD8VF>;
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v8bf16x_info>,
EVEX_V128, EVEX_CD8<16, CD8VF>, T_MAP6, PS;
defm Z256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v16bf16x_info>,
@@ -1164,10 +1143,9 @@ defm VSCALEFBF16 : avx10_fp_scalef_bf16<0x2C, "vscalef", SchedWriteFAdd>;
multiclass avx10_common_unary_fp_packed_imm_bf16<string OpcodeStr,
AVX512VLVectorVTInfo _, bits<8> opc, SDPatternOperator OpNode,
SDPatternOperator MaskOpNode, X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.ZMM, _.info512>, EVEX_V512;
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.XMM, _.info128>, EVEX_V128;
defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode,
@@ -1190,11 +1168,10 @@ defm VGETMANTBF16 : avx10_common_unary_fp_packed_imm_bf16<"vgetmant", avx512vl_b
// VFPCLASSBF16
multiclass avx10_fp_fpclass_bf16<string OpcodeStr, bits<8> opcVec,
X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_vector_fpclass<opcVec, OpcodeStr, sched.ZMM,
avx512vl_bf16_info.info512, "z",
[]<Register>>, EVEX_V512;
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_vector_fpclass<opcVec, OpcodeStr, sched.XMM,
avx512vl_bf16_info.info128, "x",
[]<Register>>, EVEX_V128;
@@ -1211,11 +1188,10 @@ defm VFPCLASSBF16 : avx10_fp_fpclass_bf16<"vfpclass", 0x66, SchedWriteFCmp>,
multiclass avx10_fma3p_213_bf16<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode, SDNode MaskOpNode,
X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.ZMM, v32bf16_info>, EVEX_V512, T_MAP6, PS,
EVEX_CD8<16, CD8VF>;
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.XMM, v8bf16x_info>, EVEX_V128, T_MAP6, PS,
EVEX_CD8<16, CD8VF>;
@@ -1239,11 +1215,10 @@ defm VFNMSUB213BF16 : avx10_fma3p_213_bf16<0xAE, "vfnmsub213bf16", X86any_Fnmsub
multiclass avx10_fma3p_231_bf16<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode, SDNode MaskOpNode,
X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.ZMM, v32bf16_info>, EVEX_V512, T_MAP6, PS,
EVEX_CD8<16, CD8VF>;
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.XMM, v8bf16x_info>, EVEX_V128, T_MAP6, PS,
EVEX_CD8<16, CD8VF>;
@@ -1267,11 +1242,10 @@ defm VFNMSUB231BF16 : avx10_fma3p_231_bf16<0xBE, "vfnmsub231bf16", X86any_Fnmsub
multiclass avx10_fma3p_132_bf16<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode, SDNode MaskOpNode,
X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.ZMM, v32bf16_info>, EVEX_V512, T_MAP6, PS,
EVEX_CD8<16, CD8VF>;
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.XMM, v8bf16x_info>, EVEX_V128, T_MAP6, PS,
EVEX_CD8<16, CD8VF>;
@@ -1440,9 +1414,8 @@ multiclass vmovrs_p<bits<8> opc, string OpStr, X86VectorVTInfo _> {
}
multiclass vmovrs_p_vl<bits<8> opc, string OpStr, AVX512VLVectorVTInfo _Vec> {
- let Predicates = [HasMOVRS, HasAVX10_2_512, In64BitMode] in
- defm Z : vmovrs_p<opc, OpStr, _Vec.info512>, EVEX_V512;
let Predicates = [HasMOVRS, HasAVX10_2, In64BitMode] in {
+ defm Z : vmovrs_p<opc, OpStr, _Vec.info512>, EVEX_V512;
defm Z128 : vmovrs_p<opc, OpStr, _Vec.info128>, EVEX_V128;
defm Z256 : vmovrs_p<opc, OpStr, _Vec.info256>, EVEX_V256;
}
@@ -1464,7 +1437,7 @@ multiclass avx10_sm4_base<string OpStr> {
defm Z128 : SM4_Base<OpStr, VR128X, "128", loadv4i32, i128mem>, EVEX_V128;
defm Z256 : SM4_Base<OpStr, VR256X, "256", loadv8i32, i256mem>, EVEX_V256;
}
- let Predicates = [HasSM4, HasAVX10_2_512] in
+ let Predicates = [HasSM4, HasAVX10_2] in
defm Z : SM4_Base<OpStr, VR512, "512", loadv16i32, i512mem>, EVEX_V512;
}
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 0ab94cca4142..3401f6f04800 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -834,7 +834,7 @@ defm : vextract_for_size_lowering<"VEXTRACTF64X4Z", v32bf16_info, v16bf16x_info,
// A 128-bit extract from bits [255:128] of a 512-bit vector should use a
// smaller extract to enable EVEX->VEX.
-let Predicates = [NoVLX, HasEVEX512] in {
+let Predicates = [NoVLX] in {
def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 2))),
(v2i64 (VEXTRACTI128rri
(v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm)),
@@ -3088,7 +3088,7 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
addr:$src2, (X86cmpm_imm_commute timm:$cc)), Narrow.KRC)>;
}
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPD", v8i32x_info, v16i32_info>;
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUD", v8i32x_info, v16i32_info>;
@@ -3119,7 +3119,7 @@ let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPD", v2f64x_info, v8f64_info>;
}
-let Predicates = [HasBWI, NoVLX, HasEVEX512] in {
+let Predicates = [HasBWI, NoVLX] in {
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPB", v32i8x_info, v64i8_info>;
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUB", v32i8x_info, v64i8_info>;
@@ -3513,7 +3513,7 @@ multiclass mask_move_lowering<string InstrStr, X86VectorVTInfo Narrow,
// Patterns for handling v8i1 selects of 256-bit vectors when VLX isn't
// available. Use a 512-bit operation and extract.
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
defm : mask_move_lowering<"VMOVAPSZ", v4f32x_info, v16f32_info>;
defm : mask_move_lowering<"VMOVDQA32Z", v4i32x_info, v16i32_info>;
defm : mask_move_lowering<"VMOVAPSZ", v8f32x_info, v16f32_info>;
@@ -3525,7 +3525,7 @@ let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
defm : mask_move_lowering<"VMOVDQA64Z", v4i64x_info, v8i64_info>;
}
-let Predicates = [HasBWI, NoVLX, HasEVEX512] in {
+let Predicates = [HasBWI, NoVLX] in {
defm : mask_move_lowering<"VMOVDQU8Z", v16i8x_info, v64i8_info>;
defm : mask_move_lowering<"VMOVDQU8Z", v32i8x_info, v64i8_info>;
@@ -5021,8 +5021,8 @@ defm VPMINUD : avx512_binop_rm_vl_d<0x3B, "vpminud", umin,
defm VPMINUQ : avx512_binop_rm_vl_q<0x3B, "vpminuq", umin,
SchedWriteVecALU, HasAVX512, 1>, T8;
-// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX, HasEVEX512.
-let Predicates = [HasDQI, NoVLX, HasEVEX512] in {
+// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX.
+let Predicates = [HasDQI, NoVLX] in {
def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
(EXTRACT_SUBREG
(VPMULLQZrr
@@ -5078,7 +5078,7 @@ multiclass avx512_min_max_lowering<string Instr, SDNode OpNode> {
sub_xmm)>;
}
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
defm : avx512_min_max_lowering<"VPMAXUQZ", umax>;
defm : avx512_min_max_lowering<"VPMINUQZ", umin>;
defm : avx512_min_max_lowering<"VPMAXSQZ", smax>;
@@ -6055,7 +6055,7 @@ defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl,
SchedWriteVecShift>;
// Use 512bit VPSRA/VPSRAI version to implement v2i64/v4i64 in case NoVLX.
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v4i64 (X86vsra (v4i64 VR256X:$src1), (v2i64 VR128X:$src2))),
(EXTRACT_SUBREG (v8i64
(VPSRAQZrr
@@ -6184,14 +6184,14 @@ defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", X86vsrlv, SchedWriteVarVecS
defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr, SchedWriteVarVecShift>;
defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl, SchedWriteVarVecShift>;
-defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", X86vsrav, [HasAVX512, NoVLX, HasEVEX512]>;
-defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", X86vshlv, [HasBWI, NoVLX, HasEVEX512]>;
-defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", X86vsrav, [HasBWI, NoVLX, HasEVEX512]>;
-defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", X86vsrlv, [HasBWI, NoVLX, HasEVEX512]>;
+defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", X86vsrav, [HasAVX512, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", X86vshlv, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", X86vsrav, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", X86vsrlv, [HasBWI, NoVLX]>;
// Use 512bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
(EXTRACT_SUBREG (v8i64
(VPROLVQZrr
@@ -6242,7 +6242,7 @@ let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
}
// Use 512bit VPROR/VPRORI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v2i64 (rotr (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
(EXTRACT_SUBREG (v8i64
(VPRORVQZrr
@@ -9933,7 +9933,7 @@ defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus,
truncstore_us_vi8, masked_truncstore_us_vi8,
X86vtruncus, X86vmtruncus>;
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
def: Pat<(v8i16 (trunc (v8i32 VR256X:$src))),
(v8i16 (EXTRACT_SUBREG
(v16i16 (VPMOVDWZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
@@ -9944,7 +9944,7 @@ def: Pat<(v4i32 (trunc (v4i64 VR256X:$src))),
VR256X:$src, sub_ymm)))), sub_xmm))>;
}
-let Predicates = [HasBWI, NoVLX, HasEVEX512] in {
+let Predicates = [HasBWI, NoVLX] in {
def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))),
(v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src, sub_ymm))), sub_xmm))>;
@@ -10487,7 +10487,7 @@ multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr,
defm Z128 : convert_vector_to_mask_common<opc, VTInfo.info128, OpcodeStr>,
EVEX_V128;
}
- let Predicates = [prd, NoVLX, HasEVEX512] in {
+ let Predicates = [prd, NoVLX] in {
defm Z256_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info256, NAME>;
defm Z128_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info128, NAME>;
}
@@ -11283,7 +11283,7 @@ defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs,
SchedWriteVecALU>;
// VPABS: Use 512bit version to implement 128/256 bit in case NoVLX.
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v4i64 (abs VR256X:$src)),
(EXTRACT_SUBREG
(VPABSQZrr
@@ -11299,7 +11299,7 @@ let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
// Use 512bit version to implement 128/256 bit.
multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode,
AVX512VLVectorVTInfo _, Predicate prd> {
- let Predicates = [prd, NoVLX, HasEVEX512] in {
+ let Predicates = [prd, NoVLX] in {
def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1))),
(EXTRACT_SUBREG
(!cast<Instruction>(InstrStr # "Zrr")
@@ -11918,7 +11918,7 @@ let Predicates = [HasAVX512] in {
(VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
}
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v16i8 (vnot VR128X:$src)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 927b2c8b22f0..5a0df058b27f 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1326,7 +1326,11 @@ def : Pat<(X86imp_call (i64 tglobaladdr:$dst)),
// Match an X86tcret that uses less than 7 volatile registers.
def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
(TCRETURNri ptr_rc_tailcall:$dst, timm:$off)>,
- Requires<[Not64BitMode, NotUseIndirectThunkCalls]>;
+ Requires<[Not64BitMode, IsNotHiPECCFunc, NotUseIndirectThunkCalls]>;
+
+def : Pat<(X86tcret GR32:$dst, timm:$off),
+ (TCRETURN_HIPE32ri GR32:$dst, timm:$off)>,
+ Requires<[Not64BitMode, IsHiPECCFunc, NotUseIndirectThunkCalls]>;
// FIXME: This is disabled for 32-bit PIC mode because the global base
// register which is part of the address mode may be assigned a
@@ -1346,7 +1350,11 @@ def : Pat<(X86tcret (i32 texternalsym:$dst), timm:$off),
def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
(TCRETURNri64 ptr_rc_tailcall:$dst, timm:$off)>,
- Requires<[In64BitMode, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>;
+ Requires<[In64BitMode, IsNotWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>;
+
+def : Pat<(X86tcret GR64_TCW64:$dst, timm:$off),
+ (TCRETURN_WIN64ri GR64_TCW64:$dst, timm:$off)>,
+ Requires<[IsWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>;
def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
(TCRETURNri64_ImpCall ptr_rc_tailcall:$dst, timm:$off)>,
diff --git a/llvm/lib/Target/X86/X86InstrControl.td b/llvm/lib/Target/X86/X86InstrControl.td
index 22253bf0413a..139aedd473eb 100644
--- a/llvm/lib/Target/X86/X86InstrControl.td
+++ b/llvm/lib/Target/X86/X86InstrControl.td
@@ -282,6 +282,10 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
[]>, Sched<[WriteJump]>;
def TCRETURNri : PseudoI<(outs), (ins ptr_rc_tailcall:$dst, i32imm:$offset),
[]>, Sched<[WriteJump]>;
+
+ def TCRETURN_HIPE32ri : PseudoI<(outs), (ins GR32:$dst, i32imm:$offset),
+ []>, Sched<[WriteJump]>;
+
let mayLoad = 1 in
def TCRETURNmi : PseudoI<(outs), (ins i32mem_TC:$dst, i32imm:$offset),
[]>, Sched<[WriteJumpLd]>;
@@ -357,6 +361,9 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
def TCRETURNri64 : PseudoI<(outs),
(ins ptr_rc_tailcall:$dst, i32imm:$offset),
[]>, Sched<[WriteJump]>;
+ def TCRETURN_WIN64ri : PseudoI<(outs), (ins GR64_TCW64:$dst, i32imm:$offset),
+ []>, Sched<[WriteJump]>;
+
def TCRETURNri64_ImpCall : PseudoI<(outs),
(ins GR64_A:$dst, i32imm:$offset),
[]>, Sched<[WriteJump]>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index abf365eedec3..a68edf4d2b7e 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -83,8 +83,9 @@ static cl::opt<unsigned> UndefRegClearance(
// Pin the vtable to this file.
void X86InstrInfo::anchor() {}
-X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
- : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
+X86InstrInfo::X86InstrInfo(const X86Subtarget &STI)
+ : X86GenInstrInfo(STI,
+ (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
: X86::ADJCALLSTACKDOWN32),
(STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
: X86::ADJCALLSTACKUP32),
@@ -4399,13 +4400,8 @@ static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI) {
if (STI.hasFP16())
return Load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
if (Load)
- return STI.hasAVX512() ? X86::VMOVSSZrm
- : STI.hasAVX() ? X86::VMOVSSrm
- : X86::MOVSSrm;
- else
- return STI.hasAVX512() ? X86::VMOVSSZmr
- : STI.hasAVX() ? X86::VMOVSSmr
- : X86::MOVSSmr;
+ return X86::MOVSHPrm;
+ return X86::MOVSHPmr;
}
static unsigned getLoadStoreRegOpcode(Register Reg,
@@ -4903,6 +4899,16 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
CmpMask = ~0;
CmpValue = 0;
return true;
+ case X86::TEST64ri32:
+ case X86::TEST32ri:
+ case X86::TEST16ri:
+ case X86::TEST8ri:
+ SrcReg = MI.getOperand(0).getReg();
+ SrcReg2 = 0;
+ // Force identical compare.
+ CmpMask = 0;
+ CmpValue = 0;
+ return true;
}
return false;
}
@@ -4942,6 +4948,10 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
case X86::CMP32ri:
case X86::CMP16ri:
case X86::CMP8ri:
+ case X86::TEST64ri32:
+ case X86::TEST32ri:
+ case X86::TEST16ri:
+ case X86::TEST8ri:
CASE_ND(SUB64ri32)
CASE_ND(SUB32ri)
CASE_ND(SUB16ri)
@@ -6131,6 +6141,25 @@ static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
return true;
}
+static bool expandMOVSHP(MachineInstrBuilder &MIB, MachineInstr &MI,
+ const TargetInstrInfo &TII, bool HasAVX) {
+ unsigned NewOpc;
+ if (MI.getOpcode() == X86::MOVSHPrm) {
+ NewOpc = HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
+ Register Reg = MI.getOperand(0).getReg();
+ if (Reg > X86::XMM15)
+ NewOpc = X86::VMOVSSZrm;
+ } else {
+ NewOpc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
+ Register Reg = MI.getOperand(5).getReg();
+ if (Reg > X86::XMM15)
+ NewOpc = X86::VMOVSSZmr;
+ }
+
+ MIB->setDesc(TII.get(NewOpc));
+ return true;
+}
+
bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
bool HasAVX = Subtarget.hasAVX();
MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
@@ -6203,6 +6232,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
}
+ case X86::MOVSHPmr:
+ case X86::MOVSHPrm:
+ return expandMOVSHP(MIB, MI, *this, Subtarget.hasAVX());
case X86::V_SETALLONES:
return Expand2AddrUndef(MIB,
get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 9dc5f4b0e086..f087b7f20ff6 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -222,7 +222,7 @@ inline static bool isMemInstrWithGOTPCREL(const MachineInstr &MI) {
}
class X86InstrInfo final : public X86GenInstrInfo {
- X86Subtarget &Subtarget;
+ const X86Subtarget &Subtarget;
const X86RegisterInfo RI;
LLVM_DECLARE_VIRTUAL_ANCHOR_FUNCTION();
@@ -238,7 +238,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
bool MakeChange) const;
public:
- explicit X86InstrInfo(X86Subtarget &STI);
+ explicit X86InstrInfo(const X86Subtarget &STI);
/// Given a machine instruction descriptor, returns the register
/// class constraint for OpNum, or NULL. Returned register class
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index df1541e9085b..8339c2081842 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -69,11 +69,8 @@ def NoAVX : Predicate<"!Subtarget->hasAVX()">;
def HasAVX : Predicate<"Subtarget->hasAVX()">;
def HasAVX2 : Predicate<"Subtarget->hasAVX2()">;
def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
-def HasEVEX512 : Predicate<"Subtarget->hasEVEX512()">;
def HasAVX10_1 : Predicate<"Subtarget->hasAVX10_1()">;
-def HasAVX10_1_512 : Predicate<"Subtarget->hasAVX10_1_512()">;
def HasAVX10_2 : Predicate<"Subtarget->hasAVX10_2()">;
-def HasAVX10_2_512 : Predicate<"Subtarget->hasAVX10_2_512()">;
def NoAVX10_2 : Predicate<"!Subtarget->hasAVX10_2()">;
def HasAVX512 : Predicate<"Subtarget->hasAVX512()">;
def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">;
@@ -233,6 +230,13 @@ let RecomputePerFunction = 1 in {
"!Subtarget->hasSSE41()">;
def ImportCallOptimizationEnabled : Predicate<"MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">;
def ImportCallOptimizationDisabled : Predicate<"!MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">;
+
+ def IsWin64CCFunc : Predicate<"Subtarget->isCallingConvWin64(MF->getFunction().getCallingConv())">;
+ def IsNotWin64CCFunc : Predicate<"!Subtarget->isCallingConvWin64(MF->getFunction().getCallingConv())">;
+ def IsHiPECCFunc : Predicate<"MF->getFunction().getCallingConv() == CallingConv::HiPE">;
+
+ def IsNotHiPECCFunc : Predicate<
+ "MF->getFunction().getCallingConv() != CallingConv::HiPE">;
}
def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 1acc0cd8da20..b7926497c92b 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -267,6 +267,18 @@ multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
}
}
+// pseudo instruction for fp16 spilling.
+let isPseudo = 1, Predicates = [HasSSE2] in {
+ let mayStore = 1 in
+ def MOVSHPmr : I<0, Pseudo, (outs), (ins f32mem:$dst, FR16X:$src), "",
+ [], SSEPackedSingle>,
+ Sched<[WriteFStore]>;
+ let mayLoad = 1 in
+ def MOVSHPrm : I<0, Pseudo, (outs FR16X:$dst), (ins f32mem:$src), "",
+ [], SSEPackedSingle>,
+ Sched<[WriteFLoad]>;
+}
+
defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
SSEPackedSingle, UseSSE1>, TB, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 632db7e4326e..4188487d7591 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -825,7 +825,8 @@ bool X86TargetLowering::lowerInterleavedLoad(
bool X86TargetLowering::lowerInterleavedStore(Instruction *Store,
Value *LaneMask,
ShuffleVectorInst *SVI,
- unsigned Factor) const {
+ unsigned Factor,
+ const APInt &GapMask) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
@@ -836,7 +837,8 @@ bool X86TargetLowering::lowerInterleavedStore(Instruction *Store,
auto *SI = dyn_cast<StoreInst>(Store);
if (!SI)
return false;
- assert(!LaneMask && "Unexpected mask on store");
+ assert(!LaneMask && GapMask.popcount() == Factor &&
+ "Unexpected mask on store");
// Holds the indices of SVI that correspond to the starting index of each
// interleaved shuffle.
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 595ad3290eed..9ec04e740a08 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -204,15 +204,7 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
// we can still use 64-bit register as long as we know the high bits
// are zeros.
// Reflect that in the returned register class.
- if (Is64Bit) {
- // When the target also allows 64-bit frame pointer and we do have a
- // frame, this is fine to use it for the address accesses as well.
- const X86FrameLowering *TFI = getFrameLowering(MF);
- return TFI->hasFP(MF) && TFI->Uses64BitFramePtr
- ? &X86::LOW32_ADDR_ACCESS_RBPRegClass
- : &X86::LOW32_ADDR_ACCESSRegClass;
- }
- return &X86::GR32RegClass;
+ return Is64Bit ? &X86::LOW32_ADDR_ACCESSRegClass : &X86::GR32RegClass;
case 1: // Normal GPRs except the stack pointer (for encoding reasons).
if (Subtarget.isTarget64BitLP64())
return &X86::GR64_NOSPRegClass;
@@ -228,25 +220,11 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
// NOSP does not contain RIP, so no special case here.
return &X86::GR32_NOREX_NOSPRegClass;
case 4: // Available for tailcall (not callee-saved GPRs).
- return getGPRsForTailCall(MF);
+ return Is64Bit ? &X86::GR64_TCRegClass : &X86::GR32_TCRegClass;
}
}
const TargetRegisterClass *
-X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const {
- const Function &F = MF.getFunction();
- if (IsWin64 || IsUEFI64 || (F.getCallingConv() == CallingConv::Win64))
- return &X86::GR64_TCW64RegClass;
- else if (Is64Bit)
- return &X86::GR64_TCRegClass;
-
- bool hasHipeCC = (F.getCallingConv() == CallingConv::HiPE);
- if (hasHipeCC)
- return &X86::GR32RegClass;
- return &X86::GR32_TCRegClass;
-}
-
-const TargetRegisterClass *
X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
if (RC == &X86::CCRRegClass) {
if (Is64Bit)
@@ -1007,11 +985,10 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
unsigned X86RegisterInfo::findDeadCallerSavedReg(
MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) const {
const MachineFunction *MF = MBB.getParent();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
if (MF->callsEHReturn())
return 0;
- const TargetRegisterClass &AvailableRegs = *getGPRsForTailCall(*MF);
-
if (MBBI == MBB.end())
return 0;
@@ -1026,6 +1003,8 @@ unsigned X86RegisterInfo::findDeadCallerSavedReg(
case X86::RETI64:
case X86::TCRETURNdi:
case X86::TCRETURNri:
+ case X86::TCRETURN_WIN64ri:
+ case X86::TCRETURN_HIPE32ri:
case X86::TCRETURNmi:
case X86::TCRETURNdi64:
case X86::TCRETURNri64:
@@ -1033,20 +1012,16 @@ unsigned X86RegisterInfo::findDeadCallerSavedReg(
case X86::TCRETURNmi64:
case X86::EH_RETURN:
case X86::EH_RETURN64: {
- SmallSet<uint16_t, 8> Uses;
- for (MachineOperand &MO : MBBI->operands()) {
- if (!MO.isReg() || MO.isDef())
- continue;
- Register Reg = MO.getReg();
- if (!Reg)
- continue;
- for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI)
- Uses.insert(*AI);
+ LiveRegUnits LRU(*this);
+ LRU.addLiveOuts(MBB);
+ LRU.stepBackward(*MBBI);
+
+ const TargetRegisterClass &RC =
+ Is64Bit ? X86::GR64_NOSPRegClass : X86::GR32_NOSPRegClass;
+ for (MCRegister Reg : RC) {
+ if (LRU.available(Reg) && !MRI.isReserved(Reg))
+ return Reg;
}
-
- for (auto CS : AvailableRegs)
- if (!Uses.count(CS) && CS != X86::RIP && CS != X86::RSP && CS != X86::ESP)
- return CS;
}
}
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h
index 2f4c55cfad6d..d022e5ab8794 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.h
+++ b/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -87,11 +87,6 @@ public:
const TargetRegisterClass *
getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
- /// getGPRsForTailCall - Returns a register class with registers that can be
- /// used in forming tail calls.
- const TargetRegisterClass *
- getGPRsForTailCall(const MachineFunction &MF) const;
-
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index e9ca25d808a5..99b7910131dc 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -716,10 +716,7 @@ def GR64_NOREX2_NOSP : RegisterClass<"X86", [i64], 64,
// which we do not have right now.
def LOW32_ADDR_ACCESS : RegisterClass<"X86", [i32], 32, (add GR32, RIP)>;
-// When RBP is used as a base pointer in a 32-bit addresses environment,
-// this is also safe to use the full register to access addresses.
-// Since RBP will never be spilled, stick to a 32 alignment to save
-// on memory consumption.
+// FIXME: This is unused, but deleting it results in codegen changes
def LOW32_ADDR_ACCESS_RBP : RegisterClass<"X86", [i32], 32,
(add LOW32_ADDR_ACCESS, RBP)>;
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td
index 9e271c1ee370..044b77f7aacf 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver3.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td
@@ -992,14 +992,14 @@ def Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn3FPFMisc0]> {
def : InstRW<[Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rri, VEXTRACTI128rri)>;
def Zn3WriteVEXTRACTI128mr : SchedWriteRes<[Zn3FPFMisc0, Zn3FPSt, Zn3Store]> {
- let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
+ let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
let ReleaseAtCycles = [1, 1, 1];
let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteVEXTRACTI128mr], (instrs VEXTRACTI128mri, VEXTRACTF128mri)>;
def Zn3WriteVINSERTF128rmr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPFMisc0]> {
- let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
+ let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
let ReleaseAtCycles = [1, 1, 1];
let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
}
@@ -1221,7 +1221,7 @@ def Zn3WriteSHA1MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
def : InstRW<[Zn3WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;
def Zn3WriteSHA1MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
- let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG1rr.Latency);
+ let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteSHA1MSG1rr.Latency);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = !add(Zn3WriteSHA1MSG1rr.NumMicroOps, 0);
}
@@ -1235,7 +1235,7 @@ def Zn3WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn3FPU0123]> {
def : InstRW<[Zn3WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;
def Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
- let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
+ let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = !add(Zn3WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
}
@@ -1249,7 +1249,7 @@ def Zn3WriteSHA256MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
def : InstRW<[Zn3WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;
def Zn3Writerm_SHA256MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
- let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG1rr.Latency);
+ let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteSHA256MSG1rr.Latency);
let ReleaseAtCycles = [1, 1, 3];
let NumMicroOps = !add(Zn3WriteSHA256MSG1rr.NumMicroOps, 0);
}
@@ -1263,7 +1263,7 @@ def Zn3WriteSHA256MSG2rr : SchedWriteRes<[Zn3FPU0123]> {
def : InstRW<[Zn3WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;
def Zn3WriteSHA256MSG2rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
- let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG2rr.Latency);
+ let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteSHA256MSG2rr.Latency);
let ReleaseAtCycles = [1, 1, 8];
let NumMicroOps = !add(Zn3WriteSHA256MSG2rr.NumMicroOps, 1);
}
@@ -1338,14 +1338,14 @@ def Zn3WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn3FPVShuf]> {
def : InstRW<[Zn3WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rri, VPERM2F128rri)>;
def Zn3WriteVPERM2F128rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
- let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency);
+ let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency);
let ReleaseAtCycles = [1, 1, 1];
let NumMicroOps = !add(Zn3WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
}
def : InstRW<[Zn3WriteVPERM2F128rm], (instrs VPERM2F128rmi)>;
def Zn3WriteVPERMPSYrm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
- let Latency = !add(Znver3Model.LoadLatency, 7);
+ let Latency = !add(Znver3Model.VecLoadLatency, 7);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = 3;
}
@@ -1359,14 +1359,14 @@ def Zn3WriteVPERMYri : SchedWriteRes<[Zn3FPVShuf]> {
def : InstRW<[Zn3WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
def Zn3WriteVPERMPDYmi : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
- let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMYri.Latency);
+ let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteVPERMYri.Latency);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = !add(Zn3WriteVPERMYri.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
def Zn3WriteVPERMDYm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
- let Latency = !add(Znver3Model.LoadLatency, 5);
+ let Latency = !add(Znver3Model.VecLoadLatency, 5);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = 2;
}
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 74d916d41f83..a93c7e3a82f1 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -1005,14 +1005,14 @@ def Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn4FPFMisc0]> {
def : InstRW<[Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rri, VEXTRACTI128rri)>;
def Zn4WriteVEXTRACTI128mr : SchedWriteRes<[Zn4FPFMisc0, Zn4FPSt, Zn4Store]> {
- let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
+ let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
let ReleaseAtCycles = [1, 1, 1];
let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteVEXTRACTI128mr], (instrs VEXTRACTI128mri, VEXTRACTF128mri)>;
def Zn4WriteVINSERTF128rmr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPFMisc0]> {
- let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
+ let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
let ReleaseAtCycles = [1, 1, 1];
let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
}
@@ -1262,7 +1262,7 @@ def Zn4WriteSHA1MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
def : InstRW<[Zn4WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;
def Zn4WriteSHA1MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
- let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG1rr.Latency);
+ let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteSHA1MSG1rr.Latency);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = !add(Zn4WriteSHA1MSG1rr.NumMicroOps, 0);
}
@@ -1276,7 +1276,7 @@ def Zn4WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn4FPU0123]> {
def : InstRW<[Zn4WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;
def Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
- let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
+ let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = !add(Zn4WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
}
@@ -1290,7 +1290,7 @@ def Zn4WriteSHA256MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
def : InstRW<[Zn4WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;
def Zn4Writerm_SHA256MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
- let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG1rr.Latency);
+ let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteSHA256MSG1rr.Latency);
let ReleaseAtCycles = [1, 1, 3];
let NumMicroOps = !add(Zn4WriteSHA256MSG1rr.NumMicroOps, 0);
}
@@ -1304,7 +1304,7 @@ def Zn4WriteSHA256MSG2rr : SchedWriteRes<[Zn4FPU0123]> {
def : InstRW<[Zn4WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;
def Zn4WriteSHA256MSG2rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
- let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG2rr.Latency);
+ let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteSHA256MSG2rr.Latency);
let ReleaseAtCycles = [1, 1, 8];
let NumMicroOps = !add(Zn4WriteSHA256MSG2rr.NumMicroOps, 1);
}
@@ -1379,7 +1379,7 @@ def Zn4WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn4FPVShuf]> {
def : InstRW<[Zn4WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rri, VPERM2F128rri)>;
def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
- let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency);
+ let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency);
let ReleaseAtCycles = [1, 1, 1];
let NumMicroOps = !add(Zn4WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
}
@@ -1393,7 +1393,7 @@ def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;
def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
- let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMPSYrr.Latency);
+ let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMPSYrr.Latency);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
}
@@ -1407,7 +1407,7 @@ def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
- let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMYri.Latency);
+ let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMYri.Latency);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1);
}
@@ -1421,7 +1421,7 @@ def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;
def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
- let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMDYrr.Latency);
+ let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMDYrr.Latency);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0);
}
@@ -1534,9 +1534,9 @@ def Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr : SchedWriteRes<[Zn4FPFMisc01]> {
let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr], (instregex
- "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz",
+ "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz",
"VFIXUPIMM(S|P)(S|D)(Z128|Z256?)rri", "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)",
- "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz"
+ "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz"
)>;
// SCALE & REDUCE instructions
@@ -1567,7 +1567,7 @@ def Zn4WriteBUSDr_VPMADDr: SchedWriteRes<[Zn4FPFMisc01]> {
let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteBUSDr_VPMADDr], (instregex
- "VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)",
+ "VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)",
"VPMADD52(H|L)UQ(Z|Z128|Z256)(r|rk|rkz)"
)>;
@@ -1586,7 +1586,7 @@ def : InstRW<[Zn4WriteSHIFTrr], (instregex
"(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)",
"(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)",
"(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)",
- "VPSHUFBITQMBZ128rr", "VFMSUB231SSZrkz_Int"
+ "VPSHUFBITQMBZ128rr", "VFMSUB231SSZrkz_Int"
)>;
def Zn4WriteSHIFTri: SchedWriteRes<[Zn4FPFMisc01]> {
@@ -1598,24 +1598,40 @@ def : InstRW<[Zn4WriteSHIFTri], (instregex
"VP(SLL|SRL|SRA)(D|Q|W)(Z|Z128|Z256?)(ri|rik|rikz)"
)>;
-// ALIGN Instructions
-def Zn4WriteALIGN: SchedWriteRes<[Zn4FPFMisc12]> {
+// ALIGNR Instructions
+def Zn4WriteALIGNR: SchedWriteRes<[Zn4FPFMisc12]> {
+ let Latency = 2;
+ let ReleaseAtCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteALIGNR], (instregex
+ "(V?)PALIGNR(Y?|Z128?|Z256?)(rri|rrik|rrikz)"
+ )>;
+def Zn4WriteALIGNRZ: SchedWriteRes<[Zn4FPFMisc12]> {
let Latency = 2;
let ReleaseAtCycles = [2];
let NumMicroOps = 1;
}
-def : InstRW<[Zn4WriteALIGN], (instregex
- "(V?)PALIGNR(Z?|Z128?|Z256?)(rri|rrik|rrikz)"
+def : InstRW<[Zn4WriteALIGNRZ], (instregex
+ "(V?)PALIGNRZ(rri|rrik|rrikz)"
)>;
-//PACK Instructions
+// PACK Instructions
def Zn4WritePACK: SchedWriteRes<[Zn4FPFMisc12]> {
let Latency = 2;
- let ReleaseAtCycles = [2];
+ let ReleaseAtCycles = [1];
let NumMicroOps = 1;
}
def : InstRW<[Zn4WritePACK], (instregex
- "(V?)PACK(SS|US)(DW|WB)(Z?|Z128?|Z256?)(rr|rrk|rrkz)"
+ "(V?)PACK(SS|US)(DW|WB)(Y?|Z128?|Z256?)(rr|rrk|rrkz)"
+ )>;
+def Zn4WritePACKZ: SchedWriteRes<[Zn4FPFMisc12]> {
+ let Latency = 2;
+ let ReleaseAtCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WritePACKZ], (instregex
+ "(V?)PACK(SS|US)(DW|WB)Z(rr|rrk|rrkz)"
)>;
// MAX and MIN Instructions
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 8ad8d423d10c..fd5f34b60efb 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -261,26 +261,8 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
if (!FS.empty())
FullFS = (Twine(FullFS) + "," + FS).str();
- // Attach EVEX512 feature when we have AVX512 features with a default CPU.
- // "pentium4" is default CPU for 32-bit targets.
- // "x86-64" is default CPU for 64-bit targets.
- if (CPU == "generic" || CPU == "pentium4" || CPU == "x86-64") {
- size_t posNoEVEX512 = FS.rfind("-evex512");
- // Make sure we won't be cheated by "-avx512fp16".
- size_t posNoAVX512F =
- FS.ends_with("-avx512f") ? FS.size() - 8 : FS.rfind("-avx512f,");
- size_t posEVEX512 = FS.rfind("+evex512");
- // Any AVX512XXX will enable AVX512F.
- size_t posAVX512F = FS.rfind("+avx512");
-
- if (posAVX512F != StringRef::npos &&
- (posNoAVX512F == StringRef::npos || posNoAVX512F < posAVX512F))
- if (posEVEX512 == StringRef::npos && posNoEVEX512 == StringRef::npos)
- FullFS += ",+evex512";
- }
-
// Disable 64-bit only features in non-64-bit mode.
- SmallVector<StringRef, 9> FeaturesIn64BitOnly = {
+ StringRef FeaturesIn64BitOnly[] = {
"egpr", "push2pop2", "ppx", "ndd", "ccmp", "nf", "cf", "zu", "uintr"};
if (FullFS.find("-64bit-mode") != std::string::npos)
for (StringRef F : FeaturesIn64BitOnly)
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index be49214e041e..fa3f3b59741d 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -226,8 +226,7 @@ public:
// TODO: Currently we're always allowing widening on CPUs without VLX,
// because for many cases we don't have a better option.
bool canExtendTo512DQ() const {
- return hasAVX512() && hasEVEX512() &&
- (!hasVLX() || getPreferVectorWidth() >= 512);
+ return hasAVX512() && (!hasVLX() || getPreferVectorWidth() >= 512);
}
bool canExtendTo512BW() const {
return hasBWI() && canExtendTo512DQ();
@@ -247,8 +246,7 @@ public:
// If there are no 512-bit vectors and we prefer not to use 512-bit registers,
// disable them in the legalizer.
bool useAVX512Regs() const {
- return hasAVX512() && hasEVEX512() &&
- (canExtendTo512DQ() || RequiredVectorWidth > 256);
+ return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256);
}
bool useLight256BitInstructions() const {
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 62f95277d016..3d8d0a236a3c 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -213,7 +213,7 @@ X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
case TargetTransformInfo::RGK_Scalar:
return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
case TargetTransformInfo::RGK_FixedWidthVector:
- if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
+ if (ST->hasAVX512() && PreferVectorWidth >= 512)
return TypeSize::getFixed(512);
if (ST->hasAVX() && PreferVectorWidth >= 256)
return TypeSize::getFixed(256);
@@ -1206,6 +1206,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
{ ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
{ ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
+ { X86ISD::PMULUDQ, MVT::v4i64, { 3, 5, 5, 6 } }, // pmuludq + split
+
{ ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
{ ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
{ ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
@@ -6591,7 +6593,7 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
// Only enable vector loads for equality comparison. Right now the vector
// version is not as fast for three way compare (see #33329).
const unsigned PreferredWidth = ST->getPreferVectorWidth();
- if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
+ if (PreferredWidth >= 512 && ST->hasAVX512())
Options.LoadSizes.push_back(64);
if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
diff --git a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp
index ea8b88f41bb8..9bf0abb018c9 100644
--- a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp
+++ b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp
@@ -105,6 +105,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
// Prolog information.
SmallVector<int64_t> PushedRegs;
bool HasStackAlloc = false;
+ bool HasSetFrame = false;
unsigned ApproximatePrologCodeCount = 0;
// Requested changes.
@@ -130,15 +131,20 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
break;
case X86::SEH_StackAlloc:
- case X86::SEH_SetFrame:
if (State != FunctionState::InProlog)
- llvm_unreachable("SEH_StackAlloc or SEH_SetFrame outside of prolog");
+ llvm_unreachable("SEH_StackAlloc outside of prolog");
// Assume a large alloc...
- ApproximatePrologCodeCount +=
- (MI.getOpcode() == X86::SEH_StackAlloc) ? 3 : 1;
+ ApproximatePrologCodeCount += 3;
HasStackAlloc = true;
break;
+ case X86::SEH_SetFrame:
+ if (State != FunctionState::InProlog)
+ llvm_unreachable("SEH_SetFrame outside of prolog");
+ ApproximatePrologCodeCount++;
+ HasSetFrame = true;
+ break;
+
case X86::SEH_SaveReg:
case X86::SEH_SaveXMM:
if (State != FunctionState::InProlog)
@@ -190,8 +196,30 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
State = FunctionState::FinishedEpilog;
break;
- case X86::LEA64r:
case X86::MOV64rr:
+ if (State == FunctionState::InEpilog) {
+ // A mov in the epilog sets the frame back, so the prolog must have set the
+ // frame, and it must precede any register pops and the stack deallocation.
+ if (!HasSetFrame)
+ return rejectCurrentFunctionInternalError(
+ MF, Mode,
+ "The epilog is setting frame back, but prolog did not set it");
+ if (PoppedRegCount > 0)
+ return rejectCurrentFunctionInternalError(
+ MF, Mode,
+ "The epilog is setting the frame back after popping "
+ "registers");
+ if (HasStackDealloc)
+ return rejectCurrentFunctionInternalError(
+ MF, Mode,
+ "Cannot set the frame back after the stack "
+ "allocation has been deallocated");
+ } else if (State == FunctionState::FinishedEpilog)
+ return rejectCurrentFunctionInternalError(
+ MF, Mode, "Unexpected mov instruction after the epilog");
+ break;
+
+ case X86::LEA64r:
case X86::ADD64ri32:
if (State == FunctionState::InEpilog) {
// If the prolog contains a stack allocation, then the first
@@ -211,8 +239,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
HasStackDealloc = true;
} else if (State == FunctionState::FinishedEpilog)
return rejectCurrentFunctionInternalError(
- MF, Mode,
- "Unexpected lea, mov or add instruction after the epilog");
+ MF, Mode, "Unexpected lea or add instruction after the epilog");
break;
case X86::POP64r:
@@ -278,11 +305,8 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
}
}
- if (UnwindV2StartLocations.empty()) {
- assert(State == FunctionState::InProlog &&
- "If there are no epilogs, then there should be no prolog");
+ if (UnwindV2StartLocations.empty())
return false;
- }
MachineBasicBlock &FirstMBB = MF.front();
// Assume +1 for the "header" UOP_Epilog that contains the epilog size, and
diff --git a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
index 6921f44b700c..096ad08d8a3c 100644
--- a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
+++ b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
@@ -71,113 +71,11 @@ static bool readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) {
const MCRegisterInfo *RegInfo = D->getContext().getRegisterInfo();
- return *(RegInfo->getRegClass(RC).begin() + RegNo);
+ return RegInfo->getRegClass(RC).getRegister(RegNo);
}
static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeRRegsRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeNegImmOperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus Decode2RInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus Decode2RImmInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeR2RInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus Decode2RSrcDstInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeRUSInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeRUSBitpInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus
-DecodeRUSSrcDstBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeL2RInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeLR2RInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus Decode3RInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus Decode3RImmInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus Decode2RUSInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus Decode2RUSBitpInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeL3RInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeL3RSrcDstInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeL2RUSInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeL2RUSBitpInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeL6RInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeL5RInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeL4RSrcDstInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus
-DecodeL4RSrcDstSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
- const MCDisassembler *Decoder);
-
-#include "XCoreGenDisassemblerTables.inc"
-
-static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
const MCDisassembler *Decoder) {
if (RegNo > 11)
return MCDisassembler::Fail;
@@ -249,6 +147,116 @@ Decode3OpInstruction(unsigned Insn, unsigned &Op1, unsigned &Op2,
return MCDisassembler::Success;
}
+static DecodeStatus Decode3RInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ unsigned Op1, Op2, Op3;
+ DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
+ if (S == MCDisassembler::Success) {
+ DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+ DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+ DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+ }
+ return S;
+}
+
+static DecodeStatus Decode3RImmInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ unsigned Op1, Op2, Op3;
+ DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
+ if (S == MCDisassembler::Success) {
+ Inst.addOperand(MCOperand::createImm(Op1));
+ DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+ DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+ }
+ return S;
+}
+
+static DecodeStatus Decode2RUSInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ unsigned Op1, Op2, Op3;
+ DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
+ if (S == MCDisassembler::Success) {
+ DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+ DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+ Inst.addOperand(MCOperand::createImm(Op3));
+ }
+ return S;
+}
+
+static DecodeStatus Decode2RUSBitpInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ unsigned Op1, Op2, Op3;
+ DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
+ if (S == MCDisassembler::Success) {
+ DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+ DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+ DecodeBitpOperand(Inst, Op3, Address, Decoder);
+ }
+ return S;
+}
+
+static DecodeStatus DecodeL3RInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ unsigned Op1, Op2, Op3;
+ DecodeStatus S =
+ Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+ if (S == MCDisassembler::Success) {
+ DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+ DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+ DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+ }
+ return S;
+}
+
+static DecodeStatus DecodeL3RSrcDstInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ unsigned Op1, Op2, Op3;
+ DecodeStatus S =
+ Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+ if (S == MCDisassembler::Success) {
+ DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+ DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+ DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+ DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+ }
+ return S;
+}
+
+static DecodeStatus DecodeL2RUSInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ unsigned Op1, Op2, Op3;
+ DecodeStatus S =
+ Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+ if (S == MCDisassembler::Success) {
+ DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+ DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+ Inst.addOperand(MCOperand::createImm(Op3));
+ }
+ return S;
+}
+
+static DecodeStatus DecodeL2RUSBitpInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ unsigned Op1, Op2, Op3;
+ DecodeStatus S =
+ Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+ if (S == MCDisassembler::Success) {
+ DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+ DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+ DecodeBitpOperand(Inst, Op3, Address, Decoder);
+ }
+ return S;
+}
+
+
static DecodeStatus Decode2OpInstructionFail(MCInst &Inst, unsigned Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -511,115 +519,6 @@ static DecodeStatus DecodeLR2RInstruction(MCInst &Inst, unsigned Insn,
return S;
}
-static DecodeStatus Decode3RInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned Op1, Op2, Op3;
- DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
- if (S == MCDisassembler::Success) {
- DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
- DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
- DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
- }
- return S;
-}
-
-static DecodeStatus Decode3RImmInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned Op1, Op2, Op3;
- DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
- if (S == MCDisassembler::Success) {
- Inst.addOperand(MCOperand::createImm(Op1));
- DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
- DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
- }
- return S;
-}
-
-static DecodeStatus Decode2RUSInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned Op1, Op2, Op3;
- DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
- if (S == MCDisassembler::Success) {
- DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
- DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
- Inst.addOperand(MCOperand::createImm(Op3));
- }
- return S;
-}
-
-static DecodeStatus Decode2RUSBitpInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned Op1, Op2, Op3;
- DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
- if (S == MCDisassembler::Success) {
- DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
- DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
- DecodeBitpOperand(Inst, Op3, Address, Decoder);
- }
- return S;
-}
-
-static DecodeStatus DecodeL3RInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned Op1, Op2, Op3;
- DecodeStatus S =
- Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
- if (S == MCDisassembler::Success) {
- DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
- DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
- DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
- }
- return S;
-}
-
-static DecodeStatus DecodeL3RSrcDstInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned Op1, Op2, Op3;
- DecodeStatus S =
- Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
- if (S == MCDisassembler::Success) {
- DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
- DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
- DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
- DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
- }
- return S;
-}
-
-static DecodeStatus DecodeL2RUSInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned Op1, Op2, Op3;
- DecodeStatus S =
- Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
- if (S == MCDisassembler::Success) {
- DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
- DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
- Inst.addOperand(MCOperand::createImm(Op3));
- }
- return S;
-}
-
-static DecodeStatus DecodeL2RUSBitpInstruction(MCInst &Inst, unsigned Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- unsigned Op1, Op2, Op3;
- DecodeStatus S =
- Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
- if (S == MCDisassembler::Success) {
- DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
- DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
- DecodeBitpOperand(Inst, Op3, Address, Decoder);
- }
- return S;
-}
-
static DecodeStatus DecodeL6RInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -713,6 +612,8 @@ DecodeL4RSrcDstSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
return S;
}
+#include "XCoreGenDisassemblerTables.inc"
+
MCDisassembler::DecodeStatus
XCoreDisassembler::getInstruction(MCInst &instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
index 0a86588b6bdb..1a9133aad4dd 100644
--- a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -12,6 +12,7 @@
#include "XCoreInstrInfo.h"
#include "XCore.h"
+#include "XCoreSubtarget.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -41,10 +42,9 @@ namespace XCore {
// Pin the vtable to this file.
void XCoreInstrInfo::anchor() {}
-XCoreInstrInfo::XCoreInstrInfo()
- : XCoreGenInstrInfo(XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP),
- RI() {
-}
+XCoreInstrInfo::XCoreInstrInfo(const XCoreSubtarget &ST)
+ : XCoreGenInstrInfo(ST, XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP),
+ RI() {}
static bool isZeroImm(const MachineOperand &op) {
return op.isImm() && op.getImm() == 0;
diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.h b/llvm/lib/Target/XCore/XCoreInstrInfo.h
index 5026671616fa..354339265378 100644
--- a/llvm/lib/Target/XCore/XCoreInstrInfo.h
+++ b/llvm/lib/Target/XCore/XCoreInstrInfo.h
@@ -20,12 +20,13 @@
#include "XCoreGenInstrInfo.inc"
namespace llvm {
+class XCoreSubtarget;
class XCoreInstrInfo : public XCoreGenInstrInfo {
const XCoreRegisterInfo RI;
virtual void anchor();
public:
- XCoreInstrInfo();
+ explicit XCoreInstrInfo(const XCoreSubtarget &ST);
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
/// such, whenever a client has an instance of instruction info, it should
diff --git a/llvm/lib/Target/XCore/XCoreSubtarget.cpp b/llvm/lib/Target/XCore/XCoreSubtarget.cpp
index d4b777ef447f..2f6517ec9e7a 100644
--- a/llvm/lib/Target/XCore/XCoreSubtarget.cpp
+++ b/llvm/lib/Target/XCore/XCoreSubtarget.cpp
@@ -26,5 +26,5 @@ void XCoreSubtarget::anchor() { }
XCoreSubtarget::XCoreSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const TargetMachine &TM)
- : XCoreGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), FrameLowering(*this),
- TLInfo(TM, *this) {}
+ : XCoreGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), InstrInfo(*this),
+ FrameLowering(*this), TLInfo(TM, *this) {}
diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
index f1367037bdf4..c211777e6989 100644
--- a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
@@ -232,12 +232,6 @@ XtensaTargetLowering::XtensaTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
-
- setCondCodeAction(ISD::SETOGT, MVT::f32, Expand);
- setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
- setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
- setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
- setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
} else {
setOperationAction(ISD::BITCAST, MVT::i32, Expand);
setOperationAction(ISD::BITCAST, MVT::f32, Expand);
@@ -887,6 +881,16 @@ static std::pair<unsigned, unsigned> getFPBranchKind(ISD::CondCode Cond) {
return std::make_pair(Xtensa::BF, Xtensa::OLT_S);
case ISD::SETGT:
return std::make_pair(Xtensa::BF, Xtensa::OLE_S);
+ case ISD::SETOGT:
+ return std::make_pair(Xtensa::BF, Xtensa::ULE_S);
+ case ISD::SETOGE:
+ return std::make_pair(Xtensa::BF, Xtensa::ULT_S);
+ case ISD::SETONE:
+ return std::make_pair(Xtensa::BF, Xtensa::UEQ_S);
+ case ISD::SETUGT:
+ return std::make_pair(Xtensa::BF, Xtensa::OLE_S);
+ case ISD::SETUGE:
+ return std::make_pair(Xtensa::BF, Xtensa::OLT_S);
default:
llvm_unreachable("Invalid condition!");
}
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
index 55c0729a0c9e..b0f924f2cd58 100644
--- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
@@ -48,7 +48,7 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI) {
}
XtensaInstrInfo::XtensaInstrInfo(const XtensaSubtarget &STI)
- : XtensaGenInstrInfo(Xtensa::ADJCALLSTACKDOWN, Xtensa::ADJCALLSTACKUP),
+ : XtensaGenInstrInfo(STI, Xtensa::ADJCALLSTACKDOWN, Xtensa::ADJCALLSTACKUP),
RI(STI), STI(STI) {}
Register XtensaInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,