summaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir16
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll152
-rw-r--r--llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll57
-rw-r--r--llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll76
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll76
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll76
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll3872
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll3498
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll3498
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll3872
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll109
-rw-r--r--llvm/test/CodeGen/AMDGPU/permute_i8.ll25
-rw-r--r--llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll7
15 files changed, 5008 insertions, 10356 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
index 7748b481cf5b..85cfb9b320f1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
@@ -82,8 +82,8 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
- ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR]](s32)
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
+ ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[SHL]](s32)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32)
@@ -147,10 +147,10 @@ body: |
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR1]](s32)
- ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[C]](s32)
- ; CHECK-NEXT: [[AMDGPU_FFBH_U321:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR2]](s32)
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[BITCAST]], [[C]](s32)
+ ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[SHL]](s32)
+ ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[C]](s32)
+ ; CHECK-NEXT: [[AMDGPU_FFBH_U321:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[SHL2]](s32)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]]
; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U321]], [[C1]]
@@ -175,8 +175,8 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 25
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
- ; CHECK-NEXT: [[FFBH:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR]](s32)
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
+ ; CHECK-NEXT: [[FFBH:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[SHL]](s32)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FFBH]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll
index 2e64a3456c24..7932f8d1fc5b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll
@@ -272,8 +272,8 @@ define half @test_rootn_f16_1(half %x) {
define half @test_rootn_f16_2(half %x) {
; CHECK-LABEL: define half @test_rootn_f16_2(
; CHECK-SAME: half [[X:%.*]]) {
-; CHECK-NEXT: [[__ROOTN2SQRT:%.*]] = call half @_Z4sqrtDh(half [[X]])
-; CHECK-NEXT: ret half [[__ROOTN2SQRT]]
+; CHECK-NEXT: [[CALL:%.*]] = call half @llvm.sqrt.f16(half [[X]]), !fpmath [[META0:![0-9]+]]
+; CHECK-NEXT: ret half [[CALL]]
;
%call = tail call half @_Z5rootnDhi(half %x, i32 2)
ret half %call
@@ -302,7 +302,8 @@ define half @test_rootn_f16_neg1(half %x) {
define half @test_rootn_f16_neg2(half %x) {
; CHECK-LABEL: define half @test_rootn_f16_neg2(
; CHECK-SAME: half [[X:%.*]]) {
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call half @_Z5rsqrtDh(half [[X]])
+; CHECK-NEXT: [[TMP1:%.*]] = call contract half @llvm.sqrt.f16(half [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = fdiv contract half 0xH3C00, [[TMP1]], !fpmath [[META0]]
; CHECK-NEXT: ret half [[__ROOTN2RSQRT]]
;
%call = tail call half @_Z5rootnDhi(half %x, i32 -2)
@@ -342,8 +343,7 @@ define <2 x half> @test_rootn_v2f16_0(<2 x half> %x) {
define <2 x half> @test_rootn_v2f16_1(<2 x half> %x) {
; CHECK-LABEL: define <2 x half> @test_rootn_v2f16_1(
; CHECK-SAME: <2 x half> [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call <2 x half> @_Z5rootnDv2_DhDv2_i(<2 x half> [[X]], <2 x i32> <i32 1, i32 1>)
-; CHECK-NEXT: ret <2 x half> [[CALL]]
+; CHECK-NEXT: ret <2 x half> [[X]]
;
%call = tail call <2 x half> @_Z5rootnDv2_DhDv2_i(<2 x half> %x, <2 x i32> <i32 1, i32 1>)
ret <2 x half> %call
@@ -352,7 +352,7 @@ define <2 x half> @test_rootn_v2f16_1(<2 x half> %x) {
define <2 x half> @test_rootn_v2f16_2(<2 x half> %x) {
; CHECK-LABEL: define <2 x half> @test_rootn_v2f16_2(
; CHECK-SAME: <2 x half> [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call <2 x half> @_Z5rootnDv2_DhDv2_i(<2 x half> [[X]], <2 x i32> <i32 2, i32 2>)
+; CHECK-NEXT: [[CALL:%.*]] = call <2 x half> @llvm.sqrt.v2f16(<2 x half> [[X]]), !fpmath [[META0]]
; CHECK-NEXT: ret <2 x half> [[CALL]]
;
%call = tail call <2 x half> @_Z5rootnDv2_DhDv2_i(<2 x half> %x, <2 x i32> <i32 2, i32 2>)
@@ -362,8 +362,8 @@ define <2 x half> @test_rootn_v2f16_2(<2 x half> %x) {
define <2 x half> @test_rootn_v2f16_neg1(<2 x half> %x) {
; CHECK-LABEL: define <2 x half> @test_rootn_v2f16_neg1(
; CHECK-SAME: <2 x half> [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call <2 x half> @_Z5rootnDv2_DhDv2_i(<2 x half> [[X]], <2 x i32> <i32 -1, i32 -1>)
-; CHECK-NEXT: ret <2 x half> [[CALL]]
+; CHECK-NEXT: [[__ROOTN2DIV:%.*]] = fdiv <2 x half> <half 0xH3C00, half 0xH3C00>, [[X]]
+; CHECK-NEXT: ret <2 x half> [[__ROOTN2DIV]]
;
%call = tail call <2 x half> @_Z5rootnDv2_DhDv2_i(<2 x half> %x, <2 x i32> <i32 -1, i32 -1>)
ret <2 x half> %call
@@ -372,8 +372,9 @@ define <2 x half> @test_rootn_v2f16_neg1(<2 x half> %x) {
define <2 x half> @test_rootn_v2f16_neg2(<2 x half> %x) {
; CHECK-LABEL: define <2 x half> @test_rootn_v2f16_neg2(
; CHECK-SAME: <2 x half> [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call <2 x half> @_Z5rootnDv2_DhDv2_i(<2 x half> [[X]], <2 x i32> <i32 -2, i32 -2>)
-; CHECK-NEXT: ret <2 x half> [[CALL]]
+; CHECK-NEXT: [[TMP1:%.*]] = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = fdiv contract <2 x half> <half 0xH3C00, half 0xH3C00>, [[TMP1]], !fpmath [[META0]]
+; CHECK-NEXT: ret <2 x half> [[__ROOTN2RSQRT]]
;
%call = tail call <2 x half> @_Z5rootnDv2_DhDv2_i(<2 x half> %x, <2 x i32> <i32 -2, i32 -2>)
ret <2 x half> %call
@@ -512,7 +513,8 @@ define float @test_rootn_f32__y_1__strictfp(float %x) #1 {
; CHECK-LABEL: define float @test_rootn_f32__y_1__strictfp(
; CHECK-SAME: float [[X:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: ret float [[X]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 1) #[[ATTR0]]
+; CHECK-NEXT: ret float [[CALL]]
;
entry:
%call = tail call float @_Z5rootnfi(float %x, i32 1) #1
@@ -523,8 +525,7 @@ define <2 x float> @test_rootn_v2f32__y_1(<2 x float> %x) {
; CHECK-LABEL: define <2 x float> @test_rootn_v2f32__y_1(
; CHECK-SAME: <2 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <2 x float> @_Z5rootnDv2_fDv2_i(<2 x float> [[X]], <2 x i32> <i32 1, i32 1>)
-; CHECK-NEXT: ret <2 x float> [[CALL]]
+; CHECK-NEXT: ret <2 x float> [[X]]
;
entry:
%call = tail call <2 x float> @_Z5rootnDv2_fDv2_i(<2 x float> %x, <2 x i32> <i32 1, i32 1>)
@@ -547,8 +548,7 @@ define <2 x float> @test_rootn_v2f32__y_1_undef(<2 x float> %x) {
; CHECK-LABEL: define <2 x float> @test_rootn_v2f32__y_1_undef(
; CHECK-SAME: <2 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <2 x float> @_Z5rootnDv2_fDv2_i(<2 x float> [[X]], <2 x i32> <i32 1, i32 poison>)
-; CHECK-NEXT: ret <2 x float> [[CALL]]
+; CHECK-NEXT: ret <2 x float> [[X]]
;
entry:
%call = tail call <2 x float> @_Z5rootnDv2_fDv2_i(<2 x float> %x, <2 x i32> <i32 1, i32 poison>)
@@ -559,8 +559,7 @@ define <3 x float> @test_rootn_v3f32__y_1(<3 x float> %x) {
; CHECK-LABEL: define <3 x float> @test_rootn_v3f32__y_1(
; CHECK-SAME: <3 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <3 x float> @_Z5rootnDv3_fDv3_i(<3 x float> [[X]], <3 x i32> <i32 1, i32 1, i32 1>)
-; CHECK-NEXT: ret <3 x float> [[CALL]]
+; CHECK-NEXT: ret <3 x float> [[X]]
;
entry:
%call = tail call <3 x float> @_Z5rootnDv3_fDv3_i(<3 x float> %x, <3 x i32> <i32 1, i32 1, i32 1>)
@@ -571,8 +570,7 @@ define <3 x float> @test_rootn_v3f32__y_1_undef(<3 x float> %x) {
; CHECK-LABEL: define <3 x float> @test_rootn_v3f32__y_1_undef(
; CHECK-SAME: <3 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <3 x float> @_Z5rootnDv3_fDv3_i(<3 x float> [[X]], <3 x i32> <i32 1, i32 1, i32 poison>)
-; CHECK-NEXT: ret <3 x float> [[CALL]]
+; CHECK-NEXT: ret <3 x float> [[X]]
;
entry:
%call = tail call <3 x float> @_Z5rootnDv3_fDv3_i(<3 x float> %x, <3 x i32> <i32 1, i32 1, i32 poison>)
@@ -583,8 +581,7 @@ define <4 x float> @test_rootn_v4f32__y_1(<4 x float> %x) {
; CHECK-LABEL: define <4 x float> @test_rootn_v4f32__y_1(
; CHECK-SAME: <4 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <4 x float> @_Z5rootnDv4_fDv4_i(<4 x float> [[X]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
-; CHECK-NEXT: ret <4 x float> [[CALL]]
+; CHECK-NEXT: ret <4 x float> [[X]]
;
entry:
%call = tail call <4 x float> @_Z5rootnDv4_fDv4_i(<4 x float> %x, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
@@ -595,8 +592,7 @@ define <8 x float> @test_rootn_v8f32__y_1(<8 x float> %x) {
; CHECK-LABEL: define <8 x float> @test_rootn_v8f32__y_1(
; CHECK-SAME: <8 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <8 x float> @_Z5rootnDv8_fDv8_i(<8 x float> [[X]], <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
-; CHECK-NEXT: ret <8 x float> [[CALL]]
+; CHECK-NEXT: ret <8 x float> [[X]]
;
entry:
%call = tail call <8 x float> @_Z5rootnDv8_fDv8_i(<8 x float> %x, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
@@ -607,8 +603,7 @@ define <16 x float> @test_rootn_v16f32__y_1(<16 x float> %x) {
; CHECK-LABEL: define <16 x float> @test_rootn_v16f32__y_1(
; CHECK-SAME: <16 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <16 x float> @_Z5rootnDv16_fDv16_i(<16 x float> [[X]], <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
-; CHECK-NEXT: ret <16 x float> [[CALL]]
+; CHECK-NEXT: ret <16 x float> [[X]]
;
entry:
%call = tail call <16 x float> @_Z5rootnDv16_fDv16_i(<16 x float> %x, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
@@ -619,8 +614,8 @@ define float @test_rootn_f32__y_2(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_2(
; CHECK-SAME: float [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2SQRT:%.*]] = call float @_Z4sqrtf(float [[X]])
-; CHECK-NEXT: ret float [[__ROOTN2SQRT]]
+; CHECK-NEXT: [[CALL:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META0]]
+; CHECK-NEXT: ret float [[CALL]]
;
entry:
%call = tail call float @_Z5rootnfi(float %x, i32 2)
@@ -631,8 +626,8 @@ define float @test_rootn_f32__y_2_flags(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_2_flags(
; CHECK-SAME: float [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2SQRT:%.*]] = call nnan nsz float @_Z4sqrtf(float [[X]])
-; CHECK-NEXT: ret float [[__ROOTN2SQRT]]
+; CHECK-NEXT: [[CALL:%.*]] = call nnan nsz float @llvm.sqrt.f32(float [[X]]), !fpmath [[META0]]
+; CHECK-NEXT: ret float [[CALL]]
;
entry:
%call = tail call nnan nsz float @_Z5rootnfi(float %x, i32 2)
@@ -644,8 +639,8 @@ define float @test_rootn_f32__y_2_fpmath_3(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_2_fpmath_3(
; CHECK-SAME: float [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2SQRT:%.*]] = call nnan nsz float @_Z4sqrtf(float [[X]])
-; CHECK-NEXT: ret float [[__ROOTN2SQRT]]
+; CHECK-NEXT: [[CALL:%.*]] = call nnan nsz float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1:![0-9]+]]
+; CHECK-NEXT: ret float [[CALL]]
;
entry:
%call = tail call nnan nsz float @_Z5rootnfi(float %x, i32 2), !fpmath !0
@@ -656,7 +651,7 @@ define <2 x float> @test_rootn_v2f32__y_2_flags(<2 x float> %x) {
; CHECK-LABEL: define <2 x float> @test_rootn_v2f32__y_2_flags(
; CHECK-SAME: <2 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call nnan nsz <2 x float> @_Z5rootnDv2_fDv2_i(<2 x float> [[X]], <2 x i32> <i32 2, i32 2>)
+; CHECK-NEXT: [[CALL:%.*]] = call nnan nsz <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META0]]
; CHECK-NEXT: ret <2 x float> [[CALL]]
;
entry:
@@ -668,7 +663,7 @@ define <3 x float> @test_rootn_v3f32__y_2(<3 x float> %x) {
; CHECK-LABEL: define <3 x float> @test_rootn_v3f32__y_2(
; CHECK-SAME: <3 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <3 x float> @_Z5rootnDv3_fDv3_i(<3 x float> [[X]], <3 x i32> <i32 2, i32 2, i32 2>)
+; CHECK-NEXT: [[CALL:%.*]] = call <3 x float> @llvm.sqrt.v3f32(<3 x float> [[X]]), !fpmath [[META0]]
; CHECK-NEXT: ret <3 x float> [[CALL]]
;
entry:
@@ -680,7 +675,7 @@ define <3 x float> @test_rootn_v3f32__y_2_undef(<3 x float> %x) {
; CHECK-LABEL: define <3 x float> @test_rootn_v3f32__y_2_undef(
; CHECK-SAME: <3 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <3 x float> @_Z5rootnDv3_fDv3_i(<3 x float> [[X]], <3 x i32> <i32 2, i32 poison, i32 2>)
+; CHECK-NEXT: [[CALL:%.*]] = call <3 x float> @llvm.sqrt.v3f32(<3 x float> [[X]]), !fpmath [[META0]]
; CHECK-NEXT: ret <3 x float> [[CALL]]
;
entry:
@@ -692,7 +687,7 @@ define <4 x float> @test_rootn_v4f32__y_2(<4 x float> %x) {
; CHECK-LABEL: define <4 x float> @test_rootn_v4f32__y_2(
; CHECK-SAME: <4 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <4 x float> @_Z5rootnDv4_fDv4_i(<4 x float> [[X]], <4 x i32> <i32 2, i32 2, i32 2, i32 2>)
+; CHECK-NEXT: [[CALL:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[X]]), !fpmath [[META0]]
; CHECK-NEXT: ret <4 x float> [[CALL]]
;
entry:
@@ -704,7 +699,7 @@ define <8 x float> @test_rootn_v8f32__y_2(<8 x float> %x) {
; CHECK-LABEL: define <8 x float> @test_rootn_v8f32__y_2(
; CHECK-SAME: <8 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <8 x float> @_Z5rootnDv8_fDv8_i(<8 x float> [[X]], <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>)
+; CHECK-NEXT: [[CALL:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[X]]), !fpmath [[META0]]
; CHECK-NEXT: ret <8 x float> [[CALL]]
;
entry:
@@ -716,7 +711,7 @@ define <16 x float> @test_rootn_v16f32__y_2(<16 x float> %x) {
; CHECK-LABEL: define <16 x float> @test_rootn_v16f32__y_2(
; CHECK-SAME: <16 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <16 x float> @_Z5rootnDv16_fDv16_i(<16 x float> [[X]], <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>)
+; CHECK-NEXT: [[CALL:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[X]]), !fpmath [[META0]]
; CHECK-NEXT: ret <16 x float> [[CALL]]
;
entry:
@@ -740,8 +735,8 @@ define <2 x float> @test_rootn_v2f32__y_3(<2 x float> %x) {
; CHECK-LABEL: define <2 x float> @test_rootn_v2f32__y_3(
; CHECK-SAME: <2 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <2 x float> @_Z5rootnDv2_fDv2_i(<2 x float> [[X]], <2 x i32> <i32 3, i32 3>)
-; CHECK-NEXT: ret <2 x float> [[CALL]]
+; CHECK-NEXT: [[__ROOTN2CBRT:%.*]] = call <2 x float> @_Z4cbrtDv2_f(<2 x float> [[X]])
+; CHECK-NEXT: ret <2 x float> [[__ROOTN2CBRT]]
;
entry:
%call = tail call <2 x float> @_Z5rootnDv2_fDv2_i(<2 x float> %x, <2 x i32> <i32 3, i32 3>)
@@ -764,7 +759,7 @@ define <2 x float> @test_rootn_v2f32__y_nonsplat_2_poison(<2 x float> %x) {
; CHECK-LABEL: define <2 x float> @test_rootn_v2f32__y_nonsplat_2_poison(
; CHECK-SAME: <2 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <2 x float> @_Z5rootnDv2_fDv2_i(<2 x float> [[X]], <2 x i32> <i32 2, i32 poison>)
+; CHECK-NEXT: [[CALL:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META0]]
; CHECK-NEXT: ret <2 x float> [[CALL]]
;
entry:
@@ -800,8 +795,8 @@ define <2 x float> @test_rootn_v2f32__y_neg1(<2 x float> %x) {
; CHECK-LABEL: define <2 x float> @test_rootn_v2f32__y_neg1(
; CHECK-SAME: <2 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <2 x float> @_Z5rootnDv2_fDv2_i(<2 x float> [[X]], <2 x i32> <i32 -1, i32 -1>)
-; CHECK-NEXT: ret <2 x float> [[CALL]]
+; CHECK-NEXT: [[__ROOTN2DIV:%.*]] = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, [[X]]
+; CHECK-NEXT: ret <2 x float> [[__ROOTN2DIV]]
;
entry:
%call = tail call <2 x float> @_Z5rootnDv2_fDv2_i(<2 x float> %x, <2 x i32> <i32 -1, i32 -1>)
@@ -812,8 +807,8 @@ define <3 x float> @test_rootn_v3f32__y_neg1(<3 x float> %x) {
; CHECK-LABEL: define <3 x float> @test_rootn_v3f32__y_neg1(
; CHECK-SAME: <3 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <3 x float> @_Z5rootnDv3_fDv3_i(<3 x float> [[X]], <3 x i32> <i32 -1, i32 -1, i32 -1>)
-; CHECK-NEXT: ret <3 x float> [[CALL]]
+; CHECK-NEXT: [[__ROOTN2DIV:%.*]] = fdiv <3 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[X]]
+; CHECK-NEXT: ret <3 x float> [[__ROOTN2DIV]]
;
entry:
%call = tail call <3 x float> @_Z5rootnDv3_fDv3_i(<3 x float> %x, <3 x i32> <i32 -1, i32 -1, i32 -1>)
@@ -824,8 +819,8 @@ define <3 x float> @test_rootn_v3f32__y_neg1_undef(<3 x float> %x) {
; CHECK-LABEL: define <3 x float> @test_rootn_v3f32__y_neg1_undef(
; CHECK-SAME: <3 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <3 x float> @_Z5rootnDv3_fDv3_i(<3 x float> [[X]], <3 x i32> <i32 -1, i32 -1, i32 poison>)
-; CHECK-NEXT: ret <3 x float> [[CALL]]
+; CHECK-NEXT: [[__ROOTN2DIV:%.*]] = fdiv <3 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[X]]
+; CHECK-NEXT: ret <3 x float> [[__ROOTN2DIV]]
;
entry:
%call = tail call <3 x float> @_Z5rootnDv3_fDv3_i(<3 x float> %x, <3 x i32> <i32 -1, i32 -1, i32 poison>)
@@ -836,8 +831,8 @@ define <4 x float> @test_rootn_v4f32__y_neg1(<4 x float> %x) {
; CHECK-LABEL: define <4 x float> @test_rootn_v4f32__y_neg1(
; CHECK-SAME: <4 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <4 x float> @_Z5rootnDv4_fDv4_i(<4 x float> [[X]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
-; CHECK-NEXT: ret <4 x float> [[CALL]]
+; CHECK-NEXT: [[__ROOTN2DIV:%.*]] = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[X]]
+; CHECK-NEXT: ret <4 x float> [[__ROOTN2DIV]]
;
entry:
%call = tail call <4 x float> @_Z5rootnDv4_fDv4_i(<4 x float> %x, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
@@ -848,8 +843,8 @@ define <8 x float> @test_rootn_v8f32__y_neg1(<8 x float> %x) {
; CHECK-LABEL: define <8 x float> @test_rootn_v8f32__y_neg1(
; CHECK-SAME: <8 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <8 x float> @_Z5rootnDv8_fDv8_i(<8 x float> [[X]], <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>)
-; CHECK-NEXT: ret <8 x float> [[CALL]]
+; CHECK-NEXT: [[__ROOTN2DIV:%.*]] = fdiv <8 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[X]]
+; CHECK-NEXT: ret <8 x float> [[__ROOTN2DIV]]
;
entry:
%call = tail call <8 x float> @_Z5rootnDv8_fDv8_i(<8 x float> %x, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>)
@@ -860,8 +855,8 @@ define <16 x float> @test_rootn_v16f32__y_neg1(<16 x float> %x) {
; CHECK-LABEL: define <16 x float> @test_rootn_v16f32__y_neg1(
; CHECK-SAME: <16 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <16 x float> @_Z5rootnDv16_fDv16_i(<16 x float> [[X]], <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>)
-; CHECK-NEXT: ret <16 x float> [[CALL]]
+; CHECK-NEXT: [[__ROOTN2DIV:%.*]] = fdiv <16 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[X]]
+; CHECK-NEXT: ret <16 x float> [[__ROOTN2DIV]]
;
entry:
%call = tail call <16 x float> @_Z5rootnDv16_fDv16_i(<16 x float> %x, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>)
@@ -872,7 +867,8 @@ define float @test_rootn_f32__y_neg2(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_neg2(
; CHECK-SAME: float [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call float @_Z5rsqrtf(float [[X]])
+; CHECK-NEXT: [[TMP0:%.*]] = call contract float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = fdiv contract float 1.000000e+00, [[TMP0]], !fpmath [[META0]]
; CHECK-NEXT: ret float [[__ROOTN2RSQRT]]
;
entry:
@@ -884,7 +880,8 @@ define float @test_rootn_f32__y_neg2__flags(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_neg2__flags(
; CHECK-SAME: float [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call nnan nsz float @_Z5rsqrtf(float [[X]])
+; CHECK-NEXT: [[TMP0:%.*]] = call nnan nsz contract float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = fdiv nnan nsz contract float 1.000000e+00, [[TMP0]], !fpmath [[META0]]
; CHECK-NEXT: ret float [[__ROOTN2RSQRT]]
;
entry:
@@ -896,7 +893,7 @@ define float @test_rootn_f32__y_neg2__strictfp(float %x) #1 {
; CHECK-LABEL: define float @test_rootn_f32__y_neg2__strictfp(
; CHECK-SAME: float [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call float @_Z5rsqrtf(float [[X]]) #[[ATTR0]]
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -2) #[[ATTR0]]
; CHECK-NEXT: ret float [[__ROOTN2RSQRT]]
;
entry:
@@ -908,7 +905,7 @@ define float @test_rootn_f32__y_neg2__noinline(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_neg2__noinline(
; CHECK-SAME: float [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call float @_Z5rsqrtf(float [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -2) #[[ATTR3:[0-9]+]]
; CHECK-NEXT: ret float [[__ROOTN2RSQRT]]
;
entry:
@@ -920,7 +917,7 @@ define float @test_rootn_f32__y_neg2__nobuiltin(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_neg2__nobuiltin(
; CHECK-SAME: float [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -2) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -2) #[[ATTR4:[0-9]+]]
; CHECK-NEXT: ret float [[CALL]]
;
entry:
@@ -932,8 +929,9 @@ define <2 x float> @test_rootn_v2f32__y_neg2(<2 x float> %x) {
; CHECK-LABEL: define <2 x float> @test_rootn_v2f32__y_neg2(
; CHECK-SAME: <2 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <2 x float> @_Z5rootnDv2_fDv2_i(<2 x float> [[X]], <2 x i32> <i32 -2, i32 -2>)
-; CHECK-NEXT: ret <2 x float> [[CALL]]
+; CHECK-NEXT: [[TMP0:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = fdiv contract <2 x float> <float 1.000000e+00, float 1.000000e+00>, [[TMP0]], !fpmath [[META0]]
+; CHECK-NEXT: ret <2 x float> [[__ROOTN2RSQRT]]
;
entry:
%call = tail call <2 x float> @_Z5rootnDv2_fDv2_i(<2 x float> %x, <2 x i32> <i32 -2, i32 -2>)
@@ -944,8 +942,9 @@ define <2 x float> @test_rootn_v2f32__y_neg2__flags(<2 x float> %x) {
; CHECK-LABEL: define <2 x float> @test_rootn_v2f32__y_neg2__flags(
; CHECK-SAME: <2 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call nnan nsz <2 x float> @_Z5rootnDv2_fDv2_i(<2 x float> [[X]], <2 x i32> <i32 -2, i32 -2>)
-; CHECK-NEXT: ret <2 x float> [[CALL]]
+; CHECK-NEXT: [[TMP0:%.*]] = call nnan nsz contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = fdiv nnan nsz contract <2 x float> <float 1.000000e+00, float 1.000000e+00>, [[TMP0]], !fpmath [[META0]]
+; CHECK-NEXT: ret <2 x float> [[__ROOTN2RSQRT]]
;
entry:
%call = tail call nsz nnan <2 x float> @_Z5rootnDv2_fDv2_i(<2 x float> %x, <2 x i32> <i32 -2, i32 -2>)
@@ -956,8 +955,8 @@ define <2 x float> @test_rootn_v2f32__y_neg2__strictfp(<2 x float> %x) #1 {
; CHECK-LABEL: define <2 x float> @test_rootn_v2f32__y_neg2__strictfp(
; CHECK-SAME: <2 x float> [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call <2 x float> @_Z5rootnDv2_fDv2_i(<2 x float> [[X]], <2 x i32> <i32 -2, i32 -2>) #[[ATTR0]]
-; CHECK-NEXT: ret <2 x float> [[CALL]]
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = tail call <2 x float> @_Z5rootnDv2_fDv2_i(<2 x float> [[X]], <2 x i32> <i32 -2, i32 -2>) #[[ATTR0]]
+; CHECK-NEXT: ret <2 x float> [[__ROOTN2RSQRT]]
;
entry:
%call = tail call <2 x float> @_Z5rootnDv2_fDv2_i(<2 x float> %x, <2 x i32> <i32 -2, i32 -2>) #1
@@ -1132,7 +1131,7 @@ define float @test_rootn_fast_f32_nobuiltin(float %x, i32 %y) {
; CHECK-LABEL: define float @test_rootn_fast_f32_nobuiltin(
; CHECK-SAME: float [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @_Z5rootnfi(float [[X]], i32 [[Y]]) #[[ATTR2]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @_Z5rootnfi(float [[X]], i32 [[Y]]) #[[ATTR4]]
; CHECK-NEXT: ret float [[CALL]]
;
entry:
@@ -1266,8 +1265,8 @@ define <2 x float> @test_rootn_afn_nnan_ninf_v2f32__y_3(<2 x float> %x) {
; CHECK-LABEL: define <2 x float> @test_rootn_afn_nnan_ninf_v2f32__y_3(
; CHECK-SAME: <2 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call nnan ninf afn <2 x float> @_Z5rootnDv2_fDv2_i(<2 x float> [[X]], <2 x i32> <i32 3, i32 3>)
-; CHECK-NEXT: ret <2 x float> [[CALL]]
+; CHECK-NEXT: [[__ROOTN2CBRT:%.*]] = call nnan ninf afn <2 x float> @_Z4cbrtDv2_f(<2 x float> [[X]])
+; CHECK-NEXT: ret <2 x float> [[__ROOTN2CBRT]]
;
entry:
%call = tail call afn nnan ninf <2 x float> @_Z5rootnDv2_fDv2_i(<2 x float> %x, <2 x i32> <i32 3, i32 3>)
@@ -1427,7 +1426,7 @@ entry:
define float @test_rootn_f32__y_0_nobuiltin(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_0_nobuiltin(
; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 0) #[[ATTR2]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 0) #[[ATTR4]]
; CHECK-NEXT: ret float [[CALL]]
;
%call = tail call float @_Z5rootnfi(float %x, i32 0) #0
@@ -1437,7 +1436,7 @@ define float @test_rootn_f32__y_0_nobuiltin(float %x) {
define float @test_rootn_f32__y_1_nobuiltin(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_1_nobuiltin(
; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 1) #[[ATTR2]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 1) #[[ATTR4]]
; CHECK-NEXT: ret float [[CALL]]
;
%call = tail call float @_Z5rootnfi(float %x, i32 1) #0
@@ -1447,7 +1446,7 @@ define float @test_rootn_f32__y_1_nobuiltin(float %x) {
define float @test_rootn_f32__y_2_nobuiltin(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_2_nobuiltin(
; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 2) #[[ATTR2]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 2) #[[ATTR4]]
; CHECK-NEXT: ret float [[CALL]]
;
%call = tail call float @_Z5rootnfi(float %x, i32 2) #0
@@ -1457,7 +1456,7 @@ define float @test_rootn_f32__y_2_nobuiltin(float %x) {
define float @test_rootn_f32__y_3_nobuiltin(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_3_nobuiltin(
; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 3) #[[ATTR2]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 3) #[[ATTR4]]
; CHECK-NEXT: ret float [[CALL]]
;
%call = tail call float @_Z5rootnfi(float %x, i32 3) #0
@@ -1467,7 +1466,7 @@ define float @test_rootn_f32__y_3_nobuiltin(float %x) {
define float @test_rootn_f32__y_neg1_nobuiltin(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_neg1_nobuiltin(
; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -1) #[[ATTR2]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -1) #[[ATTR4]]
; CHECK-NEXT: ret float [[CALL]]
;
%call = tail call float @_Z5rootnfi(float %x, i32 -1) #0
@@ -1477,7 +1476,7 @@ define float @test_rootn_f32__y_neg1_nobuiltin(float %x) {
define float @test_rootn_f32__y_neg2_nobuiltin(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_neg2_nobuiltin(
; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -2) #[[ATTR2]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -2) #[[ATTR4]]
; CHECK-NEXT: ret float [[CALL]]
;
%call = tail call float @_Z5rootnfi(float %x, i32 -2) #0
@@ -1492,6 +1491,11 @@ attributes #2 = { noinline }
!0 = !{float 3.0}
;.
; CHECK: attributes #[[ATTR0]] = { strictfp }
-; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind memory(read) }
-; CHECK: attributes #[[ATTR2]] = { nobuiltin }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind memory(read) }
+; CHECK: attributes #[[ATTR3]] = { noinline }
+; CHECK: attributes #[[ATTR4]] = { nobuiltin }
+;.
+; CHECK: [[META0]] = !{float 2.000000e+00}
+; CHECK: [[META1]] = !{float 3.000000e+00}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll b/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll
new file mode 100644
index 000000000000..7ad24302d783
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -simplify-mir -stop-after=finalize-isel < %s | FileCheck %s
+
+; Check that call / asm get an implicit-def $mode added to them in
+; strictfp functions.
+
+declare protected void @maybe_defs_mode() #0
+
+define float @call_changes_mode(float %x, float %y) #0 {
+ ; CHECK-LABEL: name: call_changes_mode
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]]
+ ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $mode
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]]
+ ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+ call void @maybe_defs_mode()
+ %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret float %val
+}
+
+define void @tail_call_changes_mode() #0 {
+ ; CHECK-LABEL: name: tail_call_changes_mode
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:ccr_sgpr_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def dead $scc
+ ; CHECK-NEXT: SI_TCRETURN killed [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, 0, csr_amdgpu, implicit-def $mode
+ tail call void @maybe_defs_mode()
+ ret void
+}
+
+define float @asm_changes_mode(float %x, float %y) #0 {
+ ; CHECK-LABEL: name: asm_changes_mode
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: INLINEASM &"; maybe defs mode", 1 /* sideeffect attdialect */
+ ; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]]
+ ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+ call void asm sideeffect "; maybe defs mode", ""()
+ %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret float %val
+}
+
+declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
+
+attributes #0 = { strictfp "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index d94a27e8c020..756b81909968 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -377,7 +377,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_lshr_b32 s0, s4, 24
+; GFX9-GISEL-NEXT: s_lshl_b32 s0, s4, 24
; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3]
@@ -452,7 +452,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_lshr_b32 s0, s4, 16
+; GFX9-GISEL-NEXT: s_lshl_b32 s0, s4, 16
; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3]
@@ -655,7 +655,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v1
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v1
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
@@ -760,7 +761,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
-; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v1
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
@@ -1167,7 +1169,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1705,8 +1708,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v0
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa s[2:3], v0, v1
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v0
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa s[2:3], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, s[2:3]
; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
@@ -2186,7 +2190,7 @@ define i7 @v_ctlz_zero_undef_i7(i7 %val) {
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i7:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v0, 25, v0
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 25, v0
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
%ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 true)
@@ -2278,7 +2282,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out,
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_lshr_b32 s0, s4, 14
+; GFX9-GISEL-NEXT: s_lshl_b32 s0, s4, 14
; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0
; GFX9-GISEL-NEXT: s_and_b32 s0, s0, 0x3ffff
; GFX9-GISEL-NEXT: s_lshr_b32 s1, s0, 16
@@ -2317,7 +2321,7 @@ define i18 @v_ctlz_zero_undef_i18(i18 %val) {
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i18:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v0, 14, v0
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 14, v0
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
%ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true)
@@ -2355,8 +2359,8 @@ define <2 x i18> @v_ctlz_zero_undef_v2i18(<2 x i18> %val) {
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i18:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v0, 14, v0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 14, v1
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 14, v0
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 14, v1
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2394,10 +2398,13 @@ define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) {
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i16:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT: s_flbit_i32_b32 s4, 0
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, s4, 16, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
%ctlz = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %val, i1 true)
ret <2 x i16> %ctlz
@@ -2439,11 +2446,15 @@ define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) {
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v3i16:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT: s_flbit_i32_b32 s4, 0
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, s4, 16, v0
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
%ctlz = call <3 x i16> @llvm.ctlz.v3i16(<3 x i16> %val, i1 true)
ret <3 x i16> %ctlz
@@ -2492,13 +2503,20 @@ define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) {
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i16:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT: s_flbit_i32_b32 s4, 0
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v3, v3
; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, s4, 16, v0
-; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, s4, 16, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
%ctlz = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %val, i1 true)
ret <4 x i16> %ctlz
@@ -2536,8 +2554,10 @@ define <2 x i8> @v_ctlz_zero_undef_v2i8(<2 x i8> %val) {
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i8:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
%ctlz = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %val, i1 true)
ret <2 x i8> %ctlz
@@ -2579,8 +2599,8 @@ define <2 x i7> @v_ctlz_zero_undef_v2i7(<2 x i7> %val) {
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i7:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v0, 25, v0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 25, v1
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 25, v0
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 25, v1
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
index b71728096093..03434caee233 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
@@ -144,8 +144,8 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco
ret float %result
}
-define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float inreg %val) #1 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(
+define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #1 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7:[0-9]+]]
; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]]
; IR-ITERATIVE: 2:
@@ -177,7 +177,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_un
; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ]
; IR-ITERATIVE-NEXT: ret float [[TMP25]]
;
-; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8:[0-9]+]]
; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]]
; IR-DPP: 2:
@@ -213,8 +213,8 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_un
ret float %result
}
-define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float %val) #1 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(
+define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, float %val) #1 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]]
; IR-ITERATIVE: 2:
@@ -262,7 +262,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un
; IR-ITERATIVE-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP8]], 0
; IR-ITERATIVE-NEXT: br i1 [[TMP34]], label [[TMP10]], label [[TMP12]]
;
-; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(
+; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]]
; IR-DPP: 2:
@@ -618,8 +618,8 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns
ret float %result
}
-define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float inreg %val) #1{
-; IR-ITERATIVE-LABEL: @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_structfp(
+define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #1{
+; IR-ITERATIVE-LABEL: @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]]
; IR-ITERATIVE: 2:
@@ -647,7 +647,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_uns
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ]
; IR-ITERATIVE-NEXT: ret float [[TMP21]]
;
-; IR-DPP-LABEL: @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_structfp(
+; IR-DPP-LABEL: @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]]
; IR-DPP: 2:
@@ -679,8 +679,8 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_uns
ret float %result
}
-define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float %val) #1{
-; IR-ITERATIVE-LABEL: @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_structfp(
+define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, float %val) #1{
+; IR-ITERATIVE-LABEL: @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]]
; IR-ITERATIVE: 2:
@@ -728,7 +728,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns
; IR-ITERATIVE-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP8]], 0
; IR-ITERATIVE-NEXT: br i1 [[TMP34]], label [[TMP10]], label [[TMP12]]
;
-; IR-DPP-LABEL: @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_structfp(
+; IR-DPP-LABEL: @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]]
; IR-DPP: 2:
@@ -968,8 +968,8 @@ define amdgpu_ps float @global_atomic_fadd_div_address_div_value_agent_scope_uns
ret float %result
}
-define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1 {
-; IR-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
+define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr, float inreg %val) #1 {
+; IR-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_strictfp(
; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
; IR-NEXT: ret float [[RESULT]]
;
@@ -977,8 +977,8 @@ define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_one_as_scope_un
ret float %result
}
-define amdgpu_ps float @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1 {
-; IR-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
+define amdgpu_ps float @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr, float %val) #1 {
+; IR-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_strictfp(
; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
; IR-NEXT: ret float [[RESULT]]
;
@@ -1022,8 +1022,8 @@ define amdgpu_ps float @global_atomic_fmin_div_address_div_value_agent_scope(ptr
ret float %result
}
-define amdgpu_ps float @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1{
-; IR-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
+define amdgpu_ps float @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_strictfp(ptr addrspace(1) %ptr, float inreg %val) #1{
+; IR-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_strictfp(
; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
; IR-NEXT: ret float [[RESULT]]
;
@@ -1031,8 +1031,8 @@ define amdgpu_ps float @global_atomic_fmax_div_address_uni_value_agent_scope_uns
ret float %result
}
-define amdgpu_ps float @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1{
-; IR-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
+define amdgpu_ps float @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) %ptr, float %val) #1{
+; IR-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_strictfp(
; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
; IR-NEXT: ret float [[RESULT]]
;
@@ -1110,8 +1110,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_scope_a
ret double %result
}
-define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(
+define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]]
; IR-ITERATIVE: 2:
@@ -1149,7 +1149,7 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_
; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ]
; IR-ITERATIVE-NEXT: ret double [[TMP31]]
;
-; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]]
; IR-DPP: 2:
@@ -1191,8 +1191,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_
ret double %result
}
-define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double %val) #1 {
-; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp(
+define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1 {
+; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(
; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8
; IR-NEXT: ret double [[RESULT]]
;
@@ -1338,8 +1338,8 @@ define amdgpu_ps double @global_atomic_fmin_double_uni_address_div_value_agent_s
ret double %result
}
-define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1{
-; IR-ITERATIVE-LABEL: @global_atomic__fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp(
+define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1{
+; IR-ITERATIVE-LABEL: @global_atomic__fmax_double_uni_address_uni_value_agent_scope_unsafe_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP26:%.*]]
; IR-ITERATIVE: 2:
@@ -1373,7 +1373,7 @@ define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_
; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP25]], [[TMP12]] ]
; IR-ITERATIVE-NEXT: ret double [[TMP27]]
;
-; IR-DPP-LABEL: @global_atomic__fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp(
+; IR-DPP-LABEL: @global_atomic__fmax_double_uni_address_uni_value_agent_scope_unsafe_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP26:%.*]]
; IR-DPP: 2:
@@ -1411,8 +1411,8 @@ define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_
ret double %result
}
-define amdgpu_ps double @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double %val) #1{
-; IR-LABEL: @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_structfp(
+define amdgpu_ps double @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1{
+; IR-LABEL: @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(
; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
; IR-NEXT: ret double [[RESULT]]
;
@@ -1528,8 +1528,8 @@ define amdgpu_ps double @global_atomic_fadd_double_div_address_div_value_agent_s
ret double %result
}
-define amdgpu_ps double @global_atomic_fadd_double_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, double inreg %val) #1 {
-; IR-LABEL: @global_atomic_fadd_double_div_address_uni_value_one_as_scope_unsafe_structfp(
+define amdgpu_ps double @global_atomic_fadd_double_div_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr, double inreg %val) #1 {
+; IR-LABEL: @global_atomic_fadd_double_div_address_uni_value_one_as_scope_unsafe_strictfp(
; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8
; IR-NEXT: ret double [[RESULT]]
;
@@ -1537,8 +1537,8 @@ define amdgpu_ps double @global_atomic_fadd_double_div_address_uni_value_one_as_
ret double %result
}
-define amdgpu_ps double @global_atomic_fadd_double_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, double %val) #1 {
-; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_one_as_scope_unsafe_structfp(
+define amdgpu_ps double @global_atomic_fadd_double_div_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr, double %val) #1 {
+; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_one_as_scope_unsafe_strictfp(
; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8
; IR-NEXT: ret double [[RESULT]]
;
@@ -1582,8 +1582,8 @@ define amdgpu_ps double @global_atomic_fmin_double_div_address_div_value_agent_s
ret double %result
}
-define amdgpu_ps double @global_atomic__fmax_double_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, double inreg %val) #1{
-; IR-LABEL: @global_atomic__fmax_double_div_address_uni_value_agent_scope_unsafe_structfp(
+define amdgpu_ps double @global_atomic__fmax_double_div_address_uni_value_agent_scope_unsafe_strictfp(ptr addrspace(1) %ptr, double inreg %val) #1{
+; IR-LABEL: @global_atomic__fmax_double_div_address_uni_value_agent_scope_unsafe_strictfp(
; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
; IR-NEXT: ret double [[RESULT]]
;
@@ -1591,8 +1591,8 @@ define amdgpu_ps double @global_atomic__fmax_double_div_address_uni_value_agent_
ret double %result
}
-define amdgpu_ps double @global_atomic__fmax_double_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, double %val) #1{
-; IR-LABEL: @global_atomic__fmax_double_div_address_div_value_agent_scope_unsafe_structfp(
+define amdgpu_ps double @global_atomic__fmax_double_div_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) %ptr, double %val) #1{
+; IR-LABEL: @global_atomic__fmax_double_div_address_div_value_agent_scope_unsafe_strictfp(
; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
; IR-NEXT: ret double [[RESULT]]
;
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
index b9234f47df19..239fe274d523 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
@@ -117,8 +117,8 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_scope_agent_scop
ret void
}
-define amdgpu_ps void @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float inreg %val) #1 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(
+define amdgpu_ps void @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #1 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7:[0-9]+]]
; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]]
; IR-ITERATIVE: 2:
@@ -142,7 +142,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_uni_value_one_as_scope_uns
; IR-ITERATIVE: 17:
; IR-ITERATIVE-NEXT: ret void
;
-; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8:[0-9]+]]
; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]]
; IR-DPP: 2:
@@ -170,8 +170,8 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_uni_value_one_as_scope_uns
ret void
}
-define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float %val) #1 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(
+define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, float %val) #1 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
; IR-ITERATIVE: 2:
@@ -208,7 +208,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_one_as_scope_uns
; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
; IR-ITERATIVE-NEXT: br i1 [[TMP24]], label [[TMP10:%.*]], label [[TMP12]]
;
-; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(
+; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
; IR-DPP: 2:
@@ -494,8 +494,8 @@ define amdgpu_ps void @global_atomic_fmin_uni_address_div_value_agent_scope_unsa
ret void
}
-define amdgpu_ps void @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float inreg %val) #1{
-; IR-ITERATIVE-LABEL: @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_structfp(
+define amdgpu_ps void @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #1{
+; IR-ITERATIVE-LABEL: @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
; IR-ITERATIVE: 2:
@@ -515,7 +515,7 @@ define amdgpu_ps void @global_atomic_fmax_uni_address_uni_value_agent_scope_unsa
; IR-ITERATIVE: 13:
; IR-ITERATIVE-NEXT: ret void
;
-; IR-DPP-LABEL: @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_structfp(
+; IR-DPP-LABEL: @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
; IR-DPP: 2:
@@ -539,8 +539,8 @@ define amdgpu_ps void @global_atomic_fmax_uni_address_uni_value_agent_scope_unsa
ret void
}
-define amdgpu_ps void @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float %val) #1{
-; IR-ITERATIVE-LABEL: @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_structfp(
+define amdgpu_ps void @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, float %val) #1{
+; IR-ITERATIVE-LABEL: @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
; IR-ITERATIVE: 2:
@@ -577,7 +577,7 @@ define amdgpu_ps void @global_atomic_fmax_uni_address_div_value_agent_scope_unsa
; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0
; IR-ITERATIVE-NEXT: br i1 [[TMP24]], label [[TMP10:%.*]], label [[TMP12]]
;
-; IR-DPP-LABEL: @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_structfp(
+; IR-DPP-LABEL: @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP33:%.*]]
; IR-DPP: 2:
@@ -774,8 +774,8 @@ define amdgpu_ps void @global_atomic_fadd_div_address_div_value_agent_scope_unsa
ret void
}
-define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1 {
-; IR-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
+define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr, float inreg %val) #1 {
+; IR-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_strictfp(
; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
; IR-NEXT: ret void
;
@@ -783,8 +783,8 @@ define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_one_as_scope_uns
ret void
}
-define amdgpu_ps void @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1 {
-; IR-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
+define amdgpu_ps void @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr, float %val) #1 {
+; IR-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_strictfp(
; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
; IR-NEXT: ret void
;
@@ -828,8 +828,8 @@ define amdgpu_ps void @global_atomic_fmin_div_address_div_value_agent_scope(ptr
ret void
}
-define amdgpu_ps void @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1{
-; IR-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
+define amdgpu_ps void @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_strictfp(ptr addrspace(1) %ptr, float inreg %val) #1{
+; IR-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_strictfp(
; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
; IR-NEXT: ret void
;
@@ -837,8 +837,8 @@ define amdgpu_ps void @global_atomic_fmax_div_address_uni_value_agent_scope_unsa
ret void
}
-define amdgpu_ps void @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1{
-; IR-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
+define amdgpu_ps void @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) %ptr, float %val) #1{
+; IR-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_strictfp(
; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
; IR-NEXT: ret void
;
@@ -902,8 +902,8 @@ define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_scope_age
ret void
}
-define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(
+define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]]
; IR-ITERATIVE: 2:
@@ -927,7 +927,7 @@ define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_one_as_sc
; IR-ITERATIVE: 17:
; IR-ITERATIVE-NEXT: ret void
;
-; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]]
; IR-DPP: 2:
@@ -955,8 +955,8 @@ define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_one_as_sc
ret void
}
-define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double %val) #1 {
-; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp(
+define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1 {
+; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(
; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8
; IR-NEXT: ret void
;
@@ -1060,8 +1060,8 @@ define amdgpu_ps void @global_atomic_fmin_double_uni_address_div_value_agent_sco
ret void
}
-define amdgpu_ps void @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1{
-; IR-ITERATIVE-LABEL: @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp(
+define amdgpu_ps void @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1{
+; IR-ITERATIVE-LABEL: @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
; IR-ITERATIVE: 2:
@@ -1081,7 +1081,7 @@ define amdgpu_ps void @global_atomic_fmax_double_uni_address_uni_value_agent_sco
; IR-ITERATIVE: 13:
; IR-ITERATIVE-NEXT: ret void
;
-; IR-DPP-LABEL: @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp(
+; IR-DPP-LABEL: @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe_strictfp(
; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
; IR-DPP: 2:
@@ -1105,8 +1105,8 @@ define amdgpu_ps void @global_atomic_fmax_double_uni_address_uni_value_agent_sco
ret void
}
-define amdgpu_ps void @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double %val) #1{
-; IR-LABEL: @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_structfp(
+define amdgpu_ps void @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1{
+; IR-LABEL: @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(
; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
; IR-NEXT: ret void
;
@@ -1194,8 +1194,8 @@ define amdgpu_ps void @global_atomic_fadd_double_div_address_div_value_agent_sco
ret void
}
-define amdgpu_ps void @global_atomic_fadd_double_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, double inreg %val) #1 {
-; IR-LABEL: @global_atomic_fadd_double_div_address_uni_value_one_as_scope_unsafe_structfp(
+define amdgpu_ps void @global_atomic_fadd_double_div_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr, double inreg %val) #1 {
+; IR-LABEL: @global_atomic_fadd_double_div_address_uni_value_one_as_scope_unsafe_strictfp(
; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8
; IR-NEXT: ret void
;
@@ -1203,8 +1203,8 @@ define amdgpu_ps void @global_atomic_fadd_double_div_address_uni_value_one_as_sc
ret void
}
-define amdgpu_ps void @global_atomic_fadd_double_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, double %val) #1 {
-; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_one_as_scope_unsafe_structfp(
+define amdgpu_ps void @global_atomic_fadd_double_div_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr, double %val) #1 {
+; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_one_as_scope_unsafe_strictfp(
; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8
; IR-NEXT: ret void
;
@@ -1248,8 +1248,8 @@ define amdgpu_ps void @global_atomic_fmin_double_div_address_div_value_agent_sco
ret void
}
-define amdgpu_ps void @global_atomic_fmax_double_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, double inreg %val) #1{
-; IR-LABEL: @global_atomic_fmax_double_div_address_uni_value_agent_scope_unsafe_structfp(
+define amdgpu_ps void @global_atomic_fmax_double_div_address_uni_value_agent_scope_unsafe_strictfp(ptr addrspace(1) %ptr, double inreg %val) #1{
+; IR-LABEL: @global_atomic_fmax_double_div_address_uni_value_agent_scope_unsafe_strictfp(
; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
; IR-NEXT: ret void
;
@@ -1257,8 +1257,8 @@ define amdgpu_ps void @global_atomic_fmax_double_div_address_uni_value_agent_sco
ret void
}
-define amdgpu_ps void @global_atomic_fmax_double_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, double %val) #1{
-; IR-LABEL: @global_atomic_fmax_double_div_address_div_value_agent_scope_unsafe_structfp(
+define amdgpu_ps void @global_atomic_fmax_double_div_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) %ptr, double %val) #1{
+; IR-LABEL: @global_atomic_fmax_double_div_address_div_value_agent_scope_unsafe_strictfp(
; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
; IR-NEXT: ret void
;
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index d7773f746c6a..6555ceb3ed33 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -1052,8 +1052,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
ret void
}
-define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
-; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -1099,7 +1099,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX7LESS-NEXT: .LBB2_3:
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -1141,7 +1141,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX9-NEXT: .LBB2_3:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -1181,7 +1181,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1064-NEXT: .LBB2_3:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -1220,7 +1220,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1032-NEXT: .LBB2_3:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec
; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
@@ -1263,7 +1263,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1164-NEXT: .LBB2_3:
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -1303,7 +1303,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1132-NEXT: .LBB2_3:
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -1345,7 +1345,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: .LBB2_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -1385,7 +1385,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1064-DPP-NEXT: .LBB2_3:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -1424,7 +1424,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1032-DPP-NEXT: .LBB2_3:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
@@ -1467,7 +1467,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1164-DPP-NEXT: .LBB2_3:
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -1511,8 +1511,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
}
-define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
-; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
@@ -1562,7 +1562,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -1628,7 +1628,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-NEXT: .LBB3_5:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -1694,7 +1694,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1064-NEXT: .LBB3_5:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -1759,7 +1759,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1032-NEXT: .LBB3_5:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
@@ -1820,7 +1820,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1164-NEXT: .LBB3_5:
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1132-NEXT: v_mov_b32_e32 v31, v0
@@ -1880,7 +1880,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1132-NEXT: .LBB3_5:
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -1964,7 +1964,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: .LBB3_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -2046,7 +2046,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1064-DPP-NEXT: .LBB3_3:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -2122,7 +2122,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1032-DPP-NEXT: .LBB3_3:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
@@ -2204,7 +2204,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: .LBB3_3:
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
@@ -3461,8 +3461,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
}
-define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
-; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
@@ -3512,7 +3512,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3578,7 +3578,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: .LBB6_5:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3644,7 +3644,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB6_5:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3709,7 +3709,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: .LBB6_5:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
@@ -3757,7 +3757,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: .LBB6_4:
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1132-NEXT: v_mov_b32_e32 v31, v0
@@ -3804,7 +3804,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: .LBB6_4:
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3888,7 +3888,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: .LBB6_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3970,7 +3970,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: .LBB6_3:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -4046,7 +4046,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: .LBB6_3:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
@@ -4115,7 +4115,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: .LBB6_2:
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
@@ -5412,1589 +5412,875 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX7LESS: ; %bb.0:
-; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s3
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT: s_mov_b32 s33, s2
-; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0
-; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3
; GFX7LESS-NEXT: ; %bb.1:
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_load_dwordx2 s[2:3], s[36:37], 0x0
-; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX7LESS-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
-; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX7LESS-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42]
-; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0
-; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
-; GFX7LESS-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12
-; GFX7LESS-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8
-; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX7LESS-NEXT: s_waitcnt expcnt(2)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT: s_mov_b32 s12, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2
; GFX7LESS-NEXT: .LBB9_3:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s42, -1
-; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], exec
-; GFX9-NEXT: s_mov_b32 s43, 0xe00000
-; GFX9-NEXT: v_mov_b32_e32 v40, v0
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX9-NEXT: s_add_u32 s40, s40, s3
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX9-NEXT: s_addc_u32 s41, s41, 0
-; GFX9-NEXT: s_mov_b32 s33, s2
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_movk_i32 s32, 0x800
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB9_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX9-NEXT: s_mov_b64 s[38:39], 0
-; GFX9-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
-; GFX9-NEXT: s_add_u32 s8, s34, 44
-; GFX9-NEXT: s_addc_u32 s9, s35, 0
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX9-NEXT: s_mov_b32 s12, s33
-; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s36
-; GFX9-NEXT: v_mov_b32_e32 v3, s37
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB9_2
; GFX9-NEXT: .LBB9_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT: s_mov_b32 s42, -1
-; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000
-; GFX1064-NEXT: s_add_u32 s40, s40, s3
-; GFX1064-NEXT: s_mov_b32 s33, s2
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mov_b32_e32 v40, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: s_addc_u32 s41, s41, 0
-; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX1064-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX1064-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s1
-; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1064-NEXT: s_mov_b32 s12, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, s36
-; GFX1064-NEXT: v_mov_b32_e32 v3, s37
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_cbranch_execnz .LBB9_2
; GFX1064-NEXT: .LBB9_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s33, s2
-; GFX1032-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032-NEXT: v_mov_b32_e32 v40, v0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT: s_mov_b32 s42, -1
-; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_add_u32 s40, s40, s3
-; GFX1032-NEXT: s_addc_u32 s41, s41, 0
-; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1032-NEXT: s_mov_b32 s38, 0
-; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2
-; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX1032-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s1
-; GFX1032-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-NEXT: v_mov_b32_e32 v3, s5
; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1032-NEXT: s_mov_b32 s12, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s36
-; GFX1032-NEXT: v_mov_b32_e32 v3, s37
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_cbranch_execnz .LBB9_2
; GFX1032-NEXT: .LBB9_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b32 s33, s2
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-NEXT: s_mov_b64 s[4:5], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1164-NEXT: s_mov_b32 s32, 32
-; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX1164-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s1
-; GFX1164-NEXT: v_mov_b32_e32 v1, s0
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-NEXT: s_mov_b32 s12, s33
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s36
-; GFX1164-NEXT: v_mov_b32_e32 v3, s37
-; GFX1164-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_cbranch_execnz .LBB9_2
; GFX1164-NEXT: .LBB9_3:
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: v_mov_b32_e32 v40, v0
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1132-NEXT: s_mov_b32 s38, 0
-; GFX1132-NEXT: s_mov_b32 s32, 32
-; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s2
-; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-NEXT: s_mov_b32 s12, s33
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_cbranch_execnz .LBB9_2
; GFX1132-NEXT: .LBB9_3:
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT: s_mov_b32 s42, -1
-; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX9-DPP-NEXT: s_mov_b32 s33, s2
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0
-; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
-; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT: s_mov_b32 s12, s33
-; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX9-DPP-NEXT: .LBB9_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT: s_mov_b32 s42, -1
-; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000
-; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3
-; GFX1064-DPP-NEXT: s_mov_b32 s33, s2
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1064-DPP-NEXT: s_clause 0x1
-; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1064-DPP-NEXT: .LBB9_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s33, s2
-; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT: s_mov_b32 s42, -1
-; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3
-; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s38, 0
-; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1032-DPP-NEXT: s_clause 0x1
-; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1032-DPP-NEXT: .LBB9_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b32 s33, s2
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
-; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1164-DPP-NEXT: .LBB9_3:
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s38, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s2
-; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1132-DPP-NEXT: .LBB9_3:
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-DPP-NEXT: s_endpgm
- %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 8
ret void
}
define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe(ptr addrspace(1) %ptr) #0 {
; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX7LESS: ; %bb.0:
-; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s50, -1
-; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s48, s48, s9
-; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT: s_mov_b32 s33, s8
-; GFX7LESS-NEXT: s_mov_b32 s40, s7
-; GFX7LESS-NEXT: s_mov_b32 s41, s6
-; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s46, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT: s_mov_b32 s12, s41
-; GFX7LESS-NEXT: s_mov_b32 s13, s40
-; GFX7LESS-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
-; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0
-; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41]
-; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8
-; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-NEXT: s_waitcnt expcnt(2)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT: s_mov_b32 s12, s41
-; GFX7LESS-NEXT: s_mov_b32 s13, s40
-; GFX7LESS-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43]
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s50, -1
-; GFX9-NEXT: s_mov_b32 s51, 0xe00000
-; GFX9-NEXT: s_add_u32 s48, s48, s9
-; GFX9-NEXT: s_addc_u32 s49, s49, 0
-; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX9-NEXT: s_mov_b32 s33, s8
-; GFX9-NEXT: s_add_u32 s8, s36, 44
-; GFX9-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX9-NEXT: s_mov_b32 s40, s7
-; GFX9-NEXT: s_mov_b32 s41, s6
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s9
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b32 s14, s8
+; GFX9-NEXT: s_add_u32 s8, s2, 44
+; GFX9-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_getpc_b64 s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT: s_mov_b32 s12, s41
-; GFX9-NEXT: s_mov_b32 s13, s40
-; GFX9-NEXT: s_mov_b32 s14, s33
-; GFX9-NEXT: v_mov_b32_e32 v31, v42
-; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT: s_movk_i32 s32, 0x800
-; GFX9-NEXT: v_mov_b32_e32 v43, 0
+; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT: v_mov_b32_e32 v41, v1
-; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX9-NEXT: v_mov_b32_e32 v40, v0
-; GFX9-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
-; GFX9-NEXT: s_add_u32 s8, s36, 44
-; GFX9-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT: s_mov_b32 s12, s41
-; GFX9-NEXT: s_mov_b32 s13, s40
-; GFX9-NEXT: s_mov_b32 s14, s33
-; GFX9-NEXT: v_mov_b32_e32 v31, v42
-; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s42
-; GFX9-NEXT: v_mov_b32_e32 v3, s43
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB10_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT: s_mov_b32 s50, -1
-; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT: s_add_u32 s48, s48, s9
-; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1064-NEXT: s_addc_u32 s49, s49, 0
-; GFX1064-NEXT: s_mov_b32 s33, s8
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_mov_b32 s40, s7
-; GFX1064-NEXT: s_mov_b32 s41, s6
-; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s38, -1
+; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-NEXT: s_mov_b32 s14, s8
+; GFX1064-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX1064-NEXT: s_mov_b32 s12, s41
-; GFX1064-NEXT: s_mov_b32 s13, s40
-; GFX1064-NEXT: s_mov_b32 s14, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT: v_mov_b32_e32 v31, v42
-; GFX1064-NEXT: s_movk_i32 s32, 0x800
-; GFX1064-NEXT: v_mov_b32_e32 v43, 0
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s6
+; GFX1064-NEXT: s_mov_b32 s13, s7
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT: s_mov_b32 s32, 0
+; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT: v_mov_b32_e32 v41, v1
-; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX1064-NEXT: v_mov_b32_e32 v40, v0
-; GFX1064-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT: v_mov_b32_e32 v31, v42
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, s42
-; GFX1064-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT: s_mov_b32 s12, s41
-; GFX1064-NEXT: s_mov_b32 s13, s40
-; GFX1064-NEXT: s_mov_b32 s14, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v3, s43
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB10_1
; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT: s_mov_b32 s50, -1
-; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1032-NEXT: s_addc_u32 s49, s49, 0
-; GFX1032-NEXT: s_mov_b32 s33, s8
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_mov_b32 s40, s7
-; GFX1032-NEXT: s_mov_b32 s41, s6
-; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s38, -1
+; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-NEXT: s_mov_b32 s14, s8
+; GFX1032-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX1032-NEXT: s_mov_b32 s12, s41
-; GFX1032-NEXT: s_mov_b32 s13, s40
-; GFX1032-NEXT: s_mov_b32 s14, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT: v_mov_b32_e32 v31, v42
-; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: v_mov_b32_e32 v43, 0
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s6
+; GFX1032-NEXT: s_mov_b32 s13, s7
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT: s_mov_b32 s32, 0
+; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT: v_mov_b32_e32 v41, v1
-; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX1032-NEXT: v_mov_b32_e32 v40, v0
-; GFX1032-NEXT: s_mov_b32 s44, 0
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1032-NEXT: s_mov_b32 s0, 0
; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT: v_mov_b32_e32 v31, v42
-; GFX1032-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s42
-; GFX1032-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT: s_mov_b32 s12, s41
-; GFX1032-NEXT: s_mov_b32 s13, s40
-; GFX1032-NEXT: s_mov_b32 s14, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT: v_mov_b32_e32 v3, s43
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1032-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_cbranch_execnz .LBB10_1
; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1164-NEXT: s_mov_b32 s33, s8
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-NEXT: s_mov_b32 s14, s8
+; GFX1164-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
-; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
-; GFX1164-NEXT: s_mov_b32 s14, s33
-; GFX1164-NEXT: s_mov_b32 s32, 32
-; GFX1164-NEXT: v_mov_b32_e32 v42, v0
-; GFX1164-NEXT: s_mov_b32 s40, s7
-; GFX1164-NEXT: s_mov_b32 s41, s6
-; GFX1164-NEXT: v_mov_b32_e32 v43, 0
+; GFX1164-NEXT: s_mov_b32 s32, 0
+; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: v_mov_b32_e32 v41, v1
-; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43]
-; GFX1164-NEXT: v_mov_b32_e32 v40, v0
-; GFX1164-NEXT: s_mov_b64 s[44:45], 0
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-NEXT: v_mov_b32_e32 v31, v42
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT: s_mov_b32 s12, s41
-; GFX1164-NEXT: s_mov_b32 s13, s40
-; GFX1164-NEXT: s_mov_b32 s14, s33
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s42
-; GFX1164-NEXT: v_mov_b32_e32 v3, s43
-; GFX1164-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB10_1
; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1132-NEXT: v_mov_b32_e32 v31, v0
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1132-NEXT: s_mov_b32 s40, s14
-; GFX1132-NEXT: s_mov_b32 s41, s13
-; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
-; GFX1132-NEXT: s_mov_b32 s32, 32
-; GFX1132-NEXT: s_mov_b32 s33, s15
-; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0
+; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
-; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43]
-; GFX1132-NEXT: s_mov_b32 s44, 0
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1132-NEXT: s_mov_b32 s0, 0
; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1132-NEXT: s_mov_b32 s12, s41
-; GFX1132-NEXT: s_mov_b32 s13, s40
-; GFX1132-NEXT: s_mov_b32 s14, s33
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1132-NEXT: s_cbranch_execnz .LBB10_1
; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT: s_mov_b32 s50, -1
-; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000
-; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX9-DPP-NEXT: s_mov_b32 s33, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_mov_b32 s40, s7
-; GFX9-DPP-NEXT: s_mov_b32 s41, s6
-; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s38, -1
+; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b32 s14, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT: s_mov_b32 s12, s41
-; GFX9-DPP-NEXT: s_mov_b32 s13, s40
-; GFX9-DPP-NEXT: s_mov_b32 s14, s33
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b32 s12, s6
+; GFX9-DPP-NEXT: s_mov_b32 s13, s7
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b32 s32, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1
-; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
-; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT: s_mov_b32 s12, s41
-; GFX9-DPP-NEXT: s_mov_b32 s13, s40
-; GFX9-DPP-NEXT: s_mov_b32 s14, s33
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1
; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1064-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT: s_clause 0x1
-; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1
; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1032-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
; GFX1032-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT: s_clause 0x1
-; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1
; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1164-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
-; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0
-; GFX1164-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1164-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1
-; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1164-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1
; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1132-DPP-NEXT: s_mov_b32 s40, s14
-; GFX1132-DPP-NEXT: s_mov_b32 s41, s13
-; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
-; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
-; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43]
-; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1132-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1132-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1
; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.float.value()
- %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8
ret void
}
-define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
-; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
@@ -7043,7 +6329,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX7LESS-NEXT: .LBB11_3:
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -7086,7 +6372,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX9-NEXT: .LBB11_3:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -7127,7 +6413,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1064-NEXT: .LBB11_3:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -7167,7 +6453,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1032-NEXT: .LBB11_3:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec
; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
@@ -7211,7 +6497,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1164-NEXT: .LBB11_3:
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -7251,7 +6537,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1132-NEXT: .LBB11_3:
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -7294,7 +6580,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: .LBB11_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -7335,7 +6621,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1064-DPP-NEXT: .LBB11_3:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -7375,7 +6661,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1032-DPP-NEXT: .LBB11_3:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
@@ -7419,7 +6705,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1164-DPP-NEXT: .LBB11_3:
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -7462,8 +6748,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
ret void
}
-define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
-; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
@@ -7516,7 +6802,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -7562,7 +6848,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -7608,7 +6894,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -7654,7 +6940,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_mov_b32 s14, s8
; GFX1164-NEXT: s_add_u32 s8, s2, 44
@@ -7691,7 +6977,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_add_u32 s8, s2, 44
; GFX1132-NEXT: s_addc_u32 s9, s3, 0
@@ -7726,7 +7012,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -7772,7 +7058,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -7818,7 +7104,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -7864,7 +7150,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
@@ -7901,7 +7187,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
@@ -8887,8 +8173,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
ret void
}
-define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
-; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
@@ -8941,7 +8227,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -8987,7 +8273,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -9033,7 +8319,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -9079,7 +8365,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_mov_b32 s14, s8
; GFX1164-NEXT: s_add_u32 s8, s2, 44
@@ -9116,7 +8402,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_add_u32 s8, s2, 44
; GFX1132-NEXT: s_addc_u32 s9, s3, 0
@@ -9151,7 +8437,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -9197,7 +8483,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -9243,7 +8529,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -9289,7 +8575,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
@@ -9326,7 +8612,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
@@ -9368,1621 +8654,947 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 {
; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
; GFX7LESS: ; %bb.0:
-; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s3
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT: s_mov_b32 s33, s2
-; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0
-; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0
+; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s14, -1
+; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s12, s12, s3
+; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3
; GFX7LESS-NEXT: ; %bb.1:
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9
-; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX7LESS-NEXT: s_mov_b32 s1, 0x43300000
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_load_dwordx2 s[2:3], s[36:37], 0x0
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000
-; GFX7LESS-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
-; GFX7LESS-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
-; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3
+; GFX7LESS-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42]
-; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0
-; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
-; GFX7LESS-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12
-; GFX7LESS-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8
-; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX7LESS-NEXT: s_waitcnt expcnt(2)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT: s_mov_b32 s12, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2
; GFX7LESS-NEXT: .LBB16_3:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s42, -1
-; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], exec
-; GFX9-NEXT: s_mov_b32 s43, 0xe00000
-; GFX9-NEXT: v_mov_b32_e32 v40, v0
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX9-NEXT: s_add_u32 s40, s40, s3
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX9-NEXT: s_addc_u32 s41, s41, 0
-; GFX9-NEXT: s_mov_b32 s33, s2
+; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-NEXT: s_add_u32 s8, s8, s3
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_addc_u32 s9, s9, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_movk_i32 s32, 0x800
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB16_3
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
-; GFX9-NEXT: s_mov_b32 s1, 0x43300000
-; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX9-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
-; GFX9-NEXT: s_add_u32 s8, s34, 44
-; GFX9-NEXT: s_addc_u32 s9, s35, 0
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX9-NEXT: s_mov_b32 s12, s33
-; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s36
-; GFX9-NEXT: v_mov_b32_e32 v3, s37
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB16_2
; GFX9-NEXT: .LBB16_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT: s_mov_b32 s42, -1
-; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000
-; GFX1064-NEXT: s_add_u32 s40, s40, s3
-; GFX1064-NEXT: s_mov_b32 s33, s2
+; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s10, -1
+; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s8, s8, s3
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-NEXT: s_addc_u32 s9, s9, 0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: s_addc_u32 s41, s41, 0
-; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB16_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s1, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
-; GFX1064-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX1064-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s1
-; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1064-NEXT: s_mov_b32 s12, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, s36
-; GFX1064-NEXT: v_mov_b32_e32 v3, s37
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_cbranch_execnz .LBB16_2
; GFX1064-NEXT: .LBB16_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s33, s2
-; GFX1032-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032-NEXT: v_mov_b32_e32 v40, v0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT: s_mov_b32 s42, -1
-; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s10, -1
+; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_add_u32 s40, s40, s3
-; GFX1032-NEXT: s_addc_u32 s41, s41, 0
-; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1032-NEXT: s_mov_b32 s38, 0
-; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB16_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2
-; GFX1032-NEXT: s_mov_b32 s1, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s1
-; GFX1032-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-NEXT: v_mov_b32_e32 v3, s5
; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1032-NEXT: s_mov_b32 s12, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s36
-; GFX1032-NEXT: v_mov_b32_e32 v3, s37
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_cbranch_execnz .LBB16_2
; GFX1032-NEXT: .LBB16_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec
-; GFX1164-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec
; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
-; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:20
-; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:16
-; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:16
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
-; GFX1164-NEXT: s_mov_b32 s32, 32
-; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-NEXT: s_cbranch_execz .LBB16_3
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1164-NEXT: s_mov_b32 s33, s2
-; GFX1164-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s1
-; GFX1164-NEXT: v_mov_b32_e32 v1, s0
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-NEXT: s_mov_b32 s12, s33
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s36
-; GFX1164-NEXT: v_mov_b32_e32 v3, s37
-; GFX1164-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_cbranch_execnz .LBB16_2
; GFX1164-NEXT: .LBB16_3:
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo
+; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0
-; GFX1132-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20
-; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16
-; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:16
-; GFX1132-NEXT: s_mov_b32 s38, 0
-; GFX1132-NEXT: s_mov_b32 s32, 32
-; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB16_3
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX1132-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-NEXT: s_mov_b32 s12, s33
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_cbranch_execnz .LBB16_2
; GFX1132-NEXT: .LBB16_3:
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT: s_mov_b32 s42, -1
-; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX9-DPP-NEXT: s_mov_b32 s33, s2
+; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s10, -1
+; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX9-DPP-NEXT: ; %bb.1:
; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
-; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000
-; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX9-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
-; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT: s_mov_b32 s12, s33
-; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX9-DPP-NEXT: .LBB16_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT: s_mov_b32 s42, -1
-; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000
-; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3
-; GFX1064-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0
; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1064-DPP-NEXT: s_clause 0x1
-; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1064-DPP-NEXT: .LBB16_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s33, s2
-; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT: s_mov_b32 s42, -1
-; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3
-; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s38, 0
-; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2
-; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX1032-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1032-DPP-NEXT: s_clause 0x1
-; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1032-DPP-NEXT: .LBB16_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:20
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:16
-; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
-; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1164-DPP-NEXT: s_mov_b32 s33, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1164-DPP-NEXT: .LBB16_3:
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16
-; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16
-; GFX1132-DPP-NEXT: s_mov_b32 s38, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX1132-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1132-DPP-NEXT: .LBB16_3:
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-DPP-NEXT: s_endpgm
- %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 monotonic, align 4
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 monotonic, align 8
ret void
}
define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 {
; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX7LESS: ; %bb.0:
-; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s50, -1
-; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s48, s48, s9
-; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT: s_mov_b32 s33, s8
-; GFX7LESS-NEXT: s_mov_b32 s40, s7
-; GFX7LESS-NEXT: s_mov_b32 s41, s6
-; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s46, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT: s_mov_b32 s12, s41
-; GFX7LESS-NEXT: s_mov_b32 s13, s40
-; GFX7LESS-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
-; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0
-; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41]
-; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8
-; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-NEXT: s_waitcnt expcnt(2)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT: s_mov_b32 s12, s41
-; GFX7LESS-NEXT: s_mov_b32 s13, s40
-; GFX7LESS-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43]
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s50, -1
-; GFX9-NEXT: s_mov_b32 s51, 0xe00000
-; GFX9-NEXT: s_add_u32 s48, s48, s9
-; GFX9-NEXT: s_addc_u32 s49, s49, 0
-; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX9-NEXT: s_mov_b32 s33, s8
-; GFX9-NEXT: s_add_u32 s8, s36, 44
-; GFX9-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX9-NEXT: s_mov_b32 s40, s7
-; GFX9-NEXT: s_mov_b32 s41, s6
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s9
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b32 s14, s8
+; GFX9-NEXT: s_add_u32 s8, s2, 44
+; GFX9-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_getpc_b64 s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT: s_mov_b32 s12, s41
-; GFX9-NEXT: s_mov_b32 s13, s40
-; GFX9-NEXT: s_mov_b32 s14, s33
-; GFX9-NEXT: v_mov_b32_e32 v31, v42
-; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT: s_movk_i32 s32, 0x800
-; GFX9-NEXT: v_mov_b32_e32 v43, 0
+; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT: v_mov_b32_e32 v41, v1
-; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX9-NEXT: v_mov_b32_e32 v40, v0
-; GFX9-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
-; GFX9-NEXT: s_add_u32 s8, s36, 44
-; GFX9-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT: s_mov_b32 s12, s41
-; GFX9-NEXT: s_mov_b32 s13, s40
-; GFX9-NEXT: s_mov_b32 s14, s33
-; GFX9-NEXT: v_mov_b32_e32 v31, v42
-; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s42
-; GFX9-NEXT: v_mov_b32_e32 v3, s43
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB17_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT: s_mov_b32 s50, -1
-; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT: s_add_u32 s48, s48, s9
-; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1064-NEXT: s_addc_u32 s49, s49, 0
-; GFX1064-NEXT: s_mov_b32 s33, s8
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_mov_b32 s40, s7
-; GFX1064-NEXT: s_mov_b32 s41, s6
-; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s38, -1
+; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-NEXT: s_mov_b32 s14, s8
+; GFX1064-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX1064-NEXT: s_mov_b32 s12, s41
-; GFX1064-NEXT: s_mov_b32 s13, s40
-; GFX1064-NEXT: s_mov_b32 s14, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT: v_mov_b32_e32 v31, v42
-; GFX1064-NEXT: s_movk_i32 s32, 0x800
-; GFX1064-NEXT: v_mov_b32_e32 v43, 0
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s6
+; GFX1064-NEXT: s_mov_b32 s13, s7
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT: s_mov_b32 s32, 0
+; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT: v_mov_b32_e32 v41, v1
-; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX1064-NEXT: v_mov_b32_e32 v40, v0
-; GFX1064-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT: v_mov_b32_e32 v31, v42
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, s42
-; GFX1064-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT: s_mov_b32 s12, s41
-; GFX1064-NEXT: s_mov_b32 s13, s40
-; GFX1064-NEXT: s_mov_b32 s14, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v3, s43
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB17_1
; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT: s_mov_b32 s50, -1
-; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1032-NEXT: s_addc_u32 s49, s49, 0
-; GFX1032-NEXT: s_mov_b32 s33, s8
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_mov_b32 s40, s7
-; GFX1032-NEXT: s_mov_b32 s41, s6
-; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s38, -1
+; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-NEXT: s_mov_b32 s14, s8
+; GFX1032-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX1032-NEXT: s_mov_b32 s12, s41
-; GFX1032-NEXT: s_mov_b32 s13, s40
-; GFX1032-NEXT: s_mov_b32 s14, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT: v_mov_b32_e32 v31, v42
-; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: v_mov_b32_e32 v43, 0
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s6
+; GFX1032-NEXT: s_mov_b32 s13, s7
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT: s_mov_b32 s32, 0
+; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT: v_mov_b32_e32 v41, v1
-; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX1032-NEXT: v_mov_b32_e32 v40, v0
-; GFX1032-NEXT: s_mov_b32 s44, 0
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1032-NEXT: s_mov_b32 s0, 0
; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT: v_mov_b32_e32 v31, v42
-; GFX1032-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s42
-; GFX1032-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT: s_mov_b32 s12, s41
-; GFX1032-NEXT: s_mov_b32 s13, s40
-; GFX1032-NEXT: s_mov_b32 s14, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT: v_mov_b32_e32 v3, s43
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1032-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_cbranch_execnz .LBB17_1
; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1164-NEXT: s_mov_b32 s33, s8
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-NEXT: s_mov_b32 s14, s8
+; GFX1164-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
-; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
-; GFX1164-NEXT: s_mov_b32 s14, s33
-; GFX1164-NEXT: s_mov_b32 s32, 32
-; GFX1164-NEXT: v_mov_b32_e32 v42, v0
-; GFX1164-NEXT: s_mov_b32 s40, s7
-; GFX1164-NEXT: s_mov_b32 s41, s6
-; GFX1164-NEXT: v_mov_b32_e32 v43, 0
+; GFX1164-NEXT: s_mov_b32 s32, 0
+; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: v_mov_b32_e32 v41, v1
-; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43]
-; GFX1164-NEXT: v_mov_b32_e32 v40, v0
-; GFX1164-NEXT: s_mov_b64 s[44:45], 0
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-NEXT: v_mov_b32_e32 v31, v42
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT: s_mov_b32 s12, s41
-; GFX1164-NEXT: s_mov_b32 s13, s40
-; GFX1164-NEXT: s_mov_b32 s14, s33
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s42
-; GFX1164-NEXT: v_mov_b32_e32 v3, s43
-; GFX1164-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB17_1
; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1132-NEXT: v_mov_b32_e32 v31, v0
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1132-NEXT: s_mov_b32 s40, s14
-; GFX1132-NEXT: s_mov_b32 s41, s13
-; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
-; GFX1132-NEXT: s_mov_b32 s32, 32
-; GFX1132-NEXT: s_mov_b32 s33, s15
-; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0
+; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
-; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43]
-; GFX1132-NEXT: s_mov_b32 s44, 0
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1132-NEXT: s_mov_b32 s0, 0
; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1132-NEXT: s_mov_b32 s12, s41
-; GFX1132-NEXT: s_mov_b32 s13, s40
-; GFX1132-NEXT: s_mov_b32 s14, s33
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1132-NEXT: s_cbranch_execnz .LBB17_1
; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT: s_mov_b32 s50, -1
-; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000
-; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX9-DPP-NEXT: s_mov_b32 s33, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_mov_b32 s40, s7
-; GFX9-DPP-NEXT: s_mov_b32 s41, s6
-; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s38, -1
+; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b32 s14, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT: s_mov_b32 s12, s41
-; GFX9-DPP-NEXT: s_mov_b32 s13, s40
-; GFX9-DPP-NEXT: s_mov_b32 s14, s33
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b32 s12, s6
+; GFX9-DPP-NEXT: s_mov_b32 s13, s7
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b32 s32, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1
-; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
-; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT: s_mov_b32 s12, s41
-; GFX9-DPP-NEXT: s_mov_b32 s13, s40
-; GFX9-DPP-NEXT: s_mov_b32 s14, s33
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1
; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1064-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT: s_clause 0x1
-; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1
; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1032-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT: s_clause 0x1
-; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1
; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1164-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
-; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0
-; GFX1164-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1164-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1
-; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1164-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1
; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1132-DPP-NEXT: s_mov_b32 s40, s14
-; GFX1132-DPP-NEXT: s_mov_b32 s41, s13
-; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
-; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
-; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43]
-; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1132-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1132-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1
; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.float.value() strictfp
- %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue monotonic, align 4
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue monotonic, align 8
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 98c09dfaa2d5..6548792180a0 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -3554,1550 +3554,859 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX7LESS: ; %bb.0:
-; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s3
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3
; GFX7LESS-NEXT: ; %bb.1:
-; GFX7LESS-NEXT: s_mov_b32 s33, s2
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0
-; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], 4.0
-; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8
-; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT: s_mov_b32 s12, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2
; GFX7LESS-NEXT: .LBB6_3:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s42, -1
-; GFX9-NEXT: s_mov_b32 s43, 0xe00000
-; GFX9-NEXT: v_mov_b32_e32 v40, v0
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: s_add_u32 s40, s40, s3
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: s_addc_u32 s41, s41, 0
-; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_movk_i32 s32, 0x800
-; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB6_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX9-NEXT: s_mov_b32 s33, s2
-; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX9-NEXT: s_add_u32 s8, s34, 44
-; GFX9-NEXT: s_addc_u32 s9, s35, 0
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX9-NEXT: s_mov_b32 s12, s33
-; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s36
-; GFX9-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX9-NEXT: v_mov_b32_e32 v3, s37
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB6_2
; GFX9-NEXT: .LBB6_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: v_mov_b32_e32 v40, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT: s_mov_b32 s42, -1
-; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: s_add_u32 s40, s40, s3
-; GFX1064-NEXT: s_addc_u32 s41, s41, 0
-; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB6_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1064-NEXT: s_mov_b32 s33, s2
-; GFX1064-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s1
-; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1064-NEXT: s_mov_b32 s12, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, s36
-; GFX1064-NEXT: v_mov_b32_e32 v3, s37
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_cbranch_execnz .LBB6_2
; GFX1064-NEXT: .LBB6_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: v_mov_b32_e32 v40, v0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT: s_mov_b32 s42, -1
-; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_add_u32 s40, s40, s3
-; GFX1032-NEXT: s_addc_u32 s41, s41, 0
-; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1032-NEXT: s_mov_b32 s38, 0
-; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1032-NEXT: s_mov_b32 s33, s2
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s1
-; GFX1032-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-NEXT: v_mov_b32_e32 v3, s5
; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1032-NEXT: s_mov_b32 s12, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s36
-; GFX1032-NEXT: v_mov_b32_e32 v3, s37
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_cbranch_execnz .LBB6_2
; GFX1032-NEXT: .LBB6_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: v_mov_b32_e32 v40, v0
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1164-NEXT: s_mov_b32 s32, 32
-; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB6_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1164-NEXT: s_mov_b32 s33, s2
-; GFX1164-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s1
-; GFX1164-NEXT: v_mov_b32_e32 v1, s0
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-NEXT: s_mov_b32 s12, s33
-; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s36
-; GFX1164-NEXT: v_mov_b32_e32 v3, s37
-; GFX1164-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_cbranch_execnz .LBB6_2
; GFX1164-NEXT: .LBB6_3:
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: v_mov_b32_e32 v40, v0
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1132-NEXT: s_mov_b32 s38, 0
-; GFX1132-NEXT: s_mov_b32 s32, 32
-; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB6_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-NEXT: s_mov_b32 s12, s33
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_cbranch_execnz .LBB6_2
; GFX1132-NEXT: .LBB6_3:
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT: s_mov_b32 s42, -1
-; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX9-DPP-NEXT: s_mov_b32 s33, s2
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT: s_mov_b32 s12, s33
-; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX9-DPP-NEXT: .LBB6_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT: s_mov_b32 s42, -1
-; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3
-; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1064-DPP-NEXT: s_mov_b32 s33, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1064-DPP-NEXT: s_clause 0x1
-; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1064-DPP-NEXT: .LBB6_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT: s_mov_b32 s42, -1
-; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3
-; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s38, 0
-; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1032-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1032-DPP-NEXT: s_clause 0x1
-; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1032-DPP-NEXT: .LBB6_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1164-DPP-NEXT: s_mov_b32 s33, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1164-DPP-NEXT: .LBB6_3:
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s38, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1132-DPP-NEXT: .LBB6_3:
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-DPP-NEXT: s_endpgm
- %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 8
ret void
}
define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
; GFX7LESS: ; %bb.0:
-; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s50, -1
-; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s48, s48, s9
-; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT: s_mov_b32 s33, s8
-; GFX7LESS-NEXT: s_mov_b32 s40, s7
-; GFX7LESS-NEXT: s_mov_b32 s41, s6
-; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s46, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT: s_mov_b32 s12, s41
-; GFX7LESS-NEXT: s_mov_b32 s13, s40
-; GFX7LESS-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0
-; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0
-; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT: s_mov_b32 s12, s41
-; GFX7LESS-NEXT: s_mov_b32 s13, s40
-; GFX7LESS-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43]
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s50, -1
-; GFX9-NEXT: s_mov_b32 s51, 0xe00000
-; GFX9-NEXT: s_add_u32 s48, s48, s9
-; GFX9-NEXT: s_addc_u32 s49, s49, 0
-; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX9-NEXT: s_mov_b32 s33, s8
-; GFX9-NEXT: s_add_u32 s8, s36, 44
-; GFX9-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX9-NEXT: s_mov_b32 s40, s7
-; GFX9-NEXT: s_mov_b32 s41, s6
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s9
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b32 s14, s8
+; GFX9-NEXT: s_add_u32 s8, s2, 44
+; GFX9-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_getpc_b64 s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT: s_mov_b32 s12, s41
-; GFX9-NEXT: s_mov_b32 s13, s40
-; GFX9-NEXT: s_mov_b32 s14, s33
-; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT: s_movk_i32 s32, 0x800
-; GFX9-NEXT: v_mov_b32_e32 v41, 0
+; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX9-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX9-NEXT: s_add_u32 s8, s36, 44
-; GFX9-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT: s_mov_b32 s12, s41
-; GFX9-NEXT: s_mov_b32 s13, s40
-; GFX9-NEXT: s_mov_b32 s14, s33
-; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT: v_mov_b32_e32 v2, s42
-; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v3, s43
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT: s_mov_b32 s50, -1
-; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT: s_add_u32 s48, s48, s9
-; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1064-NEXT: s_addc_u32 s49, s49, 0
-; GFX1064-NEXT: s_mov_b32 s33, s8
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_mov_b32 s40, s7
-; GFX1064-NEXT: s_mov_b32 s41, s6
-; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s38, -1
+; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-NEXT: s_mov_b32 s14, s8
+; GFX1064-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT: s_mov_b32 s12, s41
-; GFX1064-NEXT: s_mov_b32 s13, s40
-; GFX1064-NEXT: s_mov_b32 s14, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-NEXT: s_movk_i32 s32, 0x800
-; GFX1064-NEXT: v_mov_b32_e32 v41, 0
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s6
+; GFX1064-NEXT: s_mov_b32 s13, s7
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT: s_mov_b32 s32, 0
+; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1064-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-NEXT: v_mov_b32_e32 v2, s42
-; GFX1064-NEXT: v_mov_b32_e32 v3, s43
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT: s_mov_b32 s12, s41
-; GFX1064-NEXT: s_mov_b32 s13, s40
-; GFX1064-NEXT: s_mov_b32 s14, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB7_1
; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT: s_mov_b32 s50, -1
-; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1032-NEXT: s_addc_u32 s49, s49, 0
-; GFX1032-NEXT: s_mov_b32 s33, s8
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_mov_b32 s40, s7
-; GFX1032-NEXT: s_mov_b32 s41, s6
-; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s38, -1
+; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-NEXT: s_mov_b32 s14, s8
+; GFX1032-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT: s_mov_b32 s12, s41
-; GFX1032-NEXT: s_mov_b32 s13, s40
-; GFX1032-NEXT: s_mov_b32 s14, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: v_mov_b32_e32 v41, 0
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s6
+; GFX1032-NEXT: s_mov_b32 s13, s7
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT: s_mov_b32 s32, 0
+; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1032-NEXT: s_mov_b32 s44, 0
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1032-NEXT: s_mov_b32 s0, 0
; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-NEXT: v_mov_b32_e32 v2, s42
-; GFX1032-NEXT: v_mov_b32_e32 v3, s43
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT: s_mov_b32 s12, s41
-; GFX1032-NEXT: s_mov_b32 s13, s40
-; GFX1032-NEXT: s_mov_b32 s14, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX1032-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_cbranch_execnz .LBB7_1
; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1164-NEXT: s_mov_b32 s33, s8
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-NEXT: s_mov_b32 s14, s8
+; GFX1164-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
-; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
-; GFX1164-NEXT: s_mov_b32 s14, s33
-; GFX1164-NEXT: s_mov_b32 s32, 32
-; GFX1164-NEXT: v_mov_b32_e32 v40, v0
-; GFX1164-NEXT: s_mov_b32 s40, s7
-; GFX1164-NEXT: s_mov_b32 s41, s6
-; GFX1164-NEXT: v_mov_b32_e32 v41, 0
+; GFX1164-NEXT: s_mov_b32 s32, 0
+; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43]
-; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1164-NEXT: s_mov_b64 s[44:45], 0
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT: s_mov_b32 s12, s41
-; GFX1164-NEXT: s_mov_b32 s13, s40
-; GFX1164-NEXT: s_mov_b32 s14, s33
-; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off
-; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8
-; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s42
-; GFX1164-NEXT: v_mov_b32_e32 v3, s43
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off
-; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB7_1
; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1132-NEXT: v_mov_b32_e32 v31, v0
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1132-NEXT: s_mov_b32 s40, s14
-; GFX1132-NEXT: s_mov_b32 s41, s13
-; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
-; GFX1132-NEXT: s_mov_b32 s32, 32
-; GFX1132-NEXT: s_mov_b32 s33, s15
-; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0
+; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43]
-; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1132-NEXT: s_mov_b32 s44, 0
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1132-NEXT: s_mov_b32 s0, 0
; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: v_mov_b32_e32 v31, v40
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1132-NEXT: s_mov_b32 s12, s41
-; GFX1132-NEXT: s_mov_b32 s13, s40
-; GFX1132-NEXT: s_mov_b32 s14, s33
-; GFX1132-NEXT: v_mov_b32_e32 v4, 0
-; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off
-; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8
-; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
-; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off
-; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1132-NEXT: s_cbranch_execnz .LBB7_1
; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT: s_mov_b32 s50, -1
-; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000
-; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX9-DPP-NEXT: s_mov_b32 s33, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_mov_b32 s40, s7
-; GFX9-DPP-NEXT: s_mov_b32 s41, s6
-; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s38, -1
+; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b32 s14, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT: s_mov_b32 s12, s41
-; GFX9-DPP-NEXT: s_mov_b32 s13, s40
-; GFX9-DPP-NEXT: s_mov_b32 s14, s33
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b32 s12, s6
+; GFX9-DPP-NEXT: s_mov_b32 s13, s7
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b32 s32, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT: s_mov_b32 s12, s41
-; GFX9-DPP-NEXT: s_mov_b32 s13, s40
-; GFX9-DPP-NEXT: s_mov_b32 s14, s33
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1
; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1064-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT: s_clause 0x1
-; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1
; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1032-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT: s_clause 0x1
-; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1
; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1164-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
-; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1164-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1164-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43]
-; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1164-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off
-; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1
; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1132-DPP-NEXT: s_mov_b32 s40, s14
-; GFX1132-DPP-NEXT: s_mov_b32 s41, s13
-; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
-; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43]
-; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1132-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1132-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off
-; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1
; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
- %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8
ret void
}
@@ -5963,1550 +5272,859 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 {
; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX7LESS: ; %bb.0:
-; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s3
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3
; GFX7LESS-NEXT: ; %bb.1:
-; GFX7LESS-NEXT: s_mov_b32 s33, s2
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0
-; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], 4.0
-; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8
-; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT: s_mov_b32 s12, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2
; GFX7LESS-NEXT: .LBB10_3:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s42, -1
-; GFX9-NEXT: s_mov_b32 s43, 0xe00000
-; GFX9-NEXT: v_mov_b32_e32 v40, v0
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: s_add_u32 s40, s40, s3
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: s_addc_u32 s41, s41, 0
-; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_movk_i32 s32, 0x800
-; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB10_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX9-NEXT: s_mov_b32 s33, s2
-; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX9-NEXT: s_add_u32 s8, s34, 44
-; GFX9-NEXT: s_addc_u32 s9, s35, 0
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX9-NEXT: s_mov_b32 s12, s33
-; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s36
-; GFX9-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX9-NEXT: v_mov_b32_e32 v3, s37
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB10_2
; GFX9-NEXT: .LBB10_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: v_mov_b32_e32 v40, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT: s_mov_b32 s42, -1
-; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: s_add_u32 s40, s40, s3
-; GFX1064-NEXT: s_addc_u32 s41, s41, 0
-; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB10_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1064-NEXT: s_mov_b32 s33, s2
-; GFX1064-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s1
-; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1064-NEXT: s_mov_b32 s12, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, s36
-; GFX1064-NEXT: v_mov_b32_e32 v3, s37
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_cbranch_execnz .LBB10_2
; GFX1064-NEXT: .LBB10_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: v_mov_b32_e32 v40, v0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT: s_mov_b32 s42, -1
-; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_add_u32 s40, s40, s3
-; GFX1032-NEXT: s_addc_u32 s41, s41, 0
-; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1032-NEXT: s_mov_b32 s38, 0
-; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1032-NEXT: s_mov_b32 s33, s2
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s1
-; GFX1032-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-NEXT: v_mov_b32_e32 v3, s5
; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1032-NEXT: s_mov_b32 s12, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s36
-; GFX1032-NEXT: v_mov_b32_e32 v3, s37
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_cbranch_execnz .LBB10_2
; GFX1032-NEXT: .LBB10_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: v_mov_b32_e32 v40, v0
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1164-NEXT: s_mov_b32 s32, 32
-; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB10_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1164-NEXT: s_mov_b32 s33, s2
-; GFX1164-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s1
-; GFX1164-NEXT: v_mov_b32_e32 v1, s0
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-NEXT: s_mov_b32 s12, s33
-; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s36
-; GFX1164-NEXT: v_mov_b32_e32 v3, s37
-; GFX1164-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_cbranch_execnz .LBB10_2
; GFX1164-NEXT: .LBB10_3:
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: v_mov_b32_e32 v40, v0
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1132-NEXT: s_mov_b32 s38, 0
-; GFX1132-NEXT: s_mov_b32 s32, 32
-; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB10_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-NEXT: s_mov_b32 s12, s33
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_cbranch_execnz .LBB10_2
; GFX1132-NEXT: .LBB10_3:
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT: s_mov_b32 s42, -1
-; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX9-DPP-NEXT: s_mov_b32 s33, s2
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT: s_mov_b32 s12, s33
-; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX9-DPP-NEXT: .LBB10_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT: s_mov_b32 s42, -1
-; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3
-; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1064-DPP-NEXT: s_mov_b32 s33, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1064-DPP-NEXT: s_clause 0x1
-; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1064-DPP-NEXT: .LBB10_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT: s_mov_b32 s42, -1
-; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3
-; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s38, 0
-; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1032-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1032-DPP-NEXT: s_clause 0x1
-; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1032-DPP-NEXT: .LBB10_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1164-DPP-NEXT: s_mov_b32 s33, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1164-DPP-NEXT: .LBB10_3:
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s38, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1132-DPP-NEXT: .LBB10_3:
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-DPP-NEXT: s_endpgm
- %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 monotonic, align 4
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 monotonic, align 8
ret void
}
define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 {
; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
; GFX7LESS: ; %bb.0:
-; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s50, -1
-; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s48, s48, s9
-; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT: s_mov_b32 s33, s8
-; GFX7LESS-NEXT: s_mov_b32 s40, s7
-; GFX7LESS-NEXT: s_mov_b32 s41, s6
-; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s46, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT: s_mov_b32 s12, s41
-; GFX7LESS-NEXT: s_mov_b32 s13, s40
-; GFX7LESS-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0
-; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0
-; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX7LESS-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT: s_mov_b32 s12, s41
-; GFX7LESS-NEXT: s_mov_b32 s13, s40
-; GFX7LESS-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43]
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s50, -1
-; GFX9-NEXT: s_mov_b32 s51, 0xe00000
-; GFX9-NEXT: s_add_u32 s48, s48, s9
-; GFX9-NEXT: s_addc_u32 s49, s49, 0
-; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX9-NEXT: s_mov_b32 s33, s8
-; GFX9-NEXT: s_add_u32 s8, s36, 44
-; GFX9-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX9-NEXT: s_mov_b32 s40, s7
-; GFX9-NEXT: s_mov_b32 s41, s6
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s9
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b32 s14, s8
+; GFX9-NEXT: s_add_u32 s8, s2, 44
+; GFX9-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_getpc_b64 s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT: s_mov_b32 s12, s41
-; GFX9-NEXT: s_mov_b32 s13, s40
-; GFX9-NEXT: s_mov_b32 s14, s33
-; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT: s_movk_i32 s32, 0x800
-; GFX9-NEXT: v_mov_b32_e32 v41, 0
+; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX9-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX9-NEXT: s_add_u32 s8, s36, 44
-; GFX9-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT: s_mov_b32 s12, s41
-; GFX9-NEXT: s_mov_b32 s13, s40
-; GFX9-NEXT: s_mov_b32 s14, s33
-; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT: v_mov_b32_e32 v2, s42
-; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v3, s43
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB11_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT: s_mov_b32 s50, -1
-; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT: s_add_u32 s48, s48, s9
-; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1064-NEXT: s_addc_u32 s49, s49, 0
-; GFX1064-NEXT: s_mov_b32 s33, s8
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_mov_b32 s40, s7
-; GFX1064-NEXT: s_mov_b32 s41, s6
-; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s38, -1
+; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-NEXT: s_mov_b32 s14, s8
+; GFX1064-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT: s_mov_b32 s12, s41
-; GFX1064-NEXT: s_mov_b32 s13, s40
-; GFX1064-NEXT: s_mov_b32 s14, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-NEXT: s_movk_i32 s32, 0x800
-; GFX1064-NEXT: v_mov_b32_e32 v41, 0
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s6
+; GFX1064-NEXT: s_mov_b32 s13, s7
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT: s_mov_b32 s32, 0
+; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1064-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-NEXT: v_mov_b32_e32 v2, s42
-; GFX1064-NEXT: v_mov_b32_e32 v3, s43
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT: s_mov_b32 s12, s41
-; GFX1064-NEXT: s_mov_b32 s13, s40
-; GFX1064-NEXT: s_mov_b32 s14, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB11_1
; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT: s_mov_b32 s50, -1
-; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1032-NEXT: s_addc_u32 s49, s49, 0
-; GFX1032-NEXT: s_mov_b32 s33, s8
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_mov_b32 s40, s7
-; GFX1032-NEXT: s_mov_b32 s41, s6
-; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s38, -1
+; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-NEXT: s_mov_b32 s14, s8
+; GFX1032-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT: s_mov_b32 s12, s41
-; GFX1032-NEXT: s_mov_b32 s13, s40
-; GFX1032-NEXT: s_mov_b32 s14, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: v_mov_b32_e32 v41, 0
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s6
+; GFX1032-NEXT: s_mov_b32 s13, s7
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT: s_mov_b32 s32, 0
+; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1032-NEXT: s_mov_b32 s44, 0
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1032-NEXT: s_mov_b32 s0, 0
; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-NEXT: v_mov_b32_e32 v2, s42
-; GFX1032-NEXT: v_mov_b32_e32 v3, s43
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT: s_mov_b32 s12, s41
-; GFX1032-NEXT: s_mov_b32 s13, s40
-; GFX1032-NEXT: s_mov_b32 s14, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX1032-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_cbranch_execnz .LBB11_1
; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1164-NEXT: s_mov_b32 s33, s8
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-NEXT: s_mov_b32 s14, s8
+; GFX1164-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
-; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
-; GFX1164-NEXT: s_mov_b32 s14, s33
-; GFX1164-NEXT: s_mov_b32 s32, 32
-; GFX1164-NEXT: v_mov_b32_e32 v40, v0
-; GFX1164-NEXT: s_mov_b32 s40, s7
-; GFX1164-NEXT: s_mov_b32 s41, s6
-; GFX1164-NEXT: v_mov_b32_e32 v41, 0
+; GFX1164-NEXT: s_mov_b32 s32, 0
+; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43]
-; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1164-NEXT: s_mov_b64 s[44:45], 0
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT: s_mov_b32 s12, s41
-; GFX1164-NEXT: s_mov_b32 s13, s40
-; GFX1164-NEXT: s_mov_b32 s14, s33
-; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off
-; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8
-; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s42
-; GFX1164-NEXT: v_mov_b32_e32 v3, s43
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off
-; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB11_1
; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1132-NEXT: v_mov_b32_e32 v31, v0
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1132-NEXT: s_mov_b32 s40, s14
-; GFX1132-NEXT: s_mov_b32 s41, s13
-; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
-; GFX1132-NEXT: s_mov_b32 s32, 32
-; GFX1132-NEXT: s_mov_b32 s33, s15
-; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0
+; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43]
-; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1132-NEXT: s_mov_b32 s44, 0
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1132-NEXT: s_mov_b32 s0, 0
; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: v_mov_b32_e32 v31, v40
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1132-NEXT: s_mov_b32 s12, s41
-; GFX1132-NEXT: s_mov_b32 s13, s40
-; GFX1132-NEXT: s_mov_b32 s14, s33
-; GFX1132-NEXT: v_mov_b32_e32 v4, 0
-; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off
-; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8
-; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
-; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off
-; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1132-NEXT: s_cbranch_execnz .LBB11_1
; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT: s_mov_b32 s50, -1
-; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000
-; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX9-DPP-NEXT: s_mov_b32 s33, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_mov_b32 s40, s7
-; GFX9-DPP-NEXT: s_mov_b32 s41, s6
-; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s38, -1
+; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b32 s14, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT: s_mov_b32 s12, s41
-; GFX9-DPP-NEXT: s_mov_b32 s13, s40
-; GFX9-DPP-NEXT: s_mov_b32 s14, s33
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b32 s12, s6
+; GFX9-DPP-NEXT: s_mov_b32 s13, s7
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b32 s32, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT: s_mov_b32 s12, s41
-; GFX9-DPP-NEXT: s_mov_b32 s13, s40
-; GFX9-DPP-NEXT: s_mov_b32 s14, s33
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1
; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1064-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT: s_clause 0x1
-; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1
; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1032-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT: s_clause 0x1
-; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1
; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1164-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
-; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1164-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1164-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43]
-; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1164-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off
-; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1
; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1132-DPP-NEXT: s_mov_b32 s40, s14
-; GFX1132-DPP-NEXT: s_mov_b32 s41, s13
-; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
-; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43]
-; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1132-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1132-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off
-; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1
; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
- %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue monotonic, align 4
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue monotonic, align 8
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 1fb0db0e1f0d..6936cdc4d379 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -3554,1550 +3554,859 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX7LESS: ; %bb.0:
-; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s3
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3
; GFX7LESS-NEXT: ; %bb.1:
-; GFX7LESS-NEXT: s_mov_b32 s33, s2
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0
-; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], 4.0
-; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8
-; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT: s_mov_b32 s12, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2
; GFX7LESS-NEXT: .LBB6_3:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s42, -1
-; GFX9-NEXT: s_mov_b32 s43, 0xe00000
-; GFX9-NEXT: v_mov_b32_e32 v40, v0
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: s_add_u32 s40, s40, s3
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: s_addc_u32 s41, s41, 0
-; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_movk_i32 s32, 0x800
-; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB6_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX9-NEXT: s_mov_b32 s33, s2
-; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX9-NEXT: s_add_u32 s8, s34, 44
-; GFX9-NEXT: s_addc_u32 s9, s35, 0
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX9-NEXT: s_mov_b32 s12, s33
-; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s36
-; GFX9-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX9-NEXT: v_mov_b32_e32 v3, s37
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB6_2
; GFX9-NEXT: .LBB6_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: v_mov_b32_e32 v40, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT: s_mov_b32 s42, -1
-; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: s_add_u32 s40, s40, s3
-; GFX1064-NEXT: s_addc_u32 s41, s41, 0
-; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB6_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1064-NEXT: s_mov_b32 s33, s2
-; GFX1064-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s1
-; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1064-NEXT: s_mov_b32 s12, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, s36
-; GFX1064-NEXT: v_mov_b32_e32 v3, s37
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_cbranch_execnz .LBB6_2
; GFX1064-NEXT: .LBB6_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: v_mov_b32_e32 v40, v0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT: s_mov_b32 s42, -1
-; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_add_u32 s40, s40, s3
-; GFX1032-NEXT: s_addc_u32 s41, s41, 0
-; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1032-NEXT: s_mov_b32 s38, 0
-; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1032-NEXT: s_mov_b32 s33, s2
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s1
-; GFX1032-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-NEXT: v_mov_b32_e32 v3, s5
; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1032-NEXT: s_mov_b32 s12, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s36
-; GFX1032-NEXT: v_mov_b32_e32 v3, s37
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_cbranch_execnz .LBB6_2
; GFX1032-NEXT: .LBB6_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: v_mov_b32_e32 v40, v0
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1164-NEXT: s_mov_b32 s32, 32
-; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB6_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1164-NEXT: s_mov_b32 s33, s2
-; GFX1164-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s1
-; GFX1164-NEXT: v_mov_b32_e32 v1, s0
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-NEXT: s_mov_b32 s12, s33
-; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s36
-; GFX1164-NEXT: v_mov_b32_e32 v3, s37
-; GFX1164-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_cbranch_execnz .LBB6_2
; GFX1164-NEXT: .LBB6_3:
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: v_mov_b32_e32 v40, v0
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1132-NEXT: s_mov_b32 s38, 0
-; GFX1132-NEXT: s_mov_b32 s32, 32
-; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB6_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-NEXT: s_mov_b32 s12, s33
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_cbranch_execnz .LBB6_2
; GFX1132-NEXT: .LBB6_3:
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT: s_mov_b32 s42, -1
-; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX9-DPP-NEXT: s_mov_b32 s33, s2
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT: s_mov_b32 s12, s33
-; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX9-DPP-NEXT: .LBB6_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT: s_mov_b32 s42, -1
-; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3
-; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1064-DPP-NEXT: s_mov_b32 s33, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1064-DPP-NEXT: s_clause 0x1
-; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1064-DPP-NEXT: .LBB6_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT: s_mov_b32 s42, -1
-; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3
-; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s38, 0
-; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1032-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1032-DPP-NEXT: s_clause 0x1
-; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1032-DPP-NEXT: .LBB6_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1164-DPP-NEXT: s_mov_b32 s33, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1164-DPP-NEXT: .LBB6_3:
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s38, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1132-DPP-NEXT: .LBB6_3:
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-DPP-NEXT: s_endpgm
- %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 8
ret void
}
define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
; GFX7LESS: ; %bb.0:
-; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s50, -1
-; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s48, s48, s9
-; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT: s_mov_b32 s33, s8
-; GFX7LESS-NEXT: s_mov_b32 s40, s7
-; GFX7LESS-NEXT: s_mov_b32 s41, s6
-; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s46, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT: s_mov_b32 s12, s41
-; GFX7LESS-NEXT: s_mov_b32 s13, s40
-; GFX7LESS-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0
-; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0
-; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT: s_mov_b32 s12, s41
-; GFX7LESS-NEXT: s_mov_b32 s13, s40
-; GFX7LESS-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43]
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s50, -1
-; GFX9-NEXT: s_mov_b32 s51, 0xe00000
-; GFX9-NEXT: s_add_u32 s48, s48, s9
-; GFX9-NEXT: s_addc_u32 s49, s49, 0
-; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX9-NEXT: s_mov_b32 s33, s8
-; GFX9-NEXT: s_add_u32 s8, s36, 44
-; GFX9-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX9-NEXT: s_mov_b32 s40, s7
-; GFX9-NEXT: s_mov_b32 s41, s6
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s9
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b32 s14, s8
+; GFX9-NEXT: s_add_u32 s8, s2, 44
+; GFX9-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_getpc_b64 s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT: s_mov_b32 s12, s41
-; GFX9-NEXT: s_mov_b32 s13, s40
-; GFX9-NEXT: s_mov_b32 s14, s33
-; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT: s_movk_i32 s32, 0x800
-; GFX9-NEXT: v_mov_b32_e32 v41, 0
+; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX9-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX9-NEXT: s_add_u32 s8, s36, 44
-; GFX9-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT: s_mov_b32 s12, s41
-; GFX9-NEXT: s_mov_b32 s13, s40
-; GFX9-NEXT: s_mov_b32 s14, s33
-; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT: v_mov_b32_e32 v2, s42
-; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v3, s43
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT: s_mov_b32 s50, -1
-; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT: s_add_u32 s48, s48, s9
-; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1064-NEXT: s_addc_u32 s49, s49, 0
-; GFX1064-NEXT: s_mov_b32 s33, s8
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_mov_b32 s40, s7
-; GFX1064-NEXT: s_mov_b32 s41, s6
-; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s38, -1
+; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-NEXT: s_mov_b32 s14, s8
+; GFX1064-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT: s_mov_b32 s12, s41
-; GFX1064-NEXT: s_mov_b32 s13, s40
-; GFX1064-NEXT: s_mov_b32 s14, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-NEXT: s_movk_i32 s32, 0x800
-; GFX1064-NEXT: v_mov_b32_e32 v41, 0
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s6
+; GFX1064-NEXT: s_mov_b32 s13, s7
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT: s_mov_b32 s32, 0
+; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1064-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-NEXT: v_mov_b32_e32 v2, s42
-; GFX1064-NEXT: v_mov_b32_e32 v3, s43
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT: s_mov_b32 s12, s41
-; GFX1064-NEXT: s_mov_b32 s13, s40
-; GFX1064-NEXT: s_mov_b32 s14, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB7_1
; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT: s_mov_b32 s50, -1
-; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1032-NEXT: s_addc_u32 s49, s49, 0
-; GFX1032-NEXT: s_mov_b32 s33, s8
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_mov_b32 s40, s7
-; GFX1032-NEXT: s_mov_b32 s41, s6
-; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s38, -1
+; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-NEXT: s_mov_b32 s14, s8
+; GFX1032-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT: s_mov_b32 s12, s41
-; GFX1032-NEXT: s_mov_b32 s13, s40
-; GFX1032-NEXT: s_mov_b32 s14, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: v_mov_b32_e32 v41, 0
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s6
+; GFX1032-NEXT: s_mov_b32 s13, s7
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT: s_mov_b32 s32, 0
+; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1032-NEXT: s_mov_b32 s44, 0
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1032-NEXT: s_mov_b32 s0, 0
; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-NEXT: v_mov_b32_e32 v2, s42
-; GFX1032-NEXT: v_mov_b32_e32 v3, s43
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT: s_mov_b32 s12, s41
-; GFX1032-NEXT: s_mov_b32 s13, s40
-; GFX1032-NEXT: s_mov_b32 s14, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX1032-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_cbranch_execnz .LBB7_1
; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1164-NEXT: s_mov_b32 s33, s8
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-NEXT: s_mov_b32 s14, s8
+; GFX1164-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
-; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
-; GFX1164-NEXT: s_mov_b32 s14, s33
-; GFX1164-NEXT: s_mov_b32 s32, 32
-; GFX1164-NEXT: v_mov_b32_e32 v40, v0
-; GFX1164-NEXT: s_mov_b32 s40, s7
-; GFX1164-NEXT: s_mov_b32 s41, s6
-; GFX1164-NEXT: v_mov_b32_e32 v41, 0
+; GFX1164-NEXT: s_mov_b32 s32, 0
+; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43]
-; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1164-NEXT: s_mov_b64 s[44:45], 0
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT: s_mov_b32 s12, s41
-; GFX1164-NEXT: s_mov_b32 s13, s40
-; GFX1164-NEXT: s_mov_b32 s14, s33
-; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off
-; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8
-; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s42
-; GFX1164-NEXT: v_mov_b32_e32 v3, s43
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off
-; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB7_1
; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1132-NEXT: v_mov_b32_e32 v31, v0
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1132-NEXT: s_mov_b32 s40, s14
-; GFX1132-NEXT: s_mov_b32 s41, s13
-; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
-; GFX1132-NEXT: s_mov_b32 s32, 32
-; GFX1132-NEXT: s_mov_b32 s33, s15
-; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0
+; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43]
-; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1132-NEXT: s_mov_b32 s44, 0
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1132-NEXT: s_mov_b32 s0, 0
; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: v_mov_b32_e32 v31, v40
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1132-NEXT: s_mov_b32 s12, s41
-; GFX1132-NEXT: s_mov_b32 s13, s40
-; GFX1132-NEXT: s_mov_b32 s14, s33
-; GFX1132-NEXT: v_mov_b32_e32 v4, 0
-; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off
-; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8
-; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
-; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off
-; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1132-NEXT: s_cbranch_execnz .LBB7_1
; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT: s_mov_b32 s50, -1
-; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000
-; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX9-DPP-NEXT: s_mov_b32 s33, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_mov_b32 s40, s7
-; GFX9-DPP-NEXT: s_mov_b32 s41, s6
-; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s38, -1
+; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b32 s14, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT: s_mov_b32 s12, s41
-; GFX9-DPP-NEXT: s_mov_b32 s13, s40
-; GFX9-DPP-NEXT: s_mov_b32 s14, s33
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b32 s12, s6
+; GFX9-DPP-NEXT: s_mov_b32 s13, s7
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b32 s32, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT: s_mov_b32 s12, s41
-; GFX9-DPP-NEXT: s_mov_b32 s13, s40
-; GFX9-DPP-NEXT: s_mov_b32 s14, s33
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1
; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1064-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT: s_clause 0x1
-; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1
; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1032-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT: s_clause 0x1
-; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1
; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1164-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
-; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1164-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1164-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43]
-; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1164-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off
-; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1
; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1132-DPP-NEXT: s_mov_b32 s40, s14
-; GFX1132-DPP-NEXT: s_mov_b32 s41, s13
-; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
-; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43]
-; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1132-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1132-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off
-; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1
; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
- %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8
ret void
}
@@ -5963,1550 +5272,859 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 {
; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX7LESS: ; %bb.0:
-; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s3
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3
; GFX7LESS-NEXT: ; %bb.1:
-; GFX7LESS-NEXT: s_mov_b32 s33, s2
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0
-; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], 4.0
-; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8
-; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT: s_mov_b32 s12, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2
; GFX7LESS-NEXT: .LBB10_3:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s42, -1
-; GFX9-NEXT: s_mov_b32 s43, 0xe00000
-; GFX9-NEXT: v_mov_b32_e32 v40, v0
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: s_add_u32 s40, s40, s3
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: s_addc_u32 s41, s41, 0
-; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_movk_i32 s32, 0x800
-; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB10_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX9-NEXT: s_mov_b32 s33, s2
-; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX9-NEXT: s_add_u32 s8, s34, 44
-; GFX9-NEXT: s_addc_u32 s9, s35, 0
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX9-NEXT: s_mov_b32 s12, s33
-; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s36
-; GFX9-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX9-NEXT: v_mov_b32_e32 v3, s37
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB10_2
; GFX9-NEXT: .LBB10_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: v_mov_b32_e32 v40, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT: s_mov_b32 s42, -1
-; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: s_add_u32 s40, s40, s3
-; GFX1064-NEXT: s_addc_u32 s41, s41, 0
-; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB10_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1064-NEXT: s_mov_b32 s33, s2
-; GFX1064-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s1
-; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1064-NEXT: s_mov_b32 s12, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, s36
-; GFX1064-NEXT: v_mov_b32_e32 v3, s37
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_cbranch_execnz .LBB10_2
; GFX1064-NEXT: .LBB10_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: v_mov_b32_e32 v40, v0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT: s_mov_b32 s42, -1
-; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_add_u32 s40, s40, s3
-; GFX1032-NEXT: s_addc_u32 s41, s41, 0
-; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1032-NEXT: s_mov_b32 s38, 0
-; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1032-NEXT: s_mov_b32 s33, s2
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s1
-; GFX1032-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-NEXT: v_mov_b32_e32 v3, s5
; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1032-NEXT: s_mov_b32 s12, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s36
-; GFX1032-NEXT: v_mov_b32_e32 v3, s37
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_cbranch_execnz .LBB10_2
; GFX1032-NEXT: .LBB10_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: v_mov_b32_e32 v40, v0
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1164-NEXT: s_mov_b32 s32, 32
-; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB10_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1164-NEXT: s_mov_b32 s33, s2
-; GFX1164-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s1
-; GFX1164-NEXT: v_mov_b32_e32 v1, s0
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-NEXT: s_mov_b32 s12, s33
-; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s36
-; GFX1164-NEXT: v_mov_b32_e32 v3, s37
-; GFX1164-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_cbranch_execnz .LBB10_2
; GFX1164-NEXT: .LBB10_3:
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: v_mov_b32_e32 v40, v0
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1132-NEXT: s_mov_b32 s38, 0
-; GFX1132-NEXT: s_mov_b32 s32, 32
-; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB10_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-NEXT: s_mov_b32 s12, s33
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_cbranch_execnz .LBB10_2
; GFX1132-NEXT: .LBB10_3:
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT: s_mov_b32 s42, -1
-; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX9-DPP-NEXT: s_mov_b32 s33, s2
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT: s_mov_b32 s12, s33
-; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX9-DPP-NEXT: .LBB10_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT: s_mov_b32 s42, -1
-; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3
-; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1064-DPP-NEXT: s_mov_b32 s33, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1064-DPP-NEXT: s_clause 0x1
-; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1064-DPP-NEXT: .LBB10_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT: s_mov_b32 s42, -1
-; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3
-; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s38, 0
-; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1032-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1032-DPP-NEXT: s_clause 0x1
-; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1032-DPP-NEXT: .LBB10_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1164-DPP-NEXT: s_mov_b32 s33, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1164-DPP-NEXT: .LBB10_3:
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s38, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1132-DPP-NEXT: .LBB10_3:
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-DPP-NEXT: s_endpgm
- %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 monotonic, align 4
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 monotonic, align 8
ret void
}
define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 {
; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
; GFX7LESS: ; %bb.0:
-; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s50, -1
-; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s48, s48, s9
-; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT: s_mov_b32 s33, s8
-; GFX7LESS-NEXT: s_mov_b32 s40, s7
-; GFX7LESS-NEXT: s_mov_b32 s41, s6
-; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s46, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT: s_mov_b32 s12, s41
-; GFX7LESS-NEXT: s_mov_b32 s13, s40
-; GFX7LESS-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0
-; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0
-; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX7LESS-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT: s_mov_b32 s12, s41
-; GFX7LESS-NEXT: s_mov_b32 s13, s40
-; GFX7LESS-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43]
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s50, -1
-; GFX9-NEXT: s_mov_b32 s51, 0xe00000
-; GFX9-NEXT: s_add_u32 s48, s48, s9
-; GFX9-NEXT: s_addc_u32 s49, s49, 0
-; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX9-NEXT: s_mov_b32 s33, s8
-; GFX9-NEXT: s_add_u32 s8, s36, 44
-; GFX9-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX9-NEXT: s_mov_b32 s40, s7
-; GFX9-NEXT: s_mov_b32 s41, s6
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s9
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b32 s14, s8
+; GFX9-NEXT: s_add_u32 s8, s2, 44
+; GFX9-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_getpc_b64 s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT: s_mov_b32 s12, s41
-; GFX9-NEXT: s_mov_b32 s13, s40
-; GFX9-NEXT: s_mov_b32 s14, s33
-; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT: s_movk_i32 s32, 0x800
-; GFX9-NEXT: v_mov_b32_e32 v41, 0
+; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX9-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX9-NEXT: s_add_u32 s8, s36, 44
-; GFX9-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT: s_mov_b32 s12, s41
-; GFX9-NEXT: s_mov_b32 s13, s40
-; GFX9-NEXT: s_mov_b32 s14, s33
-; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT: v_mov_b32_e32 v2, s42
-; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v3, s43
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB11_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT: s_mov_b32 s50, -1
-; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT: s_add_u32 s48, s48, s9
-; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1064-NEXT: s_addc_u32 s49, s49, 0
-; GFX1064-NEXT: s_mov_b32 s33, s8
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_mov_b32 s40, s7
-; GFX1064-NEXT: s_mov_b32 s41, s6
-; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s38, -1
+; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-NEXT: s_mov_b32 s14, s8
+; GFX1064-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT: s_mov_b32 s12, s41
-; GFX1064-NEXT: s_mov_b32 s13, s40
-; GFX1064-NEXT: s_mov_b32 s14, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-NEXT: s_movk_i32 s32, 0x800
-; GFX1064-NEXT: v_mov_b32_e32 v41, 0
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s6
+; GFX1064-NEXT: s_mov_b32 s13, s7
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT: s_mov_b32 s32, 0
+; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1064-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-NEXT: v_mov_b32_e32 v2, s42
-; GFX1064-NEXT: v_mov_b32_e32 v3, s43
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT: s_mov_b32 s12, s41
-; GFX1064-NEXT: s_mov_b32 s13, s40
-; GFX1064-NEXT: s_mov_b32 s14, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB11_1
; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT: s_mov_b32 s50, -1
-; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1032-NEXT: s_addc_u32 s49, s49, 0
-; GFX1032-NEXT: s_mov_b32 s33, s8
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_mov_b32 s40, s7
-; GFX1032-NEXT: s_mov_b32 s41, s6
-; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s38, -1
+; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-NEXT: s_mov_b32 s14, s8
+; GFX1032-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT: s_mov_b32 s12, s41
-; GFX1032-NEXT: s_mov_b32 s13, s40
-; GFX1032-NEXT: s_mov_b32 s14, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: v_mov_b32_e32 v41, 0
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s6
+; GFX1032-NEXT: s_mov_b32 s13, s7
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT: s_mov_b32 s32, 0
+; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1032-NEXT: s_mov_b32 s44, 0
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1032-NEXT: s_mov_b32 s0, 0
; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-NEXT: v_mov_b32_e32 v2, s42
-; GFX1032-NEXT: v_mov_b32_e32 v3, s43
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT: s_mov_b32 s12, s41
-; GFX1032-NEXT: s_mov_b32 s13, s40
-; GFX1032-NEXT: s_mov_b32 s14, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX1032-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_cbranch_execnz .LBB11_1
; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1164-NEXT: s_mov_b32 s33, s8
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-NEXT: s_mov_b32 s14, s8
+; GFX1164-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
-; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
-; GFX1164-NEXT: s_mov_b32 s14, s33
-; GFX1164-NEXT: s_mov_b32 s32, 32
-; GFX1164-NEXT: v_mov_b32_e32 v40, v0
-; GFX1164-NEXT: s_mov_b32 s40, s7
-; GFX1164-NEXT: s_mov_b32 s41, s6
-; GFX1164-NEXT: v_mov_b32_e32 v41, 0
+; GFX1164-NEXT: s_mov_b32 s32, 0
+; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43]
-; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1164-NEXT: s_mov_b64 s[44:45], 0
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT: s_mov_b32 s12, s41
-; GFX1164-NEXT: s_mov_b32 s13, s40
-; GFX1164-NEXT: s_mov_b32 s14, s33
-; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off
-; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8
-; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s42
-; GFX1164-NEXT: v_mov_b32_e32 v3, s43
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off
-; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB11_1
; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1132-NEXT: v_mov_b32_e32 v31, v0
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1132-NEXT: s_mov_b32 s40, s14
-; GFX1132-NEXT: s_mov_b32 s41, s13
-; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
-; GFX1132-NEXT: s_mov_b32 s32, 32
-; GFX1132-NEXT: s_mov_b32 s33, s15
-; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0
+; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43]
-; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1132-NEXT: s_mov_b32 s44, 0
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1132-NEXT: s_mov_b32 s0, 0
; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: v_mov_b32_e32 v31, v40
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1132-NEXT: s_mov_b32 s12, s41
-; GFX1132-NEXT: s_mov_b32 s13, s40
-; GFX1132-NEXT: s_mov_b32 s14, s33
-; GFX1132-NEXT: v_mov_b32_e32 v4, 0
-; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off
-; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8
-; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
-; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off
-; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1132-NEXT: s_cbranch_execnz .LBB11_1
; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT: s_mov_b32 s50, -1
-; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000
-; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX9-DPP-NEXT: s_mov_b32 s33, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_mov_b32 s40, s7
-; GFX9-DPP-NEXT: s_mov_b32 s41, s6
-; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s38, -1
+; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b32 s14, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT: s_mov_b32 s12, s41
-; GFX9-DPP-NEXT: s_mov_b32 s13, s40
-; GFX9-DPP-NEXT: s_mov_b32 s14, s33
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b32 s12, s6
+; GFX9-DPP-NEXT: s_mov_b32 s13, s7
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b32 s32, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT: s_mov_b32 s12, s41
-; GFX9-DPP-NEXT: s_mov_b32 s13, s40
-; GFX9-DPP-NEXT: s_mov_b32 s14, s33
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1
; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1064-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT: s_clause 0x1
-; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1
; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1032-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
-; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT: s_clause 0x1
-; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
-; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1
; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1164-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
-; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1164-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1164-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43]
-; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1164-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off
-; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1
; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1132-DPP-NEXT: s_mov_b32 s40, s14
-; GFX1132-DPP-NEXT: s_mov_b32 s41, s13
-; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
-; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43]
-; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1132-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1132-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off
-; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1
; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
- %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue monotonic, align 4
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue monotonic, align 8
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index c5f7980d1e3a..5cb57703c01d 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -1156,8 +1156,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
ret void
}
-define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
-; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -1203,7 +1203,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX7LESS-NEXT: .LBB2_3:
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -1245,7 +1245,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX9-NEXT: .LBB2_3:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -1285,7 +1285,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1064-NEXT: .LBB2_3:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -1324,7 +1324,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1032-NEXT: .LBB2_3:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec
; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
@@ -1367,7 +1367,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1164-NEXT: .LBB2_3:
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -1407,7 +1407,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1132-NEXT: .LBB2_3:
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -1449,7 +1449,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: .LBB2_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -1489,7 +1489,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1064-DPP-NEXT: .LBB2_3:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -1528,7 +1528,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1032-DPP-NEXT: .LBB2_3:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
@@ -1571,7 +1571,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1164-DPP-NEXT: .LBB2_3:
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -1615,8 +1615,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
}
-define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
-; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
@@ -1666,7 +1666,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -1732,7 +1732,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-NEXT: .LBB3_5:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -1798,7 +1798,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1064-NEXT: .LBB3_5:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -1863,7 +1863,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1032-NEXT: .LBB3_5:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
@@ -1924,7 +1924,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1164-NEXT: .LBB3_5:
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1132-NEXT: v_mov_b32_e32 v31, v0
@@ -1984,7 +1984,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1132-NEXT: .LBB3_5:
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -2068,7 +2068,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: .LBB3_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -2150,7 +2150,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1064-DPP-NEXT: .LBB3_3:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -2226,7 +2226,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1032-DPP-NEXT: .LBB3_3:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
@@ -2308,7 +2308,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: .LBB3_3:
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
@@ -3617,8 +3617,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
}
-define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
-; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
@@ -3668,7 +3668,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3734,7 +3734,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: .LBB6_5:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3800,7 +3800,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB6_5:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3865,7 +3865,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: .LBB6_5:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
@@ -3926,7 +3926,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: .LBB6_5:
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1132-NEXT: v_mov_b32_e32 v31, v0
@@ -3986,7 +3986,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: .LBB6_5:
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -4070,7 +4070,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: .LBB6_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -4152,7 +4152,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: .LBB6_3:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -4228,7 +4228,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: .LBB6_3:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
@@ -4310,7 +4310,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: .LBB6_3:
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
@@ -5620,1589 +5620,875 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX7LESS: ; %bb.0:
-; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s3
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT: s_mov_b32 s33, s2
-; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0
-; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3
; GFX7LESS-NEXT: ; %bb.1:
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_load_dwordx2 s[2:3], s[36:37], 0x0
-; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX7LESS-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
-; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX7LESS-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42]
-; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0
-; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
-; GFX7LESS-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12
-; GFX7LESS-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8
-; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX7LESS-NEXT: s_waitcnt expcnt(2)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT: s_mov_b32 s12, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2
; GFX7LESS-NEXT: .LBB9_3:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s42, -1
-; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], exec
-; GFX9-NEXT: s_mov_b32 s43, 0xe00000
-; GFX9-NEXT: v_mov_b32_e32 v40, v0
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX9-NEXT: s_add_u32 s40, s40, s3
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX9-NEXT: s_addc_u32 s41, s41, 0
-; GFX9-NEXT: s_mov_b32 s33, s2
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_movk_i32 s32, 0x800
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB9_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX9-NEXT: s_mov_b64 s[38:39], 0
-; GFX9-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
-; GFX9-NEXT: s_add_u32 s8, s34, 44
-; GFX9-NEXT: s_addc_u32 s9, s35, 0
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX9-NEXT: s_mov_b32 s12, s33
-; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s36
-; GFX9-NEXT: v_mov_b32_e32 v3, s37
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB9_2
; GFX9-NEXT: .LBB9_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT: s_mov_b32 s42, -1
-; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000
-; GFX1064-NEXT: s_add_u32 s40, s40, s3
-; GFX1064-NEXT: s_mov_b32 s33, s2
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mov_b32_e32 v40, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: s_addc_u32 s41, s41, 0
-; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX1064-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX1064-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s1
-; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1064-NEXT: s_mov_b32 s12, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, s36
-; GFX1064-NEXT: v_mov_b32_e32 v3, s37
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_cbranch_execnz .LBB9_2
; GFX1064-NEXT: .LBB9_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s33, s2
-; GFX1032-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032-NEXT: v_mov_b32_e32 v40, v0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT: s_mov_b32 s42, -1
-; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_add_u32 s40, s40, s3
-; GFX1032-NEXT: s_addc_u32 s41, s41, 0
-; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1032-NEXT: s_mov_b32 s38, 0
-; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2
-; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX1032-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s1
-; GFX1032-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-NEXT: v_mov_b32_e32 v3, s5
; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1032-NEXT: s_mov_b32 s12, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s36
-; GFX1032-NEXT: v_mov_b32_e32 v3, s37
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_cbranch_execnz .LBB9_2
; GFX1032-NEXT: .LBB9_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b32 s33, s2
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-NEXT: s_mov_b64 s[4:5], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1164-NEXT: s_mov_b32 s32, 32
-; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX1164-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s1
-; GFX1164-NEXT: v_mov_b32_e32 v1, s0
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-NEXT: s_mov_b32 s12, s33
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s36
-; GFX1164-NEXT: v_mov_b32_e32 v3, s37
-; GFX1164-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_cbranch_execnz .LBB9_2
; GFX1164-NEXT: .LBB9_3:
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: v_mov_b32_e32 v40, v0
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1132-NEXT: s_mov_b32 s38, 0
-; GFX1132-NEXT: s_mov_b32 s32, 32
-; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s2
-; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-NEXT: s_mov_b32 s12, s33
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_cbranch_execnz .LBB9_2
; GFX1132-NEXT: .LBB9_3:
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT: s_mov_b32 s42, -1
-; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX9-DPP-NEXT: s_mov_b32 s33, s2
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0
-; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
-; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT: s_mov_b32 s12, s33
-; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX9-DPP-NEXT: .LBB9_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT: s_mov_b32 s42, -1
-; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000
-; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3
-; GFX1064-DPP-NEXT: s_mov_b32 s33, s2
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1064-DPP-NEXT: s_clause 0x1
-; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1064-DPP-NEXT: .LBB9_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s33, s2
-; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT: s_mov_b32 s42, -1
-; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3
-; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s38, 0
-; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1032-DPP-NEXT: s_clause 0x1
-; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1032-DPP-NEXT: .LBB9_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b32 s33, s2
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
-; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1164-DPP-NEXT: .LBB9_3:
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1132-DPP-NEXT: s_mov_b32 s38, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s2
-; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
-; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1132-DPP-NEXT: .LBB9_3:
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-DPP-NEXT: s_endpgm
- %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4
+ %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 8
ret void
}
define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe(ptr addrspace(1) %ptr) #0 {
; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX7LESS: ; %bb.0:
-; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s50, -1
-; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s48, s48, s9
-; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT: s_mov_b32 s33, s8
-; GFX7LESS-NEXT: s_mov_b32 s40, s7
-; GFX7LESS-NEXT: s_mov_b32 s41, s6
-; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s46, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT: s_mov_b32 s12, s41
-; GFX7LESS-NEXT: s_mov_b32 s13, s40
-; GFX7LESS-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
-; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0
-; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41]
-; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8
-; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-NEXT: s_waitcnt expcnt(2)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT: s_mov_b32 s12, s41
-; GFX7LESS-NEXT: s_mov_b32 s13, s40
-; GFX7LESS-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43]
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s50, -1
-; GFX9-NEXT: s_mov_b32 s51, 0xe00000
-; GFX9-NEXT: s_add_u32 s48, s48, s9
-; GFX9-NEXT: s_addc_u32 s49, s49, 0
-; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX9-NEXT: s_mov_b32 s33, s8
-; GFX9-NEXT: s_add_u32 s8, s36, 44
-; GFX9-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX9-NEXT: s_mov_b32 s40, s7
-; GFX9-NEXT: s_mov_b32 s41, s6
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s9
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b32 s14, s8
+; GFX9-NEXT: s_add_u32 s8, s2, 44
+; GFX9-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_getpc_b64 s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT: s_mov_b32 s12, s41
-; GFX9-NEXT: s_mov_b32 s13, s40
-; GFX9-NEXT: s_mov_b32 s14, s33
-; GFX9-NEXT: v_mov_b32_e32 v31, v42
-; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT: s_movk_i32 s32, 0x800
-; GFX9-NEXT: v_mov_b32_e32 v43, 0
+; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT: v_mov_b32_e32 v41, v1
-; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX9-NEXT: v_mov_b32_e32 v40, v0
-; GFX9-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
-; GFX9-NEXT: s_add_u32 s8, s36, 44
-; GFX9-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT: s_mov_b32 s12, s41
-; GFX9-NEXT: s_mov_b32 s13, s40
-; GFX9-NEXT: s_mov_b32 s14, s33
-; GFX9-NEXT: v_mov_b32_e32 v31, v42
-; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s42
-; GFX9-NEXT: v_mov_b32_e32 v3, s43
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB10_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT: s_mov_b32 s50, -1
-; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT: s_add_u32 s48, s48, s9
-; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1064-NEXT: s_addc_u32 s49, s49, 0
-; GFX1064-NEXT: s_mov_b32 s33, s8
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_mov_b32 s40, s7
-; GFX1064-NEXT: s_mov_b32 s41, s6
-; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s38, -1
+; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-NEXT: s_mov_b32 s14, s8
+; GFX1064-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX1064-NEXT: s_mov_b32 s12, s41
-; GFX1064-NEXT: s_mov_b32 s13, s40
-; GFX1064-NEXT: s_mov_b32 s14, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT: v_mov_b32_e32 v31, v42
-; GFX1064-NEXT: s_movk_i32 s32, 0x800
-; GFX1064-NEXT: v_mov_b32_e32 v43, 0
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s6
+; GFX1064-NEXT: s_mov_b32 s13, s7
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT: s_mov_b32 s32, 0
+; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT: v_mov_b32_e32 v41, v1
-; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX1064-NEXT: v_mov_b32_e32 v40, v0
-; GFX1064-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT: v_mov_b32_e32 v31, v42
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, s42
-; GFX1064-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT: s_mov_b32 s12, s41
-; GFX1064-NEXT: s_mov_b32 s13, s40
-; GFX1064-NEXT: s_mov_b32 s14, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v3, s43
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB10_1
; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT: s_mov_b32 s50, -1
-; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1032-NEXT: s_addc_u32 s49, s49, 0
-; GFX1032-NEXT: s_mov_b32 s33, s8
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_mov_b32 s40, s7
-; GFX1032-NEXT: s_mov_b32 s41, s6
-; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s38, -1
+; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-NEXT: s_mov_b32 s14, s8
+; GFX1032-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX1032-NEXT: s_mov_b32 s12, s41
-; GFX1032-NEXT: s_mov_b32 s13, s40
-; GFX1032-NEXT: s_mov_b32 s14, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT: v_mov_b32_e32 v31, v42
-; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: v_mov_b32_e32 v43, 0
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s6
+; GFX1032-NEXT: s_mov_b32 s13, s7
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT: s_mov_b32 s32, 0
+; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT: v_mov_b32_e32 v41, v1
-; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX1032-NEXT: v_mov_b32_e32 v40, v0
-; GFX1032-NEXT: s_mov_b32 s44, 0
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1032-NEXT: s_mov_b32 s0, 0
; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT: v_mov_b32_e32 v31, v42
-; GFX1032-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s42
-; GFX1032-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT: s_mov_b32 s12, s41
-; GFX1032-NEXT: s_mov_b32 s13, s40
-; GFX1032-NEXT: s_mov_b32 s14, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT: v_mov_b32_e32 v3, s43
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1032-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_cbranch_execnz .LBB10_1
; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1164-NEXT: s_mov_b32 s33, s8
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-NEXT: s_mov_b32 s14, s8
+; GFX1164-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
-; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
-; GFX1164-NEXT: s_mov_b32 s14, s33
-; GFX1164-NEXT: s_mov_b32 s32, 32
-; GFX1164-NEXT: v_mov_b32_e32 v42, v0
-; GFX1164-NEXT: s_mov_b32 s40, s7
-; GFX1164-NEXT: s_mov_b32 s41, s6
-; GFX1164-NEXT: v_mov_b32_e32 v43, 0
+; GFX1164-NEXT: s_mov_b32 s32, 0
+; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: v_mov_b32_e32 v41, v1
-; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43]
-; GFX1164-NEXT: v_mov_b32_e32 v40, v0
-; GFX1164-NEXT: s_mov_b64 s[44:45], 0
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-NEXT: v_mov_b32_e32 v31, v42
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT: s_mov_b32 s12, s41
-; GFX1164-NEXT: s_mov_b32 s13, s40
-; GFX1164-NEXT: s_mov_b32 s14, s33
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s42
-; GFX1164-NEXT: v_mov_b32_e32 v3, s43
-; GFX1164-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB10_1
; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1132-NEXT: v_mov_b32_e32 v31, v0
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1132-NEXT: s_mov_b32 s40, s14
-; GFX1132-NEXT: s_mov_b32 s41, s13
-; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
-; GFX1132-NEXT: s_mov_b32 s32, 32
-; GFX1132-NEXT: s_mov_b32 s33, s15
-; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0
+; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
-; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43]
-; GFX1132-NEXT: s_mov_b32 s44, 0
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1132-NEXT: s_mov_b32 s0, 0
; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1132-NEXT: s_mov_b32 s12, s41
-; GFX1132-NEXT: s_mov_b32 s13, s40
-; GFX1132-NEXT: s_mov_b32 s14, s33
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1132-NEXT: s_cbranch_execnz .LBB10_1
; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT: s_mov_b32 s50, -1
-; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000
-; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX9-DPP-NEXT: s_mov_b32 s33, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_mov_b32 s40, s7
-; GFX9-DPP-NEXT: s_mov_b32 s41, s6
-; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s38, -1
+; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b32 s14, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT: s_mov_b32 s12, s41
-; GFX9-DPP-NEXT: s_mov_b32 s13, s40
-; GFX9-DPP-NEXT: s_mov_b32 s14, s33
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b32 s12, s6
+; GFX9-DPP-NEXT: s_mov_b32 s13, s7
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b32 s32, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1
-; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
-; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT: s_mov_b32 s12, s41
-; GFX9-DPP-NEXT: s_mov_b32 s13, s40
-; GFX9-DPP-NEXT: s_mov_b32 s14, s33
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1
; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1064-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT: s_clause 0x1
-; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1
; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1032-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
; GFX1032-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT: s_clause 0x1
-; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1
; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1164-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
-; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0
-; GFX1164-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1164-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1
-; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1164-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1
; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1132-DPP-NEXT: s_mov_b32 s40, s14
-; GFX1132-DPP-NEXT: s_mov_b32 s41, s13
-; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
-; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
-; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43]
-; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1132-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1132-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1
; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.float.value()
- %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4
+ %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8
ret void
}
-define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
-; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
@@ -7251,7 +6537,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX7LESS-NEXT: .LBB11_3:
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -7294,7 +6580,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX9-NEXT: .LBB11_3:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -7335,7 +6621,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1064-NEXT: .LBB11_3:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -7375,7 +6661,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1032-NEXT: .LBB11_3:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec
; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
@@ -7419,7 +6705,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1164-NEXT: .LBB11_3:
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -7459,7 +6745,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1132-NEXT: .LBB11_3:
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -7502,7 +6788,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: .LBB11_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -7543,7 +6829,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1064-DPP-NEXT: .LBB11_3:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -7583,7 +6869,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1032-DPP-NEXT: .LBB11_3:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
@@ -7627,7 +6913,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1164-DPP-NEXT: .LBB11_3:
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -7669,8 +6955,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
%result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic
ret void
}
-define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
-; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
@@ -7723,7 +7009,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -7769,7 +7055,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -7815,7 +7101,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -7861,7 +7147,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_mov_b32 s14, s8
; GFX1164-NEXT: s_add_u32 s8, s2, 44
@@ -7898,7 +7184,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_add_u32 s8, s2, 44
; GFX1132-NEXT: s_addc_u32 s9, s3, 0
@@ -7933,7 +7219,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -7979,7 +7265,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -8025,7 +7311,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -8071,7 +7357,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
@@ -8108,7 +7394,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
@@ -9094,8 +8380,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
ret void
}
-define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
-; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
@@ -9148,7 +8434,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -9194,7 +8480,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -9240,7 +8526,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -9286,7 +8572,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_mov_b32 s14, s8
; GFX1164-NEXT: s_add_u32 s8, s2, 44
@@ -9323,7 +8609,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_add_u32 s8, s2, 44
; GFX1132-NEXT: s_addc_u32 s9, s3, 0
@@ -9358,7 +8644,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -9404,7 +8690,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -9450,7 +8736,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -9496,7 +8782,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
@@ -9533,7 +8819,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
@@ -9574,1621 +8860,947 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 {
; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
; GFX7LESS: ; %bb.0:
-; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s42, -1
-; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s40, s40, s3
-; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT: s_mov_b32 s33, s2
-; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0
-; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0
+; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s14, -1
+; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s12, s12, s3
+; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3
; GFX7LESS-NEXT: ; %bb.1:
-; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9
-; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX7LESS-NEXT: s_mov_b32 s1, 0x43300000
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_load_dwordx2 s[2:3], s[36:37], 0x0
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000
-; GFX7LESS-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
-; GFX7LESS-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
-; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3
+; GFX7LESS-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42]
-; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0
-; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
-; GFX7LESS-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12
-; GFX7LESS-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8
-; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX7LESS-NEXT: s_waitcnt expcnt(2)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT: s_mov_b32 s12, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2
; GFX7LESS-NEXT: .LBB16_3:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s42, -1
-; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], exec
-; GFX9-NEXT: s_mov_b32 s43, 0xe00000
-; GFX9-NEXT: v_mov_b32_e32 v40, v0
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX9-NEXT: s_add_u32 s40, s40, s3
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX9-NEXT: s_addc_u32 s41, s41, 0
-; GFX9-NEXT: s_mov_b32 s33, s2
+; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-NEXT: s_add_u32 s8, s8, s3
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_addc_u32 s9, s9, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_movk_i32 s32, 0x800
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB16_3
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
-; GFX9-NEXT: s_mov_b32 s1, 0x43300000
-; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX9-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
-; GFX9-NEXT: s_add_u32 s8, s34, 44
-; GFX9-NEXT: s_addc_u32 s9, s35, 0
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX9-NEXT: s_mov_b32 s12, s33
-; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s36
-; GFX9-NEXT: v_mov_b32_e32 v3, s37
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB16_2
; GFX9-NEXT: .LBB16_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT: s_mov_b32 s42, -1
-; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000
-; GFX1064-NEXT: s_add_u32 s40, s40, s3
-; GFX1064-NEXT: s_mov_b32 s33, s2
+; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s10, -1
+; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s8, s8, s3
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-NEXT: s_addc_u32 s9, s9, 0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: s_addc_u32 s41, s41, 0
-; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB16_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s1, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
-; GFX1064-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX1064-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s1
-; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1064-NEXT: s_mov_b32 s12, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, s36
-; GFX1064-NEXT: v_mov_b32_e32 v3, s37
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_cbranch_execnz .LBB16_2
; GFX1064-NEXT: .LBB16_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s33, s2
-; GFX1032-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032-NEXT: v_mov_b32_e32 v40, v0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT: s_mov_b32 s42, -1
-; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s10, -1
+; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_add_u32 s40, s40, s3
-; GFX1032-NEXT: s_addc_u32 s41, s41, 0
-; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1032-NEXT: s_mov_b32 s38, 0
-; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB16_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2
-; GFX1032-NEXT: s_mov_b32 s1, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s1
-; GFX1032-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-NEXT: v_mov_b32_e32 v3, s5
; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1032-NEXT: s_mov_b32 s12, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s36
-; GFX1032-NEXT: v_mov_b32_e32 v3, s37
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_cbranch_execnz .LBB16_2
; GFX1032-NEXT: .LBB16_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec
-; GFX1164-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec
; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
-; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:20
-; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:16
-; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:16
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
-; GFX1164-NEXT: s_mov_b32 s32, 32
-; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-NEXT: s_cbranch_execz .LBB16_3
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1164-NEXT: s_mov_b32 s33, s2
-; GFX1164-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s1
-; GFX1164-NEXT: v_mov_b32_e32 v1, s0
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-NEXT: s_mov_b32 s12, s33
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s36
-; GFX1164-NEXT: v_mov_b32_e32 v3, s37
-; GFX1164-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_cbranch_execnz .LBB16_2
; GFX1164-NEXT: .LBB16_3:
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo
+; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0
-; GFX1132-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20
-; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16
-; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:16
-; GFX1132-NEXT: s_mov_b32 s38, 0
-; GFX1132-NEXT: s_mov_b32 s32, 32
-; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB16_3
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX1132-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-NEXT: s_mov_b32 s12, s33
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_cbranch_execnz .LBB16_2
; GFX1132-NEXT: .LBB16_3:
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT: s_mov_b32 s42, -1
-; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
-; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX9-DPP-NEXT: s_mov_b32 s33, s2
+; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s10, -1
+; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX9-DPP-NEXT: ; %bb.1:
; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
-; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000
-; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX9-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
-; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT: s_mov_b32 s12, s33
-; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX9-DPP-NEXT: .LBB16_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT: s_mov_b32 s42, -1
-; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000
-; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3
-; GFX1064-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0
; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1064-DPP-NEXT: s_clause 0x1
-; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1064-DPP-NEXT: .LBB16_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s33, s2
-; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT: s_mov_b32 s42, -1
-; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3
-; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s38, 0
-; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2
-; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
-; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX1032-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
-; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
-; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX1032-DPP-NEXT: s_clause 0x1
-; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
-; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1032-DPP-NEXT: .LBB16_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:20
-; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:16
-; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
-; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1164-DPP-NEXT: s_mov_b32 s33, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1164-DPP-NEXT: .LBB16_3:
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20
-; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16
-; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16
-; GFX1132-DPP-NEXT: s_mov_b32 s38, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
-; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX1132-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s12, s33
-; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1132-DPP-NEXT: .LBB16_3:
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-DPP-NEXT: s_endpgm
- %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 monotonic, align 4
+ %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 monotonic, align 8
ret void
}
define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 {
; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
; GFX7LESS: ; %bb.0:
-; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT: s_mov_b32 s50, -1
-; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT: s_add_u32 s48, s48, s9
-; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT: s_mov_b32 s33, s8
-; GFX7LESS-NEXT: s_mov_b32 s40, s7
-; GFX7LESS-NEXT: s_mov_b32 s41, s6
-; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s46, -1
-; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT: s_mov_b32 s12, s41
-; GFX7LESS-NEXT: s_mov_b32 s13, s40
-; GFX7LESS-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
-; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0
-; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41]
-; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8
-; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-NEXT: s_waitcnt expcnt(2)
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT: s_mov_b32 s12, s41
-; GFX7LESS-NEXT: s_mov_b32 s13, s40
-; GFX7LESS-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43]
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s50, -1
-; GFX9-NEXT: s_mov_b32 s51, 0xe00000
-; GFX9-NEXT: s_add_u32 s48, s48, s9
-; GFX9-NEXT: s_addc_u32 s49, s49, 0
-; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX9-NEXT: s_mov_b32 s33, s8
-; GFX9-NEXT: s_add_u32 s8, s36, 44
-; GFX9-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX9-NEXT: s_mov_b32 s40, s7
-; GFX9-NEXT: s_mov_b32 s41, s6
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s9
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b32 s14, s8
+; GFX9-NEXT: s_add_u32 s8, s2, 44
+; GFX9-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_getpc_b64 s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT: s_mov_b32 s12, s41
-; GFX9-NEXT: s_mov_b32 s13, s40
-; GFX9-NEXT: s_mov_b32 s14, s33
-; GFX9-NEXT: v_mov_b32_e32 v31, v42
-; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT: s_movk_i32 s32, 0x800
-; GFX9-NEXT: v_mov_b32_e32 v43, 0
+; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT: v_mov_b32_e32 v41, v1
-; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX9-NEXT: v_mov_b32_e32 v40, v0
-; GFX9-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
-; GFX9-NEXT: s_add_u32 s8, s36, 44
-; GFX9-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-NEXT: s_getpc_b64 s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT: s_mov_b32 s12, s41
-; GFX9-NEXT: s_mov_b32 s13, s40
-; GFX9-NEXT: s_mov_b32 s14, s33
-; GFX9-NEXT: v_mov_b32_e32 v31, v42
-; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s42
-; GFX9-NEXT: v_mov_b32_e32 v3, s43
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB17_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT: s_mov_b32 s50, -1
-; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT: s_add_u32 s48, s48, s9
-; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1064-NEXT: s_addc_u32 s49, s49, 0
-; GFX1064-NEXT: s_mov_b32 s33, s8
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_mov_b32 s40, s7
-; GFX1064-NEXT: s_mov_b32 s41, s6
-; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s38, -1
+; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-NEXT: s_mov_b32 s14, s8
+; GFX1064-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX1064-NEXT: s_mov_b32 s12, s41
-; GFX1064-NEXT: s_mov_b32 s13, s40
-; GFX1064-NEXT: s_mov_b32 s14, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT: v_mov_b32_e32 v31, v42
-; GFX1064-NEXT: s_movk_i32 s32, 0x800
-; GFX1064-NEXT: v_mov_b32_e32 v43, 0
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s6
+; GFX1064-NEXT: s_mov_b32 s13, s7
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT: s_mov_b32 s32, 0
+; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT: v_mov_b32_e32 v41, v1
-; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX1064-NEXT: v_mov_b32_e32 v40, v0
-; GFX1064-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
-; GFX1064-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT: v_mov_b32_e32 v31, v42
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: v_mov_b32_e32 v2, s42
-; GFX1064-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT: s_mov_b32 s12, s41
-; GFX1064-NEXT: s_mov_b32 s13, s40
-; GFX1064-NEXT: s_mov_b32 s14, s33
-; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v3, s43
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB17_1
; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT: s_mov_b32 s50, -1
-; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1032-NEXT: s_addc_u32 s49, s49, 0
-; GFX1032-NEXT: s_mov_b32 s33, s8
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_mov_b32 s40, s7
-; GFX1032-NEXT: s_mov_b32 s41, s6
-; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s38, -1
+; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-NEXT: s_mov_b32 s14, s8
+; GFX1032-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX1032-NEXT: s_mov_b32 s12, s41
-; GFX1032-NEXT: s_mov_b32 s13, s40
-; GFX1032-NEXT: s_mov_b32 s14, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT: v_mov_b32_e32 v31, v42
-; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: v_mov_b32_e32 v43, 0
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s6
+; GFX1032-NEXT: s_mov_b32 s13, s7
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT: s_mov_b32 s32, 0
+; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT: v_mov_b32_e32 v41, v1
-; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX1032-NEXT: v_mov_b32_e32 v40, v0
-; GFX1032-NEXT: s_mov_b32 s44, 0
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1032-NEXT: s_mov_b32 s0, 0
; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
-; GFX1032-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT: v_mov_b32_e32 v31, v42
-; GFX1032-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s42
-; GFX1032-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT: s_mov_b32 s12, s41
-; GFX1032-NEXT: s_mov_b32 s13, s40
-; GFX1032-NEXT: s_mov_b32 s14, s33
-; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT: v_mov_b32_e32 v3, s43
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1032-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_cbranch_execnz .LBB17_1
; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1164-NEXT: s_mov_b32 s33, s8
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-NEXT: s_mov_b32 s14, s8
+; GFX1164-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
-; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b32 s12, s6
; GFX1164-NEXT: s_mov_b32 s13, s7
-; GFX1164-NEXT: s_mov_b32 s14, s33
-; GFX1164-NEXT: s_mov_b32 s32, 32
-; GFX1164-NEXT: v_mov_b32_e32 v42, v0
-; GFX1164-NEXT: s_mov_b32 s40, s7
-; GFX1164-NEXT: s_mov_b32 s41, s6
-; GFX1164-NEXT: v_mov_b32_e32 v43, 0
+; GFX1164-NEXT: s_mov_b32 s32, 0
+; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: v_mov_b32_e32 v41, v1
-; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43]
-; GFX1164-NEXT: v_mov_b32_e32 v40, v0
-; GFX1164-NEXT: s_mov_b64 s[44:45], 0
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
-; GFX1164-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-NEXT: v_mov_b32_e32 v31, v42
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT: s_mov_b32 s12, s41
-; GFX1164-NEXT: s_mov_b32 s13, s40
-; GFX1164-NEXT: s_mov_b32 s14, s33
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s42
-; GFX1164-NEXT: v_mov_b32_e32 v3, s43
-; GFX1164-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB17_1
; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1132-NEXT: v_mov_b32_e32 v31, v0
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1132-NEXT: s_mov_b32 s40, s14
-; GFX1132-NEXT: s_mov_b32 s41, s13
-; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b32 s13, s14
; GFX1132-NEXT: s_mov_b32 s14, s15
-; GFX1132-NEXT: s_mov_b32 s32, 32
-; GFX1132-NEXT: s_mov_b32 s33, s15
-; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0
+; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
-; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43]
-; GFX1132-NEXT: s_mov_b32 s44, 0
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1132-NEXT: s_mov_b32 s0, 0
; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
-; GFX1132-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1132-NEXT: s_mov_b32 s12, s41
-; GFX1132-NEXT: s_mov_b32 s13, s40
-; GFX1132-NEXT: s_mov_b32 s14, s33
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1132-NEXT: s_cbranch_execnz .LBB17_1
; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT: s_mov_b32 s50, -1
-; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000
-; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
-; GFX9-DPP-NEXT: s_mov_b32 s33, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_mov_b32 s40, s7
-; GFX9-DPP-NEXT: s_mov_b32 s41, s6
-; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s38, -1
+; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b32 s14, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT: s_mov_b32 s12, s41
-; GFX9-DPP-NEXT: s_mov_b32 s13, s40
-; GFX9-DPP-NEXT: s_mov_b32 s14, s33
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b32 s12, s6
+; GFX9-DPP-NEXT: s_mov_b32 s13, s7
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b32 s32, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1
-; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
-; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT: s_mov_b32 s12, s41
-; GFX9-DPP-NEXT: s_mov_b32 s13, s40
-; GFX9-DPP-NEXT: s_mov_b32 s14, s33
-; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1
; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1064-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
-; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT: s_clause 0x1
-; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1
; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1032-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
-; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT: s_clause 0x1
-; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1
; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1164-DPP-NEXT: s_mov_b32 s33, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
-; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0
-; GFX1164-DPP-NEXT: s_mov_b32 s40, s7
-; GFX1164-DPP-NEXT: s_mov_b32 s41, s6
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1
-; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
-; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1164-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT: s_clause 0x1
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1
; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
-; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
-; GFX1132-DPP-NEXT: s_mov_b32 s40, s14
-; GFX1132-DPP-NEXT: s_mov_b32 s41, s13
-; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
-; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
-; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
-; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43]
-; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
-; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
-; GFX1132-DPP-NEXT: s_mov_b32 s12, s41
-; GFX1132-DPP-NEXT: s_mov_b32 s13, s40
-; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1132-DPP-NEXT: s_clause 0x1
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1
; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.float.value() strictfp
- %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue monotonic, align 4
+ %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue monotonic, align 8
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll
index a883db1fa61f..95fc47469b51 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll
@@ -26,14 +26,14 @@ define void @bar() addrspace(1) {
}
;.
-; CHECK: @[[LLVM_GLOBAL_CTORS:[a-zA-Z0-9_$"\\.-]+]] = appending addrspace(1) global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @foo.alias, ptr null }, { i32, ptr, ptr } { i32 1, ptr inttoptr (i64 4096 to ptr), ptr null }]
-; CHECK: @[[LLVM_GLOBAL_DTORS:[a-zA-Z0-9_$"\\.-]+]] = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr addrspacecast (ptr addrspace(1) @bar to ptr), ptr null }]
-; CHECK: @[[__INIT_ARRAY_START:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
-; CHECK: @[[__INIT_ARRAY_END:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
-; CHECK: @[[__FINI_ARRAY_START:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
-; CHECK: @[[__FINI_ARRAY_END:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
-; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending addrspace(1) global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini], section "llvm.metadata"
-; CHECK: @[[FOO_ALIAS:[a-zA-Z0-9_$"\\.-]+]] = hidden alias void (), ptr @foo
+; CHECK: @llvm.global_ctors = appending addrspace(1) global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @foo.alias, ptr null }, { i32, ptr, ptr } { i32 1, ptr inttoptr (i64 4096 to ptr), ptr null }]
+; CHECK: @llvm.global_dtors = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr addrspacecast (ptr addrspace(1) @bar to ptr), ptr null }]
+; CHECK: @__init_array_start = external addrspace(1) constant [0 x ptr addrspace(1)]
+; CHECK: @__init_array_end = external addrspace(1) constant [0 x ptr addrspace(1)]
+; CHECK: @__fini_array_start = external addrspace(1) constant [0 x ptr addrspace(1)]
+; CHECK: @__fini_array_end = external addrspace(1) constant [0 x ptr addrspace(1)]
+; CHECK: @llvm.used = appending addrspace(1) global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini], section "llvm.metadata"
+; CHECK: @foo.alias = hidden alias void (), ptr @foo
;.
; CHECK-LABEL: define void @foo(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
diff --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll
index 58e1589d0483..c4f0821caacd 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll
@@ -44,13 +44,13 @@ define internal void @bar() {
}
;.
-; CHECK: @[[LLVM_GLOBAL_CTORS:[a-zA-Z0-9_$"\\.-]+]] = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @foo, ptr null }]
-; CHECK: @[[LLVM_GLOBAL_DTORS:[a-zA-Z0-9_$"\\.-]+]] = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @bar, ptr null }]
-; CHECK: @[[__INIT_ARRAY_START:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
-; CHECK: @[[__INIT_ARRAY_END:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
-; CHECK: @[[__FINI_ARRAY_START:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
-; CHECK: @[[__FINI_ARRAY_END:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
-; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending addrspace(1) global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini], section "llvm.metadata"
+; CHECK: @llvm.global_ctors = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @foo, ptr null }]
+; CHECK: @llvm.global_dtors = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @bar, ptr null }]
+; CHECK: @__init_array_start = external addrspace(1) constant [0 x ptr addrspace(1)]
+; CHECK: @__init_array_end = external addrspace(1) constant [0 x ptr addrspace(1)]
+; CHECK: @__fini_array_start = external addrspace(1) constant [0 x ptr addrspace(1)]
+; CHECK: @__fini_array_end = external addrspace(1) constant [0 x ptr addrspace(1)]
+; CHECK: @llvm.used = appending addrspace(1) global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini], section "llvm.metadata"
;.
; CHECK-LABEL: define internal void @foo() {
; CHECK-NEXT: ret void
diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
new file mode 100644
index 000000000000..c7a831185b83
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 %s -o - | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 %s -o - | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 %s -o - | FileCheck -check-prefix=GFX11 %s
+
+define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
+; GFX9-LABEL: test:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s7, s[4:5], 0x1c
+; GFX9-NEXT: s_load_dword s8, s[4:5], 0x38
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_and_b32 s4, s7, 0xffff
+; GFX9-NEXT: s_mul_i32 s6, s6, s4
+; GFX9-NEXT: s_add_i32 s8, s8, s6
+; GFX9-NEXT: v_add_u32_e32 v0, s8, v0
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_not_b32_e32 v3, v3
+; GFX9-NEXT: v_not_b32_e32 v2, v2
+; GFX9-NEXT: v_not_b32_e32 v1, v1
+; GFX9-NEXT: v_not_b32_e32 v0, v0
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: test:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: s_load_dword s7, s[4:5], 0x1c
+; GFX10-NEXT: s_load_dword s8, s[4:5], 0x38
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_and_b32 s4, s7, 0xffff
+; GFX10-NEXT: s_mul_i32 s6, s6, s4
+; GFX10-NEXT: v_add3_u32 v0, s8, s6, v0
+; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX10-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v5, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, s2, v4
+; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v5, vcc_lo
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_not_b32_e32 v3, v3
+; GFX10-NEXT: v_not_b32_e32 v2, v2
+; GFX10-NEXT: v_not_b32_e32 v1, v1
+; GFX10-NEXT: v_not_b32_e32 v0, v0
+; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: test:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x1c
+; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x38
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_mul_i32 s15, s15, s4
+; GFX11-NEXT: v_add3_u32 v0, s5, s15, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX11-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v5, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, s2, v4
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v5, vcc_lo
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_not_b32_e32 v3, v3
+; GFX11-NEXT: v_not_b32_e32 v2, v2
+; GFX11-NEXT: v_not_b32_e32 v1, v1
+; GFX11-NEXT: v_not_b32_e32 v0, v0
+; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %arg.1.ptr = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 40
+ %arg.1 = load i64, ptr addrspace(4) %arg.1.ptr, align 8
+ %workgroup.id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
+ %arg.2.ptr = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 12
+ %arg.2 = load i16, ptr addrspace(4) %arg.2.ptr, align 4
+ %arg.2.ext = zext i16 %arg.2 to i32
+ %mul = mul i32 %workgroup.id.x, %arg.2.ext
+ %workitem.id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %add = add i32 %mul, %workitem.id.x
+ %add.ext = zext i32 %add to i64
+ %add.1 = add i64 %arg.1, %add.ext
+ %sext = shl i64 %add.1, 32
+ %idxprom = ashr exact i64 %sext, 32
+ %arrayidx = getelementptr inbounds <16 x i8>, ptr addrspace(1) %src, i64 %idxprom
+ %arrayval = load <16 x i8>, ptr addrspace(1) %arrayidx, align 16
+ %not = xor <16 x i8> %arrayval, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %arrayidx2 = getelementptr inbounds <16 x i8>, ptr addrspace(1) %dst, i64 %idxprom
+ store <16 x i8> %not, ptr addrspace(1) %arrayidx2, align 16
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index 8ac332197215..7ca9ae359a49 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -3816,13 +3816,15 @@ define hidden void @extract_v13i64(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-LABEL: extract_v13i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
-; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:48
+; GFX10-NEXT: global_load_dwordx4 v[11:14], v[0:1], off
+; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:64
+; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_perm_b32 v0, v9, v8, 0x3020504
+; GFX10-NEXT: v_perm_b32 v0, v12, v13, 0x1000504
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v1, v11, v12, 0x1000706
+; GFX10-NEXT: v_perm_b32 v1, v10, v14, 0x1000504
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: global_store_dword v[6:7], v1, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -3830,14 +3832,15 @@ define hidden void @extract_v13i64(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX9-LABEL: extract_v13i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
-; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16
-; GFX9-NEXT: s_mov_b32 s4, 0x3020504
-; GFX9-NEXT: s_mov_b32 s5, 0x1000706
+; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:48
+; GFX9-NEXT: global_load_dwordx4 v[11:14], v[0:1], off
+; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:64
+; GFX9-NEXT: s_mov_b32 s4, 0x1000504
+; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_perm_b32 v0, v9, v8, s4
+; GFX9-NEXT: v_perm_b32 v0, v12, v13, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v1, v11, v12, s5
+; GFX9-NEXT: v_perm_b32 v1, v10, v14, s4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: global_store_dword v[6:7], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
index 54ca33401ccf..5a241f85b2e2 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -475,8 +475,7 @@ entry:
declare float @_Z5rootnfi(float, i32)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_2
-; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 2)
-; GCN-PRELINK: %__rootn2sqrt = tail call fast float @llvm.sqrt.f32(float %tmp)
+; GCN: call fast float @llvm.sqrt.f32(float %tmp)
define amdgpu_kernel void @test_rootn_2(ptr addrspace(1) nocapture %a) {
entry:
%tmp = load float, ptr addrspace(1) %a, align 4
@@ -507,8 +506,8 @@ entry:
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m2
-; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 -2)
-; GCN-PRELINK: %__rootn2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
+; GCN: [[SQRT:%.+]] = tail call fast float @llvm.sqrt.f32(float %tmp)
+; GCN-NEXT: fdiv fast float 1.000000e+00, [[SQRT]]
define amdgpu_kernel void @test_rootn_m2(ptr addrspace(1) nocapture %a) {
entry:
%tmp = load float, ptr addrspace(1) %a, align 4