
@HansKristian-Work
Created June 16, 2023 14:38
SM 6.0 strict wave op tests
struct VSOut
{
    float4 pos : SV_Position;
    float2 uv : TEXCOORD;
};

VSOut vs_main(float4 pos : POSITION)
{
    VSOut vout;
    vout.pos = pos;
    vout.uv = pos.xy * 0.5 + 0.5;
    return vout;
}
Texture2D<float> T : register(t0);
SamplerState S : register(s1);
RWStructuredBuffer<uint> U : register(u2);

// WaveReadLaneFirst
float _WaveReadLaneFirst(VSOut vin)
{
    float s = T.Sample(S, vin.uv);
    s += WaveReadLaneFirst(s);
    return s;
}
float sm60_WaveReadLaneFirst(VSOut vin) : SV_Target
{
    return _WaveReadLaneFirst(vin);
}
[WaveOpsIncludeHelperLanes]
float sm67_WaveReadLaneFirst(VSOut vin) : SV_Target
{
    return _WaveReadLaneFirst(vin);
}
// WaveActiveAllEqual
float _WaveActiveAllEqual(VSOut vin)
{
    float s = T.Sample(S, vin.uv);
    s += float(WaveActiveAllEqual(s));
    return s;
}
float sm60_WaveActiveAllEqual(VSOut vin) : SV_Target
{
    return _WaveActiveAllEqual(vin);
}
[WaveOpsIncludeHelperLanes]
float sm67_WaveActiveAllEqual(VSOut vin) : SV_Target
{
    return _WaveActiveAllEqual(vin);
}
// WaveActiveCountBits
float _WaveActiveCountBits(VSOut vin)
{
    float s = T.Sample(S, vin.uv);
    s += float(WaveActiveCountBits(s));
    return s;
}
float sm60_WaveActiveCountBits(VSOut vin) : SV_Target
{
    return _WaveActiveCountBits(vin);
}
[WaveOpsIncludeHelperLanes]
float sm67_WaveActiveCountBits(VSOut vin) : SV_Target
{
    return _WaveActiveCountBits(vin);
}
// WaveActiveMin
float _WaveActiveMin(VSOut vin)
{
    float s = T.Sample(S, vin.uv);
    s += WaveActiveMin(s);
    return s;
}
float sm60_WaveActiveMin(VSOut vin) : SV_Target
{
    return _WaveActiveMin(vin);
}
[WaveOpsIncludeHelperLanes]
float sm67_WaveActiveMin(VSOut vin) : SV_Target
{
    return _WaveActiveMin(vin);
}
// WavePrefixProduct
float _WavePrefixProduct(VSOut vin)
{
    float s = T.Sample(S, vin.uv);
    s += WavePrefixProduct(s);
    return s;
}
float sm60_WavePrefixProduct(VSOut vin) : SV_Target
{
    return _WavePrefixProduct(vin);
}
[WaveOpsIncludeHelperLanes]
float sm67_WavePrefixProduct(VSOut vin) : SV_Target
{
    return _WavePrefixProduct(vin);
}
// WaveIsFirstLane
float _WaveIsFirstLane(VSOut vin)
{
    float s = T.Sample(S, vin.uv);
    if (WaveIsFirstLane())
        U[0] = 1000;
    return s;
}
float sm60_WaveIsFirstLane(VSOut vin) : SV_Target
{
    return _WaveIsFirstLane(vin);
}
[WaveOpsIncludeHelperLanes]
float sm67_WaveIsFirstLane(VSOut vin) : SV_Target
{
    return _WaveIsFirstLane(vin);
}
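
A further case such as WaveActiveSum could be added with the same structure. The sketch below is a hypothetical extension shown only to illustrate the pattern; it is not part of the original test file.

// WaveActiveSum (hypothetical extra case, follows the same pattern as the tests above)
float _WaveActiveSum(VSOut vin)
{
    float s = T.Sample(S, vin.uv);
    s += WaveActiveSum(s);
    return s;
}
float sm60_WaveActiveSum(VSOut vin) : SV_Target
{
    return _WaveActiveSum(vin);
}
[WaveOpsIncludeHelperLanes]
float sm67_WaveActiveSum(VSOut vin) : SV_Target
{
    return _WaveActiveSum(vin);
}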
#!/bin/bash
set -e

run_test() {
    echo "Building $1"
    DXC=./external/dxc-build/bin/dxc
    DXIL_SPIRV=./cmake-build-release/dxil-spirv
    $DXC -Tvs_6_0 -Fo vert.dxil test.hlsl -E vs_main
    $DXC -Tps_6_0 -Fo frag60.dxil test.hlsl -E sm60_$1
    $DXC -Tps_6_7 -Fo frag67.dxil test.hlsl -E sm67_$1
    $DXIL_SPIRV --output vert.spv vert.dxil
    $DXIL_SPIRV --output frag60.spv frag60.dxil
    $DXIL_SPIRV --output frag67.spv frag67.dxil
    fossilize-synth --vert vert.spv --frag frag60.spv --output sm60.foz
    fossilize-synth --vert vert.spv --frag frag67.spv --output sm67.foz
    mkdir -p /tmp/sm60-output
    mkdir -p /tmp/sm67-output
    # -f so an empty output directory on the first run does not abort under set -e
    rm -f /tmp/sm60-output/*
    rm -f /tmp/sm67-output/*
    fossilize-disasm --target isa --output /tmp/sm60-output sm60.foz
    fossilize-disasm --target isa --output /tmp/sm67-output sm67.foz
    echo "== With helper lanes =="
    cat /tmp/sm67-output/*.frag
    echo "== Without helper lanes =="
    cat /tmp/sm60-output/*.frag
}
run_test WaveReadLaneFirst > WaveReadLaneFirst.txt
run_test WaveActiveAllEqual > WaveActiveAllEqual.txt
run_test WaveActiveCountBits > WaveActiveCountBits.txt
run_test WaveActiveMin > WaveActiveMin.txt
run_test WavePrefixProduct > WavePrefixProduct.txt
run_test WaveIsFirstLane > WaveIsFirstLane.txt
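
To compare the statistics that fossilize-disasm prints at the end of each listing, the generated reports can simply be grepped. This is an optional helper, not part of the original script, and assumes the *.txt files produced by the run_test calls above are in the current directory.

# Optional helper: print the instruction-count line from every report
# for a quick side-by-side view of the SM 6.0 vs SM 6.7 variants.
grep -H "Instructions (Instruction count)" *.txt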
Building WaveActiveAllEqual
== With helper lanes ==
Representation: NIR Shader(s) (The optimized NIR shader(s))
shader: MESA_SHADER_FRAGMENT
source_sha1: {0xac104c1f, 0x83f461d4, 0x782a7d10, 0x760c676a, 0x9f393254}
stage: 4
next_stage: 0
num_textures: 2
inputs_read: 32
outputs_written: 4
subgroup_size: 0
uses_wide_subgroup_intrinsics: true
divergence_analysis_run: true
bit_sizes_float: 0x20
bit_sizes_int: 0x21
separate_shader: true
needs_quad_helper_invocations: true
needs_all_helper_invocations: true
origin_upper_left: true
inputs: 1
outputs: 0
uniforms: 0
decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0)
decl_var uniform INTERP_MODE_NONE restrict sampler @1 (~0, 0, 1)
decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD@2 (VARYING_SLOT_VAR0.y, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0)
vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0)
vec1 32 con ssa_2 = load_const (0xffff8000 = -nan)
vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2
vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000)
vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0)
vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000)
vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0)
vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec2 32 div ssa_10 = vec2 ssa_9, ssa_8
vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler)
vec1 32 div ssa_12 = mov ssa_11.x
vec1 32 con ssa_13 = intrinsic read_first_invocation (ssa_12) ()
vec1 1 div ssa_14 = feq ssa_13, ssa_11.x
vec1 1 con ssa_15 = intrinsic vote_all (ssa_14) ()
vec1 32 con ssa_16 = b2f32 ssa_15
vec1 32 div ssa_17 = fadd ssa_16, ssa_11.x
vec1 32 con ssa_18 = undefined
vec1 32 div ssa_19 = pack_half_2x16_split ssa_17, ssa_18
vec1 32 con ssa_20 = undefined
vec4 32 div ssa_21 = vec4 ssa_19, ssa_20, ssa_20, ssa_20
intrinsic export_amd (ssa_21) (base=0, wrmask=xy, flags=7)
/* succs: block_1 */
block block_1:
}
Representation: ACO IR (The ACO IR after some optimizations)
After Spilling:
ACO shader stage: fragment_fs
BB0
/* logical preds: / linear preds: / kind: uniform, top-level, export_end, */
s2: %23:s[0-1], s1: %24:s[2], s1: %25:s[3], s1: %26:s[4], v2: %27:v[0-1] = p_startpgm
s2: (kill)%29, s1: (kill)%28:scc = p_init_scratch (kill)%23, (latekill)(kill)%26
s2: %0:exec, s1: (kill)%61:scc = s_wqm_b64 %0:exec
p_logical_start
v1: %30, v1: %31 = p_split_vector (kill)%27
s2: %4 = p_create_vector (kill)%24, 0xffff8000
s8: %6 = s_load_dwordx8 %4, 0
s4: %8 = s_load_dwordx4 (kill)%4, 64
v1: %46 = v_interp_p1_f32 %30, %25:m0 attr0.y
v1: %9 = v_interp_p2_f32 %31, %25:m0, (kill)%46 attr0.y
v1: %47 = v_interp_p1_f32 (kill)%30, %25:m0 attr0.x
v1: %10 = v_interp_p2_f32 (kill)%31, (kill)%25:m0, (kill)%47 attr0.x
v1: %49 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d
v1: %48 = p_wqm (kill)%49
s1: %50 = v_readfirstlane_b32 %48
s1: %14 = p_wqm (kill)%50
s2: %52 = v_cmp_neq_f32 (kill)%14, %48
s2: (kill)%54, s1: %53:scc = s_and_b64 (kill)%52, %0:exec
s1: %55:scc = p_wqm (kill)%53
s2: %56 = s_cselect_b64 -1, 0, (kill)%55:scc
s2: (kill)%16, s1: %57:scc = s_not_b64 (kill)%56
s1: %17 = s_mul_i32 1.0, (kill)%57
v1: %18 = v_add_f32 (kill)%17, (kill)%48
v1: %20 = v_cvt_pkrtz_f16_f32 (kill)%18, 0
exp (kill)%20, v1: undef, v1: undef, v1: undef en:rg** compr mrt0
p_logical_end
s_endpgm
Representation: Assembly (Final Assembly)
BB0:
s_wqm_b64 exec, exec ; befe0a7e
s_mov_b32 s0, s3 ; be800303
s_movk_i32 s3, 0x8000 ; b0038000
s_clause 0x1 ; bfa10001
s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000
s_load_dwordx4 s[4:7], s[2:3], 0x40 ; f4080101 fa000040
s_mov_b32 m0, s0 ; befc0300
v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100
v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101
v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000
v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001
s_waitcnt lgkmcnt(0) ; bf8cc07f
image_sample v0, [v0, v2], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00220000 00000002
s_waitcnt vmcnt(0) ; bf8c3f70
v_readfirstlane_b32 s0, v0 ; 7e000500
v_cmp_neq_f32_e32 vcc, s0, v0 ; 7c1a0000
s_and_b64 s[0:1], vcc, exec ; 87807e6a
s_cselect_b64 s[0:1], -1, 0 ; 858080c1
s_not_b64 s[0:1], s[0:1] ; be800800
s_mul_i32 s0, 1.0, src_scc ; 9300fdf2
v_add_f32_e32 v0, s0, v0 ; 06000000
v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100
exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000
s_endpgm ; bf810000
Driver pipeline hash (Driver pipeline hash used by RGP): 13511583485773036569
SGPRs (Number of SGPR registers allocated per subgroup): 128
VGPRs (Number of VGPR registers allocated per subgroup): 8
Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0
Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0
Code size (Code size in bytes): 120
LDS size (LDS size in bytes per workgroup): 0
Scratch size (Private memory in bytes per subgroup): 0
Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32
Hash (CRC32 hash of code and constant data): 2542461091
Instructions (Instruction count): 24
Copies (Copy instructions created for pseudo-instructions): 3
Branches (Branch instructions): 0
Latency (Issue cycles plus stall cycles): 411
Inverse Throughput (Estimated busy cycles to execute one wave): 26
VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 1
SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 1
Pre-Sched SGPRs (SGPR usage before scheduling): 13
Pre-Sched VGPRs (VGPR usage before scheduling): 3
== Without helper lanes ==
Representation: NIR Shader(s) (The optimized NIR shader(s))
shader: MESA_SHADER_FRAGMENT
source_sha1: {0xf4e791ce, 0x809f4864, 0xda6780d1, 0xf4febaa1, 0x1a634f83}
stage: 4
next_stage: 0
num_textures: 2
inputs_read: 32
outputs_written: 4
system_values_read: 0x00000000'00000000'08000000
subgroup_size: 0
uses_wide_subgroup_intrinsics: true
divergence_analysis_run: true
bit_sizes_float: 0x20
bit_sizes_int: 0x21
separate_shader: true
needs_quad_helper_invocations: true
needs_all_helper_invocations: true
origin_upper_left: true
inputs: 1
outputs: 0
uniforms: 0
decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0)
decl_var uniform INTERP_MODE_NONE restrict sampler @1 (~0, 0, 1)
decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD@2 (VARYING_SLOT_VAR0.y, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0)
vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0)
vec1 32 con ssa_2 = load_const (0xffff8000 = -nan)
vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2
vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000)
vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0)
vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000)
vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0)
vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec2 32 div ssa_10 = vec2 ssa_9, ssa_8
vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler)
vec1 1 div ssa_12 = intrinsic load_helper_invocation () ()
/* succs: block_1 block_2 */
if ssa_12 {
block block_1:
/* preds: block_0 */
vec1 1 con ssa_13 = undefined
/* succs: block_3 */
} else {
block block_2:
/* preds: block_0 */
vec1 32 div ssa_14 = mov ssa_11.x
vec1 32 con ssa_15 = intrinsic read_first_invocation (ssa_14) ()
vec1 1 div ssa_16 = feq ssa_15, ssa_11.x
vec1 1 con ssa_17 = intrinsic vote_all (ssa_16) ()
/* succs: block_3 */
}
block block_3:
/* preds: block_1 block_2 */
vec1 1 con ssa_18 = phi block_1: ssa_13, block_2: ssa_17
vec1 32 con ssa_19 = b2f32 ssa_18
vec1 32 div ssa_20 = fadd ssa_19, ssa_11.x
vec1 32 con ssa_21 = undefined
vec1 32 div ssa_22 = pack_half_2x16_split ssa_20, ssa_21
vec1 32 con ssa_23 = undefined
vec4 32 div ssa_24 = vec4 ssa_22, ssa_23, ssa_23, ssa_23
intrinsic export_amd (ssa_24) (base=0, wrmask=xy, flags=7)
/* succs: block_4 */
block block_4:
}
Representation: ACO IR (The ACO IR after some optimizations)
After Spilling:
ACO shader stage: fragment_fs
BB0
/* logical preds: / linear preds: / kind: top-level, branch, needs_lowering, */
s2: %26:s[0-1], s1: %27:s[2], s1: %28:s[3], s1: %29:s[4], v2: %30:v[0-1] = p_startpgm
s2: (kill)%32, s1: (kill)%31:scc = p_init_scratch (kill)%26, (latekill)(kill)%29
s2: %70 = p_parallelcopy %0:exec
s2: %0:exec, s1: (kill)%71:scc = s_wqm_b64 %70
p_logical_start
v1: %33, v1: %34 = p_split_vector (kill)%30
s2: %4 = p_create_vector (kill)%27, 0xffff8000
s8: %6 = s_load_dwordx8 %4, 0
s4: %8 = s_load_dwordx4 (kill)%4, 64
v1: %49 = v_interp_p1_f32 %33, %28:m0 attr0.y
v1: %9 = v_interp_p2_f32 %34, %28:m0, (kill)%49 attr0.y
v1: %50 = v_interp_p1_f32 (kill)%33, %28:m0 attr0.x
v1: %10 = v_interp_p2_f32 (kill)%34, (kill)%28:m0, (kill)%50 attr0.x
v1: %52 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d
v1: %51 = p_wqm (kill)%52
s2: %13, s1: (kill)%72:scc = s_andn2_b64 %0:exec, %70
p_logical_end
s2: %74, s1: (kill)%73:scc, s2: %0:exec = s_and_saveexec_b64 (kill)%13, %0:exec
s2: (kill)%75 = p_cbranch_z %0:exec BB2, BB1
BB1
/* logical preds: BB0, / linear preds: BB0, / kind: uniform, */
p_logical_start
p_logical_end
s2: (kill)%54 = p_branch BB3
BB2
/* logical preds: / linear preds: BB0, / kind: uniform, */
s2: (kill)%55 = p_branch BB3
BB3
/* logical preds: / linear preds: BB1, BB2, / kind: invert, */
s2: %0:exec, s1: (kill)%76:scc = s_andn2_b64 (kill)%74, %0:exec
s2: (kill)%77 = p_cbranch_z %0:exec BB5, BB4
BB4
/* logical preds: BB0, / linear preds: BB3, / kind: uniform, */
p_logical_start
s1: %57 = v_readfirstlane_b32 %51
s1: %16 = p_wqm (kill)%57
s2: %59 = v_cmp_neq_f32 (kill)%16, %51
s2: (kill)%61, s1: %60:scc = s_and_b64 (kill)%59, %0:exec
s1: %62:scc = p_wqm (kill)%60
s2: %63 = s_cselect_b64 -1, 0, (kill)%62:scc
s2: %18, s1: (kill)%64:scc = s_not_b64 (kill)%63
p_logical_end
s2: (kill)%65 = p_branch BB6
BB5
/* logical preds: / linear preds: BB3, / kind: uniform, */
s2: (kill)%66 = p_branch BB6
BB6
/* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, export_end, */
s2: %19 = p_linear_phi (kill)%18, s2: undef
s2: %0:exec = p_parallelcopy (kill)%70
p_logical_start
s2: (kill)%68, s1: %67:scc = s_and_b64 (kill)%19, %0:exec
s1: %20 = s_mul_i32 1.0, (kill)%67
v1: %21 = v_add_f32 (kill)%20, (kill)%51
v1: %23 = v_cvt_pkrtz_f16_f32 (kill)%21, 0
exp (kill)%23, v1: undef, v1: undef, v1: undef en:rg** compr mrt0
p_logical_end
s_endpgm
Representation: Assembly (Final Assembly)
BB0:
s_mov_b64 s[0:1], exec ; be80047e
s_wqm_b64 exec, s[0:1] ; befe0a00
s_mov_b32 s4, s3 ; be840303
s_movk_i32 s3, 0x8000 ; b0038000
s_clause 0x1 ; bfa10001
s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000
s_load_dwordx4 s[16:19], s[2:3], 0x40 ; f4080401 fa000040
s_mov_b32 m0, s4 ; befc0304
v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100
v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101
v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000
v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001
s_waitcnt lgkmcnt(0) ; bf8cc07f
image_sample v0, [v0, v2], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00820000 00000002
s_andn2_b64 s[2:3], exec, s[0:1] ; 8a82007e
s_and_saveexec_b64 s[2:3], s[2:3] ; be822402
BB3:
s_andn2_b64 exec, s[2:3], exec ; 8afe7e02
BB4:
s_waitcnt vmcnt(0) ; bf8c3f70
v_readfirstlane_b32 s2, v0 ; 7e040500
v_cmp_neq_f32_e32 vcc, s2, v0 ; 7c1a0002
s_and_b64 s[2:3], vcc, exec ; 87827e6a
s_cselect_b64 s[2:3], -1, 0 ; 858280c1
s_not_b64 s[2:3], s[2:3] ; be820802
BB6:
s_mov_b64 exec, s[0:1] ; befe0400
s_and_b64 s[0:1], s[2:3], exec ; 87807e02
s_mul_i32 s0, 1.0, src_scc ; 9300fdf2
s_waitcnt vmcnt(0) ; bf8c3f70
v_add_f32_e32 v0, s0, v0 ; 06000000
v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100
exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000
s_endpgm ; bf810000
Driver pipeline hash (Driver pipeline hash used by RGP): 8436903613772059513
SGPRs (Number of SGPR registers allocated per subgroup): 128
VGPRs (Number of VGPR registers allocated per subgroup): 8
Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0
Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0
Code size (Code size in bytes): 148
LDS size (LDS size in bytes per workgroup): 0
Scratch size (Private memory in bytes per subgroup): 0
Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32
Hash (CRC32 hash of code and constant data): 989438680
Instructions (Instruction count): 31
Copies (Copy instructions created for pseudo-instructions): 5
Branches (Branch instructions): 0
Latency (Issue cycles plus stall cycles): 643
Inverse Throughput (Estimated busy cycles to execute one wave): 40
VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 1
SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 1
Pre-Sched SGPRs (SGPR usage before scheduling): 15
Pre-Sched VGPRs (VGPR usage before scheduling): 3
Building WaveActiveCountBits
== With helper lanes ==
Representation: NIR Shader(s) (The optimized NIR shader(s))
shader: MESA_SHADER_FRAGMENT
source_sha1: {0xce583982, 0xdf4de895, 0x64f1e9e3, 0xafc370e1, 0x3389e093}
stage: 4
next_stage: 0
num_textures: 2
inputs_read: 32
outputs_written: 4
subgroup_size: 0
uses_wide_subgroup_intrinsics: true
divergence_analysis_run: true
bit_sizes_float: 0x20
bit_sizes_int: 0x61
separate_shader: true
needs_quad_helper_invocations: true
needs_all_helper_invocations: true
origin_upper_left: true
inputs: 1
outputs: 0
uniforms: 0
decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0)
decl_var uniform INTERP_MODE_NONE restrict sampler @1 (~0, 0, 1)
decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD@2 (VARYING_SLOT_VAR0.y, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0)
vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0)
vec1 32 con ssa_2 = load_const (0xffff8000 = -nan)
vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2
vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000)
vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0)
vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000)
vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0)
vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec2 32 div ssa_10 = vec2 ssa_9, ssa_8
vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler)
vec1 1 div ssa_12 = fneu! ssa_11.x, ssa_4
vec1 64 con ssa_13 = intrinsic ballot (ssa_12) ()
vec2 32 con ssa_14 = unpack_64_2x32 ssa_13
vec1 64 con ssa_15 = pack_64_2x32_split ssa_14.x, ssa_14.y
vec1 32 con ssa_16 = bit_count ssa_15
vec1 32 con ssa_17 = u2f32 ssa_16
vec1 32 div ssa_18 = fadd ssa_17, ssa_11.x
vec1 32 con ssa_19 = undefined
vec1 32 div ssa_20 = pack_half_2x16_split ssa_18, ssa_19
vec1 32 con ssa_21 = undefined
vec4 32 div ssa_22 = vec4 ssa_20, ssa_21, ssa_21, ssa_21
intrinsic export_amd (ssa_22) (base=0, wrmask=xy, flags=7)
/* succs: block_1 */
block block_1:
}
Representation: ACO IR (The ACO IR after some optimizations)
After Spilling:
ACO shader stage: fragment_fs
BB0
/* logical preds: / linear preds: / kind: uniform, top-level, export_end, */
s2: %24:s[0-1], s1: %25:s[2], s1: %26:s[3], s1: %27:s[4], v2: %28:v[0-1] = p_startpgm
s2: (kill)%30, s1: (kill)%29:scc = p_init_scratch (kill)%24, (latekill)(kill)%27
s2: %0:exec, s1: (kill)%57:scc = s_wqm_b64 %0:exec
p_logical_start
v1: %31, v1: %32 = p_split_vector (kill)%28
s2: %4 = p_create_vector (kill)%25, 0xffff8000
s8: %6 = s_load_dwordx8 %4, 0
s4: %8 = s_load_dwordx4 (kill)%4, 64
v1: %47 = v_interp_p1_f32 %31, %26:m0 attr0.y
v1: %9 = v_interp_p2_f32 %32, %26:m0, (kill)%47 attr0.y
v1: %48 = v_interp_p1_f32 (kill)%31, %26:m0 attr0.x
v1: %10 = v_interp_p2_f32 (kill)%32, (kill)%26:m0, (kill)%48 attr0.x
v1: %50 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d
v1: %49 = p_wqm (kill)%50
s2: %52 = v_cmp_neq_f32 0, %49
s2: %14 = p_wqm (kill)%52
s1: %17, s1: (kill)%55:scc = s_bcnt1_i32_b64 (kill)%14
v1: %18 = v_cvt_f32_u32 (kill)%17
v1: %19 = v_add_f32 (kill)%18, (kill)%49
v1: %21 = v_cvt_pkrtz_f16_f32 (kill)%19, 0
exp (kill)%21, v1: undef, v1: undef, v1: undef en:rg** compr mrt0
p_logical_end
s_endpgm
Representation: Assembly (Final Assembly)
BB0:
s_wqm_b64 exec, exec ; befe0a7e
s_mov_b32 s0, s3 ; be800303
s_movk_i32 s3, 0x8000 ; b0038000
s_clause 0x1 ; bfa10001
s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000
s_load_dwordx4 s[4:7], s[2:3], 0x40 ; f4080101 fa000040
s_mov_b32 m0, s0 ; befc0300
v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100
v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101
v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000
v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001
s_waitcnt lgkmcnt(0) ; bf8cc07f
image_sample v0, [v0, v2], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00220000 00000002
s_waitcnt vmcnt(0) ; bf8c3f70
v_cmp_neq_f32_e32 vcc, 0, v0 ; 7c1a0080
s_bcnt1_i32_b64 s0, vcc ; be80106a
v_cvt_f32_u32_e32 v1, s0 ; 7e020c00
v_add_f32_e32 v0, v1, v0 ; 06000101
v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100
exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000
s_endpgm ; bf810000
Driver pipeline hash (Driver pipeline hash used by RGP): 18019751637580729524
SGPRs (Number of SGPR registers allocated per subgroup): 128
VGPRs (Number of VGPR registers allocated per subgroup): 8
Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0
Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0
Code size (Code size in bytes): 108
LDS size (LDS size in bytes per workgroup): 0
Scratch size (Private memory in bytes per subgroup): 0
Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32
Hash (CRC32 hash of code and constant data): 703231431
Instructions (Instruction count): 21
Copies (Copy instructions created for pseudo-instructions): 3
Branches (Branch instructions): 0
Latency (Issue cycles plus stall cycles): 405
Inverse Throughput (Estimated busy cycles to execute one wave): 25
VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 1
SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 1
Pre-Sched SGPRs (SGPR usage before scheduling): 13
Pre-Sched VGPRs (VGPR usage before scheduling): 3
== Without helper lanes ==
Representation: NIR Shader(s) (The optimized NIR shader(s))
shader: MESA_SHADER_FRAGMENT
source_sha1: {0x3e248076, 0x7c0b38d3, 0x7724c093, 0xba94b689, 0x075cef04}
stage: 4
next_stage: 0
num_textures: 2
inputs_read: 32
outputs_written: 4
system_values_read: 0x00000000'00000000'08000000
subgroup_size: 0
uses_wide_subgroup_intrinsics: true
divergence_analysis_run: true
bit_sizes_float: 0x20
bit_sizes_int: 0x61
separate_shader: true
needs_quad_helper_invocations: true
needs_all_helper_invocations: true
origin_upper_left: true
inputs: 1
outputs: 0
uniforms: 0
decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0)
decl_var uniform INTERP_MODE_NONE restrict sampler @1 (~0, 0, 1)
decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD@2 (VARYING_SLOT_VAR0.y, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0)
vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0)
vec1 32 con ssa_2 = load_const (0xffff8000 = -nan)
vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2
vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000)
vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0)
vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000)
vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0)
vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec2 32 div ssa_10 = vec2 ssa_9, ssa_8
vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler)
vec1 1 div ssa_12 = intrinsic load_helper_invocation () ()
vec1 1 div ssa_13 = inot ssa_12
vec1 1 div ssa_14 = fneu! ssa_11.x, ssa_4
vec1 1 div ssa_15 = iand ssa_14, ssa_13
vec1 64 con ssa_16 = intrinsic ballot (ssa_15) ()
vec2 32 con ssa_17 = unpack_64_2x32 ssa_16
vec1 64 con ssa_18 = pack_64_2x32_split ssa_17.x, ssa_17.y
vec1 32 con ssa_19 = bit_count ssa_18
vec1 32 con ssa_20 = u2f32 ssa_19
vec1 32 div ssa_21 = fadd ssa_20, ssa_11.x
vec1 32 con ssa_22 = undefined
vec1 32 div ssa_23 = pack_half_2x16_split ssa_21, ssa_22
vec1 32 con ssa_24 = undefined
vec4 32 div ssa_25 = vec4 ssa_23, ssa_24, ssa_24, ssa_24
intrinsic export_amd (ssa_25) (base=0, wrmask=xy, flags=7)
/* succs: block_1 */
block block_1:
}
Representation: ACO IR (The ACO IR after some optimizations)
After Spilling:
ACO shader stage: fragment_fs
BB0
/* logical preds: / linear preds: / kind: uniform, top-level, needs_lowering, export_end, */
s2: %27:s[0-1], s1: %28:s[2], s1: %29:s[3], s1: %30:s[4], v2: %31:v[0-1] = p_startpgm
s2: (kill)%33, s1: (kill)%32:scc = p_init_scratch (kill)%27, (latekill)(kill)%30
s2: %62 = p_parallelcopy %0:exec
s2: %0:exec, s1: (kill)%63:scc = s_wqm_b64 %62
p_logical_start
v1: %34, v1: %35 = p_split_vector (kill)%31
s2: %4 = p_create_vector (kill)%28, 0xffff8000
s8: %6 = s_load_dwordx8 %4, 0
s4: %8 = s_load_dwordx4 (kill)%4, 64
v1: %50 = v_interp_p1_f32 %34, %29:m0 attr0.y
v1: %9 = v_interp_p2_f32 %35, %29:m0, (kill)%50 attr0.y
v1: %51 = v_interp_p1_f32 (kill)%34, %29:m0 attr0.x
v1: %10 = v_interp_p2_f32 (kill)%35, (kill)%29:m0, (kill)%51 attr0.x
v1: %53 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d
v1: %52 = p_wqm (kill)%53
s2: %13, s1: (kill)%64:scc = s_andn2_b64 %0:exec, %62
s2: %15 = v_cmp_neq_f32 0, %52
s2: %16, s1: (kill)%55:scc = s_andn2_b64 (kill)%15, (kill)%13
s2: %57, s1: (kill)%56:scc = s_and_b64 (kill)%16, %0:exec
s2: %17 = p_wqm (kill)%57
s1: %20, s1: (kill)%60:scc = s_bcnt1_i32_b64 (kill)%17
v1: %21 = v_cvt_f32_u32 (kill)%20
v1: %22 = v_add_f32 (kill)%21, (kill)%52
v1: %24 = v_cvt_pkrtz_f16_f32 (kill)%22, 0
s2: %0:exec = p_parallelcopy (kill)%62
exp (kill)%24, v1: undef, v1: undef, v1: undef en:rg** compr mrt0
p_logical_end
s_endpgm
Representation: Assembly (Final Assembly)
BB0:
s_mov_b64 s[0:1], exec ; be80047e
s_wqm_b64 exec, s[0:1] ; befe0a00
s_mov_b32 s4, s3 ; be840303
s_movk_i32 s3, 0x8000 ; b0038000
s_clause 0x1 ; bfa10001
s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000
s_load_dwordx4 s[16:19], s[2:3], 0x40 ; f4080401 fa000040
s_mov_b32 m0, s4 ; befc0304
v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100
v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101
v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000
v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001
s_waitcnt lgkmcnt(0) ; bf8cc07f
image_sample v0, [v0, v2], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00820000 00000002
s_andn2_b64 s[2:3], exec, s[0:1] ; 8a82007e
s_waitcnt vmcnt(0) ; bf8c3f70
v_cmp_neq_f32_e32 vcc, 0, v0 ; 7c1a0080
s_andn2_b64 s[2:3], vcc, s[2:3] ; 8a82026a
s_and_b64 s[2:3], s[2:3], exec ; 87827e02
s_bcnt1_i32_b64 s2, s[2:3] ; be821002
v_cvt_f32_u32_e32 v1, s2 ; 7e020c02
v_add_f32_e32 v0, v1, v0 ; 06000101
v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100
s_mov_b64 exec, s[0:1] ; befe0400
exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000
s_endpgm ; bf810000
Driver pipeline hash (Driver pipeline hash used by RGP): 6386560257104097910
SGPRs (Number of SGPR registers allocated per subgroup): 128
VGPRs (Number of VGPR registers allocated per subgroup): 8
Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0
Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0
Code size (Code size in bytes): 128
LDS size (LDS size in bytes per workgroup): 0
Scratch size (Private memory in bytes per subgroup): 0
Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32
Hash (CRC32 hash of code and constant data): 4280024007
Instructions (Instruction count): 26
Copies (Copy instructions created for pseudo-instructions): 5
Branches (Branch instructions): 0
Latency (Issue cycles plus stall cycles): 411
Inverse Throughput (Estimated busy cycles to execute one wave): 26
VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 1
SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 1
Pre-Sched SGPRs (SGPR usage before scheduling): 15
Pre-Sched VGPRs (VGPR usage before scheduling): 3
Building WaveActiveMin
== With helper lanes ==
Representation: NIR Shader(s) (The optimized NIR shader(s))
shader: MESA_SHADER_FRAGMENT
source_sha1: {0xf1bea900, 0xc5951ac5, 0x5e86f07e, 0x460754e6, 0xef2be9aa}
stage: 4
next_stage: 0
num_textures: 2
inputs_read: 32
outputs_written: 4
subgroup_size: 0
uses_wide_subgroup_intrinsics: true
divergence_analysis_run: true
bit_sizes_float: 0x20
bit_sizes_int: 0x20
separate_shader: true
needs_quad_helper_invocations: true
needs_all_helper_invocations: true
origin_upper_left: true
inputs: 1
outputs: 0
uniforms: 0
decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0)
decl_var uniform INTERP_MODE_NONE restrict sampler @1 (~0, 0, 1)
decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD@2 (VARYING_SLOT_VAR0.y, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0)
vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0)
vec1 32 con ssa_2 = load_const (0xffff8000 = -nan)
vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2
vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000)
vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0)
vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000)
vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0)
vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec2 32 div ssa_10 = vec2 ssa_9, ssa_8
vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler)
vec1 32 div ssa_12 = mov ssa_11.x
vec1 32 con ssa_13 = intrinsic reduce (ssa_12) (reduction_op=fmin, cluster_size=0)
vec1 32 div ssa_14 = fadd ssa_11.x, ssa_13
vec1 32 con ssa_15 = undefined
vec1 32 div ssa_16 = pack_half_2x16_split ssa_14, ssa_15
vec1 32 con ssa_17 = undefined
vec4 32 div ssa_18 = vec4 ssa_16, ssa_17, ssa_17, ssa_17
intrinsic export_amd (ssa_18) (base=0, wrmask=xy, flags=7)
/* succs: block_1 */
block block_1:
}
Representation: ACO IR (The ACO IR after some optimizations)
After Spilling:
ACO shader stage: fragment_fs
BB0
/* logical preds: / linear preds: / kind: uniform, top-level, export_end, */
s2: %20:s[0-1], s1: %21:s[2], s1: %22:s[3], s1: %23:s[4], v2: %24:v[0-1] = p_startpgm
s2: (kill)%26, s1: (kill)%25:scc = p_init_scratch (kill)%20, (latekill)(kill)%23
s2: %0:exec, s1: (kill)%53:scc = s_wqm_b64 %0:exec
p_logical_start
v1: %27, v1: %28 = p_split_vector (kill)%24
s2: %4 = p_create_vector (kill)%21, 0xffff8000
s8: %6 = s_load_dwordx8 %4, 0
s4: %8 = s_load_dwordx4 (kill)%4, 64
v1: %43 = v_interp_p1_f32 %27, %22:m0 attr0.y
v1: %9 = v_interp_p2_f32 %28, %22:m0, (kill)%43 attr0.y
v1: %44 = v_interp_p1_f32 (kill)%27, %22:m0 attr0.x
v1: %10 = v_interp_p2_f32 (kill)%28, (kill)%22:m0, (kill)%44 attr0.x
v1: %46 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d
v1: %45 = p_wqm (kill)%46
lv1: %51 = p_start_linear_vgpr
lv1: %52 = p_start_linear_vgpr
s1: %47, s2: (kill)%48, s1: (kill)%49:scc = p_reduce %45, (kill)%51, (kill)%52 op:fmin32 cluster_size:64
s1: %14 = p_wqm (kill)%47
v1: %15 = v_add_f32 (kill)%14, (kill)%45
v1: %17 = v_cvt_pkrtz_f16_f32 (kill)%15, 0
exp (kill)%17, v1: undef, v1: undef, v1: undef en:rg** compr mrt0
p_logical_end
s_endpgm
Representation: Assembly (Final Assembly)
BB0:
s_wqm_b64 exec, exec ; befe0a7e
s_mov_b32 s0, s3 ; be800303
s_movk_i32 s3, 0x8000 ; b0038000
s_clause 0x1 ; bfa10001
s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000
s_load_dwordx4 s[4:7], s[2:3], 0x40 ; f4080101 fa000040
s_mov_b32 m0, s0 ; befc0300
v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100
v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101
v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000
v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001
s_waitcnt lgkmcnt(0) ; bf8cc07f
image_sample v0, [v0, v2], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00220000 00000002
s_or_saveexec_b64 s[2:3], -1 ; be8225c1
v_mov_b32_e32 v1, 0x7f800000 ; 7e0202ff 7f800000
s_waitcnt vmcnt(0) ; bf8c3f70
v_cndmask_b32_e64 v1, v1, v0, s[2:3] ; d5010001 000a0101
v_min_f32_dpp v1, v1, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf fi:1 ; 1e0202fa ff04b101
v_min_f32_dpp v1, v1, v1 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xf fi:1 ; 1e0202fa ff044e01
v_min_f32_dpp v1, v1, v1 row_half_mirror row_mask:0xf bank_mask:0xf fi:1 ; 1e0202fa ff054101
v_min_f32_dpp v1, v1, v1 row_mirror row_mask:0xf bank_mask:0xf fi:1 ; 1e0202fa ff054001
v_permlanex16_b32 v2, v1, 0, 0 ; d7780002 02010101
v_min_f32_e32 v1, v1, v2 ; 1e020501
v_readlane_b32 s0, v1, 0 ; d7600000 00010101
v_min_f32_e32 v1, s0, v1 ; 1e020200
s_mov_b64 exec, s[2:3] ; befe0402
v_readlane_b32 s0, v1, 63 ; d7600000 00017f01
v_add_f32_e32 v0, s0, v0 ; 06000000
v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100
exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000
s_endpgm ; bf810000
Driver pipeline hash (Driver pipeline hash used by RGP): 17299820867343601478
SGPRs (Number of SGPR registers allocated per subgroup): 128
VGPRs (Number of VGPR registers allocated per subgroup): 8
Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0
Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0
Code size (Code size in bytes): 184
LDS size (LDS size in bytes per workgroup): 0
Scratch size (Private memory in bytes per subgroup): 0
Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32
Hash (CRC32 hash of code and constant data): 1489409235
Instructions (Instruction count): 31
Copies (Copy instructions created for pseudo-instructions): 3
Branches (Branch instructions): 0
Latency (Issue cycles plus stall cycles): 451
Inverse Throughput (Estimated busy cycles to execute one wave): 34
VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 1
SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 1
Pre-Sched SGPRs (SGPR usage before scheduling): 13
Pre-Sched VGPRs (VGPR usage before scheduling): 3
== Without helper lanes ==
Representation: NIR Shader(s) (The optimized NIR shader(s))
shader: MESA_SHADER_FRAGMENT
source_sha1: {0x1ab71855, 0x6f50287d, 0xabdc20b6, 0x81c20f8c, 0xa55f34d4}
stage: 4
next_stage: 0
num_textures: 2
inputs_read: 32
outputs_written: 4
system_values_read: 0x00000000'00000000'08000000
subgroup_size: 0
uses_wide_subgroup_intrinsics: true
divergence_analysis_run: true
bit_sizes_float: 0x20
bit_sizes_int: 0x21
separate_shader: true
needs_quad_helper_invocations: true
needs_all_helper_invocations: true
origin_upper_left: true
inputs: 1
outputs: 0
uniforms: 0
decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0)
decl_var uniform INTERP_MODE_NONE restrict sampler @1 (~0, 0, 1)
decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD@2 (VARYING_SLOT_VAR0.y, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0)
vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0)
vec1 32 con ssa_2 = load_const (0xffff8000 = -nan)
vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2
vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000)
vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0)
vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000)
vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0)
vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec2 32 div ssa_10 = vec2 ssa_9, ssa_8
vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler)
vec1 1 div ssa_12 = intrinsic load_helper_invocation () ()
vec1 32 con ssa_13 = load_const (0x7f800000 = inf)
vec1 32 div ssa_14 = bcsel ssa_12, ssa_13, ssa_11.x
vec1 32 con ssa_15 = intrinsic reduce (ssa_14) (reduction_op=fmin, cluster_size=0)
vec1 32 div ssa_16 = fadd ssa_11.x, ssa_15
vec1 32 con ssa_17 = undefined
vec1 32 div ssa_18 = pack_half_2x16_split ssa_16, ssa_17
vec1 32 con ssa_19 = undefined
vec4 32 div ssa_20 = vec4 ssa_18, ssa_19, ssa_19, ssa_19
intrinsic export_amd (ssa_20) (base=0, wrmask=xy, flags=7)
/* succs: block_1 */
block block_1:
}
Representation: ACO IR (The ACO IR after some optimizations)
After Spilling:
ACO shader stage: fragment_fs
BB0
/* logical preds: / linear preds: / kind: uniform, top-level, needs_lowering, export_end, */
s2: %22:s[0-1], s1: %23:s[2], s1: %24:s[3], s1: %25:s[4], v2: %26:v[0-1] = p_startpgm
s2: (kill)%28, s1: (kill)%27:scc = p_init_scratch (kill)%22, (latekill)(kill)%25
s2: %56 = p_parallelcopy %0:exec
s2: %0:exec, s1: (kill)%57:scc = s_wqm_b64 %56
p_logical_start
v1: %29, v1: %30 = p_split_vector (kill)%26
s2: %4 = p_create_vector (kill)%23, 0xffff8000
s8: %6 = s_load_dwordx8 %4, 0
s4: %8 = s_load_dwordx4 (kill)%4, 64
v1: %45 = v_interp_p1_f32 %29, %24:m0 attr0.y
v1: %9 = v_interp_p2_f32 %30, %24:m0, (kill)%45 attr0.y
v1: %46 = v_interp_p1_f32 (kill)%29, %24:m0 attr0.x
v1: %10 = v_interp_p2_f32 (kill)%30, (kill)%24:m0, (kill)%46 attr0.x
v1: %48 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d
v1: %47 = p_wqm (kill)%48
s2: %13, s1: (kill)%58:scc = s_andn2_b64 %0:exec, %56
v1: %15 = v_cndmask_b32 %47, 0x7f800000, (kill)%13
lv1: %54 = p_start_linear_vgpr
lv1: %55 = p_start_linear_vgpr
s1: %50, s2: (kill)%51, s1: (kill)%52:scc = p_reduce (kill)%15, (kill)%54, (kill)%55 op:fmin32 cluster_size:64
s1: %16 = p_wqm (kill)%50
v1: %17 = v_add_f32 (kill)%16, (kill)%47
v1: %19 = v_cvt_pkrtz_f16_f32 (kill)%17, 0
s2: %0:exec = p_parallelcopy (kill)%56
exp (kill)%19, v1: undef, v1: undef, v1: undef en:rg** compr mrt0
p_logical_end
s_endpgm
Representation: Assembly (Final Assembly)
BB0:
s_mov_b64 s[0:1], exec ; be80047e
s_wqm_b64 exec, s[0:1] ; befe0a00
s_mov_b32 s4, s3 ; be840303
s_movk_i32 s3, 0x8000 ; b0038000
s_clause 0x1 ; bfa10001
s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000
s_load_dwordx4 s[16:19], s[2:3], 0x40 ; f4080401 fa000040
s_mov_b32 m0, s4 ; befc0304
v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100
v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101
v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000
v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001
s_waitcnt lgkmcnt(0) ; bf8cc07f
image_sample v0, [v0, v2], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00820000 00000002
s_andn2_b64 s[2:3], exec, s[0:1] ; 8a82007e
s_waitcnt vmcnt(0) ; bf8c3f70
v_cndmask_b32_e64 v3, v0, 0x7f800000, s[2:3] ; d5010003 0009ff00 7f800000
s_or_saveexec_b64 s[4:5], -1 ; be8425c1
v_mov_b32_e32 v1, 0x7f800000 ; 7e0202ff 7f800000
v_cndmask_b32_e64 v1, v1, v3, s[4:5] ; d5010001 00120701
v_min_f32_dpp v1, v1, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf fi:1 ; 1e0202fa ff04b101
v_min_f32_dpp v1, v1, v1 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xf fi:1 ; 1e0202fa ff044e01
v_min_f32_dpp v1, v1, v1 row_half_mirror row_mask:0xf bank_mask:0xf fi:1 ; 1e0202fa ff054101
v_min_f32_dpp v1, v1, v1 row_mirror row_mask:0xf bank_mask:0xf fi:1 ; 1e0202fa ff054001
v_permlanex16_b32 v2, v1, 0, 0 ; d7780002 02010101
v_min_f32_e32 v1, v1, v2 ; 1e020501
v_readlane_b32 s2, v1, 0 ; d7600002 00010101
v_min_f32_e32 v1, s2, v1 ; 1e020202
s_mov_b64 exec, s[4:5] ; befe0404
v_readlane_b32 s2, v1, 63 ; d7600002 00017f01
v_add_f32_e32 v0, s2, v0 ; 06000002
v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100
s_mov_b64 exec, s[0:1] ; befe0400
exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000
s_endpgm ; bf810000
Driver pipeline hash (Driver pipeline hash used by RGP): 14036544613523326444
SGPRs (Number of SGPR registers allocated per subgroup): 128
VGPRs (Number of VGPR registers allocated per subgroup): 8
Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0
Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0
Code size (Code size in bytes): 208
LDS size (LDS size in bytes per workgroup): 0
Scratch size (Private memory in bytes per subgroup): 0
Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32
Hash (CRC32 hash of code and constant data): 1465341766
Instructions (Instruction count): 35
Copies (Copy instructions created for pseudo-instructions): 5
Branches (Branch instructions): 0
Latency (Issue cycles plus stall cycles): 462
Inverse Throughput (Estimated busy cycles to execute one wave): 36
VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 1
SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 1
Pre-Sched SGPRs (SGPR usage before scheduling): 15
Pre-Sched VGPRs (VGPR usage before scheduling): 4
Building WaveIsFirstLane
== With helper lanes ==
Representation: NIR Shader(s) (The optimized NIR shader(s))
shader: MESA_SHADER_FRAGMENT
source_sha1: {0xb38253d4, 0x2a6b4312, 0xb4b4a853, 0xac7cbb67, 0xc10fe263}
stage: 4
next_stage: 0
num_textures: 2
num_images: 1
inputs_read: 32
outputs_written: 4
subgroup_size: 0
uses_wide_subgroup_intrinsics: true
divergence_analysis_run: true
bit_sizes_int: 0x20
separate_shader: true
writes_memory: true
needs_quad_helper_invocations: true
needs_all_helper_invocations: true
origin_upper_left: true
inputs: 1
outputs: 0
uniforms: 0
decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0)
decl_var image INTERP_MODE_NONE restrict writeonly r32_uint uimageBuffer @1 (~0, 0, 2)
decl_var uniform INTERP_MODE_NONE restrict sampler @2 (~0, 0, 1)
decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD@3 (VARYING_SLOT_VAR0.y, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0)
vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0)
vec1 32 con ssa_2 = load_const (0xffff8000 = -nan)
vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2
vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000)
vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0)
vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000)
vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0)
vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec2 32 div ssa_10 = vec2 ssa_9, ssa_8
vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler)
vec1 1 div ssa_12 = intrinsic elect () ()
/* succs: block_1 block_2 */
if ssa_12 {
block block_1:
/* preds: block_0 */
vec1 32 con ssa_13 = load_const (0x00000050 = 0.000000)
vec4 32 con ssa_14 = intrinsic load_smem_amd (ssa_3, ssa_13) (align_mul=16, align_offset=0)
vec1 32 con ssa_15 = undefined
vec4 32 con ssa_16 = vec4 ssa_4, ssa_15, ssa_15, ssa_15
vec1 32 con ssa_17 = undefined
vec1 32 con ssa_18 = load_const (0x000003e8 = 0.000000)
intrinsic bindless_image_store (ssa_14, ssa_16, ssa_17, ssa_18, ssa_4) (image_dim=Buf, image_array=false, format=r32_uint, access=10, src_type=uint32)
/* succs: block_3 */
} else {
block block_2:
/* preds: block_0 */
/* succs: block_3 */
}
block block_3:
/* preds: block_1 block_2 */
vec1 32 con ssa_19 = undefined
vec1 32 div ssa_20 = pack_half_2x16_split ssa_11.x, ssa_19
vec1 32 con ssa_21 = undefined
vec4 32 div ssa_22 = vec4 ssa_20, ssa_21, ssa_21, ssa_21
intrinsic export_amd (ssa_22) (base=0, wrmask=xy, flags=7)
/* succs: block_4 */
block block_4:
}
Representation: ACO IR (The ACO IR after some optimizations)
After Spilling:
ACO shader stage: fragment_fs
BB0
/* logical preds: / linear preds: / kind: top-level, branch, needs_lowering, */
s2: %24:s[0-1], s1: %25:s[2], s1: %26:s[3], s1: %27:s[4], v2: %28:v[0-1] = p_startpgm
s2: (kill)%30, s1: (kill)%29:scc = p_init_scratch (kill)%24, (latekill)(kill)%27
s2: %65 = p_parallelcopy %0:exec
s2: %0:exec, s1: (kill)%66:scc = s_wqm_b64 %65
p_logical_start
v1: %31, v1: %32 = p_split_vector (kill)%28
s2: %4 = p_create_vector (kill)%25, 0xffff8000
s8: %6 = s_load_dwordx8 %4, 0
s4: %8 = s_load_dwordx4 %4, 64
v1: %47 = v_interp_p1_f32 %31, %26:m0 attr0.y
v1: %9 = v_interp_p2_f32 %32, %26:m0, (kill)%47 attr0.y
v1: %48 = v_interp_p1_f32 (kill)%31, %26:m0 attr0.x
v1: %10 = v_interp_p2_f32 (kill)%32, (kill)%26:m0, (kill)%48 attr0.x
v1: %50 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d
v1: %49 = p_wqm (kill)%50
s1: %67 = s_ff1_i32_b64 %0:exec
s2: %51, s1: (kill)%68:scc = s_lshl_b64 1, (kill)%67
s2: %13 = p_wqm (kill)%51
p_logical_end
s2: %0:exec = p_parallelcopy (kill)%65
s2: %70, s1: (kill)%69:scc, s2: %0:exec = s_and_saveexec_b64 (kill)%13, %0:exec
s2: (kill)%71 = p_cbranch_z %0:exec BB2, BB1
BB1
/* logical preds: BB0, / linear preds: BB0, / kind: uniform, */
p_logical_start
s4: %15 = s_load_dwordx4 (kill)%4, 0x50
v1: %57 = p_parallelcopy 0x3e8
v1: %58 = p_parallelcopy 0
buffer_store_format_x (kill)%15, (kill)%58, 0, (kill)%57 idxen disable_wqm storage:image
p_logical_end
s2: (kill)%59 = p_branch BB3
BB2
/* logical preds: / linear preds: BB0, / kind: uniform, */
s2: (kill)%60 = p_branch BB3
BB3
/* logical preds: / linear preds: BB1, BB2, / kind: invert, */
s2: %0:exec, s1: (kill)%72:scc = s_andn2_b64 %70, %0:exec
s2: (kill)%73 = p_cbranch_z %0:exec BB5, BB4
BB4
/* logical preds: BB0, / linear preds: BB3, / kind: uniform, */
p_logical_start
p_logical_end
s2: (kill)%62 = p_branch BB6
BB5
/* logical preds: / linear preds: BB3, / kind: uniform, */
s2: (kill)%63 = p_branch BB6
BB6
/* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, export_end, */
s2: %0:exec = p_parallelcopy (kill)%70
p_logical_start
v1: %21 = v_cvt_pkrtz_f16_f32 (kill)%49, 0
exp (kill)%21, v1: undef, v1: undef, v1: undef en:rg** compr mrt0
p_logical_end
s_endpgm
Representation: Assembly (Final Assembly)
BB0:
s_mov_b64 s[0:1], exec ; be80047e
s_wqm_b64 exec, s[0:1] ; befe0a00
s_mov_b32 s4, s3 ; be840303
s_movk_i32 s3, 0x8000 ; b0038000
s_clause 0x1 ; bfa10001
s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000
s_load_dwordx4 s[16:19], s[2:3], 0x40 ; f4080401 fa000040
s_mov_b32 m0, s4 ; befc0304
v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100
v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101
v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000
v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001
s_waitcnt lgkmcnt(0) ; bf8cc07f
image_sample v0, [v0, v2], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00820000 00000002
s_ff1_i32_b64 s4, exec ; be84147e
s_lshl_b64 s[4:5], 1, s4 ; 8f840481
s_mov_b64 exec, s[0:1] ; befe0400
s_and_saveexec_b64 s[0:1], s[4:5] ; be802404
s_cbranch_execz BB6 ; bf880008
BB1:
s_load_dwordx4 s[4:7], s[2:3], 0x50 ; f4080101 fa000050
v_mov_b32_e32 v1, 0x3e8 ; 7e0202ff 000003e8
v_mov_b32_e32 v2, 0 ; 7e040280
s_waitcnt lgkmcnt(0) ; bf8cc07f
buffer_store_format_x v1, v2, s[4:7], 0 idxen ; e0102000 80010102
BB6:
s_mov_b64 exec, s[0:1] ; befe0400
s_waitcnt vmcnt(0) ; bf8c3f70
v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100
exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000
s_endpgm ; bf810000
Driver pipeline hash (Driver pipeline hash used by RGP): 10877422663015167528
SGPRs (Number of SGPR registers allocated per subgroup): 128
VGPRs (Number of VGPR registers allocated per subgroup): 8
Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0
Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0
Code size (Code size in bytes): 152
LDS size (LDS size in bytes per workgroup): 0
Scratch size (Private memory in bytes per subgroup): 0
Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32
Hash (CRC32 hash of code and constant data): 1449320740
Instructions (Instruction count): 29
Copies (Copy instructions created for pseudo-instructions): 8
Branches (Branch instructions): 1
Latency (Issue cycles plus stall cycles): 413
Inverse Throughput (Estimated busy cycles to execute one wave): 26
VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 2
SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 2
Pre-Sched SGPRs (SGPR usage before scheduling): 17
Pre-Sched VGPRs (VGPR usage before scheduling): 3
== Without helper lanes ==
Representation: NIR Shader(s) (The optimized NIR shader(s))
shader: MESA_SHADER_FRAGMENT
source_sha1: {0x3293d9f2, 0x7f34506e, 0x445f4c15, 0x918f9d3e, 0x589000fa}
stage: 4
next_stage: 0
num_textures: 2
num_images: 1
inputs_read: 32
outputs_written: 4
system_values_read: 0x00000000'00000000'08000000
subgroup_size: 0
uses_wide_subgroup_intrinsics: true
divergence_analysis_run: true
bit_sizes_int: 0x20
separate_shader: true
writes_memory: true
needs_quad_helper_invocations: true
needs_all_helper_invocations: true
origin_upper_left: true
inputs: 1
outputs: 0
uniforms: 0
decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0)
decl_var image INTERP_MODE_NONE restrict writeonly r32_uint uimageBuffer @1 (~0, 0, 2)
decl_var uniform INTERP_MODE_NONE restrict sampler @2 (~0, 0, 1)
decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD@3 (VARYING_SLOT_VAR0.y, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0)
vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0)
vec1 32 con ssa_2 = load_const (0xffff8000 = -nan)
vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2
vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000)
vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0)
vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000)
vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0)
vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec2 32 div ssa_10 = vec2 ssa_9, ssa_8
vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler)
vec1 1 div ssa_12 = intrinsic load_helper_invocation () ()
/* succs: block_1 block_2 */
if ssa_12 {
block block_1:
/* preds: block_0 */
vec1 1 con ssa_13 = load_const (false)
/* succs: block_3 */
} else {
block block_2:
/* preds: block_0 */
vec1 1 div ssa_14 = intrinsic elect () ()
/* succs: block_3 */
}
block block_3:
/* preds: block_1 block_2 */
vec1 1 div ssa_15 = phi block_1: ssa_13, block_2: ssa_14
/* succs: block_4 block_5 */
if ssa_15 {
block block_4:
/* preds: block_3 */
vec1 32 con ssa_16 = load_const (0x00000050 = 0.000000)
vec4 32 con ssa_17 = intrinsic load_smem_amd (ssa_3, ssa_16) (align_mul=16, align_offset=0)
vec1 32 con ssa_18 = undefined
vec4 32 con ssa_19 = vec4 ssa_4, ssa_18, ssa_18, ssa_18
vec1 32 con ssa_20 = undefined
vec1 32 con ssa_21 = load_const (0x000003e8 = 0.000000)
intrinsic bindless_image_store (ssa_17, ssa_19, ssa_20, ssa_21, ssa_4) (image_dim=Buf, image_array=false, format=r32_uint, access=10, src_type=uint32)
/* succs: block_6 */
} else {
block block_5:
/* preds: block_3 */
/* succs: block_6 */
}
block block_6:
/* preds: block_4 block_5 */
vec1 32 con ssa_22 = undefined
vec1 32 div ssa_23 = pack_half_2x16_split ssa_11.x, ssa_22
vec1 32 con ssa_24 = undefined
vec4 32 div ssa_25 = vec4 ssa_23, ssa_24, ssa_24, ssa_24
intrinsic export_amd (ssa_25) (base=0, wrmask=xy, flags=7)
/* succs: block_7 */
block block_7:
}
Representation: ACO IR (The ACO IR after some optimizations)
After Spilling:
ACO shader stage: fragment_fs
BB0
/* logical preds: / linear preds: / kind: top-level, branch, needs_lowering, */
s2: %27:s[0-1], s1: %28:s[2], s1: %29:s[3], s1: %30:s[4], v2: %31:v[0-1] = p_startpgm
s2: (kill)%33, s1: (kill)%32:scc = p_init_scratch (kill)%27, (latekill)(kill)%30
s2: %77 = p_parallelcopy %0:exec
s2: %0:exec, s1: (kill)%78:scc = s_wqm_b64 %77
p_logical_start
v1: %34, v1: %35 = p_split_vector (kill)%31
s2: %4 = p_create_vector (kill)%28, 0xffff8000
s8: %6 = s_load_dwordx8 %4, 0
s4: %8 = s_load_dwordx4 %4, 64
v1: %50 = v_interp_p1_f32 %34, %29:m0 attr0.y
v1: %9 = v_interp_p2_f32 %35, %29:m0, (kill)%50 attr0.y
v1: %51 = v_interp_p1_f32 (kill)%34, %29:m0 attr0.x
v1: %10 = v_interp_p2_f32 (kill)%35, (kill)%29:m0, (kill)%51 attr0.x
v1: %53 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d
v1: %52 = p_wqm (kill)%53
s2: %13, s1: (kill)%79:scc = s_andn2_b64 %0:exec, %77
p_logical_end
s2: %81, s1: (kill)%80:scc, s2: %0:exec = s_and_saveexec_b64 (kill)%13, %0:exec
s2: (kill)%82 = p_cbranch_z %0:exec BB2, BB1
BB1
/* logical preds: BB0, / linear preds: BB0, / kind: uniform, */
p_logical_start
p_logical_end
s2: (kill)%55 = p_branch BB3
BB2
/* logical preds: / linear preds: BB0, / kind: uniform, */
s2: (kill)%56 = p_branch BB3
BB3
/* logical preds: / linear preds: BB1, BB2, / kind: invert, */
s2: %75 = p_linear_phi 0, s2: undef
s2: %0:exec, s1: (kill)%83:scc = s_andn2_b64 (kill)%81, %0:exec
s2: (kill)%84 = p_cbranch_z %0:exec BB5, BB4
BB4
/* logical preds: BB0, / linear preds: BB3, / kind: uniform, needs_lowering, */
p_logical_start
s1: %85 = s_ff1_i32_b64 %0:exec
s2: %58, s1: (kill)%86:scc = s_lshl_b64 1, (kill)%85
s2: %15 = p_wqm (kill)%58
s2: %74, s1: (kill)%76:scc = s_and_b64 (kill)%15, %0:exec
p_logical_end
s2: (kill)%59 = p_branch BB6
BB5
/* logical preds: / linear preds: BB3, / kind: uniform, */
s2: (kill)%60 = p_branch BB6
BB6
/* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: top-level, branch, merge, */
s2: %16 = p_linear_phi (kill)%74, (kill)%75
s2: %0:exec = p_parallelcopy (kill)%77
p_logical_start
p_logical_end
s2: %88, s1: (kill)%87:scc, s2: %0:exec = s_and_saveexec_b64 (kill)%16, %0:exec
s2: (kill)%89 = p_cbranch_z %0:exec BB8, BB7
BB7
/* logical preds: BB6, / linear preds: BB6, / kind: uniform, */
p_logical_start
s4: %18 = s_load_dwordx4 (kill)%4, 0x50
v1: %66 = p_parallelcopy 0x3e8
v1: %67 = p_parallelcopy 0
buffer_store_format_x (kill)%18, (kill)%67, 0, (kill)%66 idxen disable_wqm storage:image
p_logical_end
s2: (kill)%68 = p_branch BB9
BB8
/* logical preds: / linear preds: BB6, / kind: uniform, */
s2: (kill)%69 = p_branch BB9
BB9
/* logical preds: / linear preds: BB7, BB8, / kind: invert, */
s2: %0:exec, s1: (kill)%90:scc = s_andn2_b64 %88, %0:exec
s2: (kill)%91 = p_cbranch_z %0:exec BB11, BB10
BB10
/* logical preds: BB6, / linear preds: BB9, / kind: uniform, */
p_logical_start
p_logical_end
s2: (kill)%71 = p_branch BB12
BB11
/* logical preds: / linear preds: BB9, / kind: uniform, */
s2: (kill)%72 = p_branch BB12
BB12
/* logical preds: BB7, BB10, / linear preds: BB10, BB11, / kind: uniform, top-level, merge, export_end, */
s2: %0:exec = p_parallelcopy (kill)%88
p_logical_start
v1: %24 = v_cvt_pkrtz_f16_f32 (kill)%52, 0
exp (kill)%24, v1: undef, v1: undef, v1: undef en:rg** compr mrt0
p_logical_end
s_endpgm
Representation: Assembly (Final Assembly)
BB0:
s_mov_b64 s[0:1], exec ; be80047e
s_wqm_b64 exec, s[0:1] ; befe0a00
s_mov_b32 s4, s3 ; be840303
s_movk_i32 s3, 0x8000 ; b0038000
s_clause 0x1 ; bfa10001
s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000
s_load_dwordx4 s[16:19], s[2:3], 0x40 ; f4080401 fa000040
s_mov_b32 m0, s4 ; befc0304
v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100
v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101
v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000
v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001
s_waitcnt lgkmcnt(0) ; bf8cc07f
image_sample v0, [v0, v2], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00820000 00000002
s_andn2_b64 s[4:5], exec, s[0:1] ; 8a84007e
s_and_saveexec_b64 s[4:5], s[4:5] ; be842404
BB1:
s_mov_b64 s[6:7], 0 ; be860480
BB3:
s_andn2_b64 exec, s[4:5], exec ; 8afe7e04
BB4:
s_ff1_i32_b64 s4, exec ; be84147e
s_lshl_b64 s[4:5], 1, s4 ; 8f840481
s_and_b64 s[6:7], s[4:5], exec ; 87867e04
BB6:
s_mov_b64 exec, s[0:1] ; befe0400
s_and_saveexec_b64 s[0:1], s[6:7] ; be802406
s_cbranch_execz BB12 ; bf880008
BB7:
s_load_dwordx4 s[4:7], s[2:3], 0x50 ; f4080101 fa000050
v_mov_b32_e32 v1, 0x3e8 ; 7e0202ff 000003e8
v_mov_b32_e32 v2, 0 ; 7e040280
s_waitcnt lgkmcnt(0) ; bf8cc07f
buffer_store_format_x v1, v2, s[4:7], 0 idxen ; e0102000 80010102
BB12:
s_mov_b64 exec, s[0:1] ; befe0400
s_waitcnt vmcnt(0) ; bf8c3f70
v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100
exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000
s_endpgm ; bf810000
Driver pipeline hash (Driver pipeline hash used by RGP): 10816365857869393798
SGPRs (Number of SGPR registers allocated per subgroup): 128
VGPRs (Number of VGPR registers allocated per subgroup): 8
Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0
Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0
Code size (Code size in bytes): 172
LDS size (LDS size in bytes per workgroup): 0
Scratch size (Private memory in bytes per subgroup): 0
Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32
Hash (CRC32 hash of code and constant data): 3977462287
Instructions (Instruction count): 34
Copies (Copy instructions created for pseudo-instructions): 9
Branches (Branch instructions): 1
Latency (Issue cycles plus stall cycles): 418
Inverse Throughput (Estimated busy cycles to execute one wave): 26
VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 2
SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 2
Pre-Sched SGPRs (SGPR usage before scheduling): 17
Pre-Sched VGPRs (VGPR usage before scheduling): 3
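The output above ends the strict (SM 6.0) WaveIsFirstLane case: the NIR guards elect() behind load_helper_invocation, so a helper lane can never win the election, and only a real lane reaches the bindless_image_store of 0x3e8 (1000). A minimal HLSL sketch of the semantics this lowering enforces; IsHelperLane() is an SM 6.6 intrinsic used here purely for illustration, not something DXC emits for the SM 6.0 entry point:

float strict_wave_is_first_lane(float s)
{
    // Strict SM 6.0 semantics: helper lanes do not participate in the election,
    // so only the first non-helper lane may perform the store.
    if (!IsHelperLane() && WaveIsFirstLane())
        U[0] = 1000;
    return s;
}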
Building WavePrefixProduct
== With helper lanes ==
Representation: NIR Shader(s) (The optimized NIR shader(s))
shader: MESA_SHADER_FRAGMENT
source_sha1: {0x0baabb81, 0x192bff95, 0x98e9ff9b, 0x32e96625, 0x18f79b31}
stage: 4
next_stage: 0
num_textures: 2
inputs_read: 32
outputs_written: 4
subgroup_size: 0
uses_wide_subgroup_intrinsics: true
divergence_analysis_run: true
bit_sizes_float: 0x20
bit_sizes_int: 0x20
separate_shader: true
needs_quad_helper_invocations: true
needs_all_helper_invocations: true
origin_upper_left: true
inputs: 1
outputs: 0
uniforms: 0
decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0)
decl_var uniform INTERP_MODE_NONE restrict sampler @1 (~0, 0, 1)
decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD@2 (VARYING_SLOT_VAR0.y, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0)
vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0)
vec1 32 con ssa_2 = load_const (0xffff8000 = -nan)
vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2
vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000)
vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0)
vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000)
vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0)
vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec2 32 div ssa_10 = vec2 ssa_9, ssa_8
vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler)
vec1 32 div ssa_12 = mov ssa_11.x
vec1 32 div ssa_13 = intrinsic exclusive_scan (ssa_12) (reduction_op=fmul)
vec1 32 div ssa_14 = fadd ssa_11.x, ssa_13
vec1 32 con ssa_15 = undefined
vec1 32 div ssa_16 = pack_half_2x16_split ssa_14, ssa_15
vec1 32 con ssa_17 = undefined
vec4 32 div ssa_18 = vec4 ssa_16, ssa_17, ssa_17, ssa_17
intrinsic export_amd (ssa_18) (base=0, wrmask=xy, flags=7)
/* succs: block_1 */
block block_1:
}
Representation: ACO IR (The ACO IR after some optimizations)
After Spilling:
ACO shader stage: fragment_fs
BB0
/* logical preds: / linear preds: / kind: uniform, top-level, export_end, */
s2: %20:s[0-1], s1: %21:s[2], s1: %22:s[3], s1: %23:s[4], v2: %24:v[0-1] = p_startpgm
s2: (kill)%26, s1: (kill)%25:scc = p_init_scratch (kill)%20, (latekill)(kill)%23
s2: %0:exec, s1: (kill)%54:scc = s_wqm_b64 %0:exec
p_logical_start
v1: %27, v1: %28 = p_split_vector (kill)%24
s2: %4 = p_create_vector (kill)%21, 0xffff8000
s8: %6 = s_load_dwordx8 %4, 0
s4: %8 = s_load_dwordx4 (kill)%4, 64
v1: %43 = v_interp_p1_f32 %27, %22:m0 attr0.y
v1: %9 = v_interp_p2_f32 %28, %22:m0, (kill)%43 attr0.y
v1: %44 = v_interp_p1_f32 (kill)%27, %22:m0 attr0.x
v1: %10 = v_interp_p2_f32 (kill)%28, (kill)%22:m0, (kill)%44 attr0.x
v1: %46 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d
v1: %45 = p_wqm (kill)%46
lv1: %52 = p_start_linear_vgpr
lv1: %53 = p_start_linear_vgpr
v1: %47, s2: (kill)%48, s1: (kill)%49, s1: (kill)%50:scc = p_exclusive_scan %45, (kill)%52, (kill)%53 op:fmul32 cluster_size:64
v1: %14 = p_wqm (kill)%47
v1: %15 = v_add_f32 (kill)%45, (kill)%14
v1: %17 = v_cvt_pkrtz_f16_f32 (kill)%15, 0
exp (kill)%17, v1: undef, v1: undef, v1: undef en:rg** compr mrt0
p_logical_end
s_endpgm
Representation: Assembly (Final Assembly)
BB0:
s_wqm_b64 exec, exec ; befe0a7e
s_mov_b32 s0, s3 ; be800303
s_movk_i32 s3, 0x8000 ; b0038000
s_clause 0x1 ; bfa10001
s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000
s_load_dwordx4 s[4:7], s[2:3], 0x40 ; f4080101 fa000040
s_mov_b32 m0, s0 ; befc0300
v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100
v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101
v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000
v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001
s_waitcnt lgkmcnt(0) ; bf8cc07f
image_sample v0, [v0, v2], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00220000 00000002
s_or_saveexec_b64 s[0:1], -1 ; be8025c1
s_waitcnt vmcnt(0) ; bf8c3f70
v_cndmask_b32_e64 v1, 1.0, v0, s[0:1] ; d5010001 000200f2
v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; 7e0402fa ff0d1101
s_mov_b32 exec_lo, 0x10000 ; befe03ff 00010000
s_mov_b32 exec_hi, 0x10000 ; beff03ff 00010000
v_permlanex16_b32 v2, v1, -1, -1 op_sel:[1,0] ; d7780802 03058301
s_mov_b64 exec, -1 ; befe04c1
v_readlane_b32 s2, v1, 31 ; d7600002 00013f01
v_writelane_b32 v2, s2, 32 ; d7610002 00014002
v_writelane_b32 v2, 1.0, 0 ; d7610002 000100f2
v_mul_f32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf fi:1 ; 100404fa ff051102
v_mul_f32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf fi:1 ; 100404fa ff051202
v_mul_f32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf fi:1 ; 100404fa ff051402
v_mul_f32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf fi:1 ; 100404fa ff051802
s_bfm_b32 exec_lo, 16, 16 ; 927e9090
s_bfm_b32 exec_hi, 16, 16 ; 927f9090
v_permlanex16_b32 v1, v2, -1, -1 op_sel:[1,0] ; d7780801 03058302
v_mul_f32_e32 v2, v2, v1 ; 10040302
s_bfm_b64 exec, 32, 32 ; 92fea0a0
v_readlane_b32 s2, v2, 31 ; d7600002 00013f02
v_mul_f32_e32 v2, s2, v2 ; 10040402
s_mov_b64 exec, s[0:1] ; befe0400
v_mov_b32_e32 v1, v2 ; 7e020302
v_add_f32_e32 v0, v0, v1 ; 06000300
v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100
exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000
s_endpgm ; bf810000
Driver pipeline hash (Driver pipeline hash used by RGP): 13476569651227853137
SGPRs (Number of SGPR registers allocated per subgroup): 128
VGPRs (Number of VGPR registers allocated per subgroup): 8
Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0
Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0
Code size (Code size in bytes): 244
LDS size (LDS size in bytes per workgroup): 0
Scratch size (Private memory in bytes per subgroup): 0
Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32
Hash (CRC32 hash of code and constant data): 4082983596
Instructions (Instruction count): 41
Copies (Copy instructions created for pseudo-instructions): 3
Branches (Branch instructions): 0
Latency (Issue cycles plus stall cycles): 476
Inverse Throughput (Estimated busy cycles to execute one wave): 42
VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 1
SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 1
Pre-Sched SGPRs (SGPR usage before scheduling): 13
Pre-Sched VGPRs (VGPR usage before scheduling): 3
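The long v_mov_b32_dpp / v_mul_f32_dpp / v_permlanex16_b32 / v_readlane_b32 run in the assembly above is ACO's lowering of the exclusive fmul scan: shift each lane's value down by one, seed lane 0 with the multiplicative identity, then combine in log2(waveSize) doubling steps (row_shr 1/2/4/8 within 16-lane rows, permlane and readlane to cross the 16- and 32-lane boundaries). A hedged lane-array reference of the same scan, assuming a wave64 subgroup:

// Reference exclusive prefix product over a 64-entry lane array.
// The descending inner loop lets the scan run in place: v[i - d] still holds
// the previous step's value when v[i] is updated.
void exclusive_prefix_product(inout float v[64])
{
    for (int i = 63; i > 0; --i)   // shift right by one lane
        v[i] = v[i - 1];
    v[0] = 1.0;                    // multiplicative identity into lane 0
    for (int d = 1; d < 64; d *= 2)
        for (int i = 63; i >= d; --i)
            v[i] *= v[i - d];      // Hillis-Steele combine step
}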
== Without helper lanes ==
Representation: NIR Shader(s) (The optimized NIR shader(s))
shader: MESA_SHADER_FRAGMENT
source_sha1: {0xa73cb651, 0x947d7cfc, 0xbb461f1d, 0xf1ea3766, 0xdd013638}
stage: 4
next_stage: 0
num_textures: 2
inputs_read: 32
outputs_written: 4
system_values_read: 0x00000000'00000000'08000000
subgroup_size: 0
uses_wide_subgroup_intrinsics: true
divergence_analysis_run: true
bit_sizes_float: 0x20
bit_sizes_int: 0x21
separate_shader: true
needs_quad_helper_invocations: true
needs_all_helper_invocations: true
origin_upper_left: true
inputs: 1
outputs: 0
uniforms: 0
decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0)
decl_var uniform INTERP_MODE_NONE restrict sampler @1 (~0, 0, 1)
decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD@2 (VARYING_SLOT_VAR0.y, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0)
vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0)
vec1 32 con ssa_2 = load_const (0xffff8000 = -nan)
vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2
vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000)
vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0)
vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000)
vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0)
vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec2 32 div ssa_10 = vec2 ssa_9, ssa_8
vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler)
vec1 1 div ssa_12 = intrinsic load_helper_invocation () ()
vec1 32 con ssa_13 = load_const (0x3f800000 = 1.000000)
vec1 32 div ssa_14 = bcsel ssa_12, ssa_13, ssa_11.x
vec1 32 div ssa_15 = intrinsic exclusive_scan (ssa_14) (reduction_op=fmul)
vec1 32 div ssa_16 = fadd ssa_11.x, ssa_15
vec1 32 con ssa_17 = undefined
vec1 32 div ssa_18 = pack_half_2x16_split ssa_16, ssa_17
vec1 32 con ssa_19 = undefined
vec4 32 div ssa_20 = vec4 ssa_18, ssa_19, ssa_19, ssa_19
intrinsic export_amd (ssa_20) (base=0, wrmask=xy, flags=7)
/* succs: block_1 */
block block_1:
}
Representation: ACO IR (The ACO IR after some optimizations)
After Spilling:
ACO shader stage: fragment_fs
BB0
/* logical preds: / linear preds: / kind: uniform, top-level, needs_lowering, export_end, */
s2: %22:s[0-1], s1: %23:s[2], s1: %24:s[3], s1: %25:s[4], v2: %26:v[0-1] = p_startpgm
s2: (kill)%28, s1: (kill)%27:scc = p_init_scratch (kill)%22, (latekill)(kill)%25
s2: %57 = p_parallelcopy %0:exec
s2: %0:exec, s1: (kill)%58:scc = s_wqm_b64 %57
p_logical_start
v1: %29, v1: %30 = p_split_vector (kill)%26
s2: %4 = p_create_vector (kill)%23, 0xffff8000
s8: %6 = s_load_dwordx8 %4, 0
s4: %8 = s_load_dwordx4 (kill)%4, 64
v1: %45 = v_interp_p1_f32 %29, %24:m0 attr0.y
v1: %9 = v_interp_p2_f32 %30, %24:m0, (kill)%45 attr0.y
v1: %46 = v_interp_p1_f32 (kill)%29, %24:m0 attr0.x
v1: %10 = v_interp_p2_f32 (kill)%30, (kill)%24:m0, (kill)%46 attr0.x
v1: %48 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d
v1: %47 = p_wqm (kill)%48
s2: %13, s1: (kill)%59:scc = s_andn2_b64 %0:exec, %57
v1: %15 = v_cndmask_b32 %47, 1.0, (kill)%13
lv1: %55 = p_start_linear_vgpr
lv1: %56 = p_start_linear_vgpr
v1: %50, s2: (kill)%51, s1: (kill)%52, s1: (kill)%53:scc = p_exclusive_scan (kill)%15, (kill)%55, (kill)%56 op:fmul32 cluster_size:64
v1: %16 = p_wqm (kill)%50
v1: %17 = v_add_f32 (kill)%47, (kill)%16
v1: %19 = v_cvt_pkrtz_f16_f32 (kill)%17, 0
s2: %0:exec = p_parallelcopy (kill)%57
exp (kill)%19, v1: undef, v1: undef, v1: undef en:rg** compr mrt0
p_logical_end
s_endpgm
Representation: Assembly (Final Assembly)
BB0:
s_mov_b64 s[0:1], exec ; be80047e
s_wqm_b64 exec, s[0:1] ; befe0a00
s_mov_b32 s4, s3 ; be840303
s_movk_i32 s3, 0x8000 ; b0038000
s_clause 0x1 ; bfa10001
s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000
s_load_dwordx4 s[16:19], s[2:3], 0x40 ; f4080401 fa000040
s_mov_b32 m0, s4 ; befc0304
v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100
v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101
v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000
v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001
s_waitcnt lgkmcnt(0) ; bf8cc07f
image_sample v0, [v0, v2], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00820000 00000002
s_andn2_b64 s[2:3], exec, s[0:1] ; 8a82007e
s_waitcnt vmcnt(0) ; bf8c3f70
v_cndmask_b32_e64 v3, v0, 1.0, s[2:3] ; d5010003 0009e500
s_or_saveexec_b64 s[2:3], -1 ; be8225c1
v_cndmask_b32_e64 v1, 1.0, v3, s[2:3] ; d5010001 000a06f2
v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; 7e0402fa ff0d1101
s_mov_b32 exec_lo, 0x10000 ; befe03ff 00010000
s_mov_b32 exec_hi, 0x10000 ; beff03ff 00010000
v_permlanex16_b32 v2, v1, -1, -1 op_sel:[1,0] ; d7780802 03058301
s_mov_b64 exec, -1 ; befe04c1
v_readlane_b32 s4, v1, 31 ; d7600004 00013f01
v_writelane_b32 v2, s4, 32 ; d7610002 00014004
v_writelane_b32 v2, 1.0, 0 ; d7610002 000100f2
v_mul_f32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf fi:1 ; 100404fa ff051102
v_mul_f32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf fi:1 ; 100404fa ff051202
v_mul_f32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf fi:1 ; 100404fa ff051402
v_mul_f32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf fi:1 ; 100404fa ff051802
s_bfm_b32 exec_lo, 16, 16 ; 927e9090
s_bfm_b32 exec_hi, 16, 16 ; 927f9090
v_permlanex16_b32 v1, v2, -1, -1 op_sel:[1,0] ; d7780801 03058302
v_mul_f32_e32 v2, v2, v1 ; 10040302
s_bfm_b64 exec, 32, 32 ; 92fea0a0
v_readlane_b32 s4, v2, 31 ; d7600004 00013f02
v_mul_f32_e32 v2, s4, v2 ; 10040404
s_mov_b64 exec, s[2:3] ; befe0402
v_mov_b32_e32 v1, v2 ; 7e020302
v_add_f32_e32 v0, v0, v1 ; 06000300
v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100
s_mov_b64 exec, s[0:1] ; befe0400
exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000
s_endpgm ; bf810000
Driver pipeline hash (Driver pipeline hash used by RGP): 14392008431707320378
SGPRs (Number of SGPR registers allocated per subgroup): 128
VGPRs (Number of VGPR registers allocated per subgroup): 8
Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0
Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0
Code size (Code size in bytes): 264
LDS size (LDS size in bytes per workgroup): 0
Scratch size (Private memory in bytes per subgroup): 0
Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32
Hash (CRC32 hash of code and constant data): 2093754719
Instructions (Instruction count): 45
Copies (Copy instructions created for pseudo-instructions): 5
Branches (Branch instructions): 0
Latency (Issue cycles plus stall cycles): 484
Inverse Throughput (Estimated busy cycles to execute one wave): 44
VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 1
SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 1
Pre-Sched SGPRs (SGPR usage before scheduling): 15
Pre-Sched VGPRs (VGPR usage before scheduling): 4
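Comparing the two WavePrefixProduct builds: the only real difference in the strict (SM 6.0) path is the extra v_cndmask in BB0 that substitutes the multiplicative identity for helper lanes before the scan (the bcsel on load_helper_invocation in the NIR), which is what pushes the instruction count from 41 to 45. A hedged HLSL sketch of that masking, again using the SM 6.6 IsHelperLane() intrinsic only to express what the compiler does internally:

float strict_prefix_product(float s)
{
    // Strict SM 6.0 semantics: helper lanes must not contribute to the product,
    // so they feed the multiplicative identity into the scan.
    float masked = IsHelperLane() ? 1.0 : s;
    return s + WavePrefixProduct(masked);
}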
Building WaveReadLaneFirst
== With helper lanes ==
Representation: NIR Shader(s) (The optimized NIR shader(s))
shader: MESA_SHADER_FRAGMENT
source_sha1: {0x4941eebc, 0x7b751fe9, 0x24b16f74, 0x2275d816, 0xfafc5351}
stage: 4
next_stage: 0
num_textures: 2
inputs_read: 32
outputs_written: 4
subgroup_size: 0
uses_wide_subgroup_intrinsics: true
divergence_analysis_run: true
bit_sizes_float: 0x20
bit_sizes_int: 0x20
separate_shader: true
needs_quad_helper_invocations: true
needs_all_helper_invocations: true
origin_upper_left: true
inputs: 1
outputs: 0
uniforms: 0
decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0)
decl_var uniform INTERP_MODE_NONE restrict sampler @1 (~0, 0, 1)
decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD@2 (VARYING_SLOT_VAR0.y, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0)
vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0)
vec1 32 con ssa_2 = load_const (0xffff8000 = -nan)
vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2
vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000)
vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0)
vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000)
vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0)
vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec2 32 div ssa_10 = vec2 ssa_9, ssa_8
vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler)
vec1 32 div ssa_12 = mov ssa_11.x
vec1 32 con ssa_13 = intrinsic read_first_invocation (ssa_12) ()
vec1 32 div ssa_14 = fadd ssa_11.x, ssa_13
vec1 32 con ssa_15 = undefined
vec1 32 div ssa_16 = pack_half_2x16_split ssa_14, ssa_15
vec1 32 con ssa_17 = undefined
vec4 32 div ssa_18 = vec4 ssa_16, ssa_17, ssa_17, ssa_17
intrinsic export_amd (ssa_18) (base=0, wrmask=xy, flags=7)
/* succs: block_1 */
block block_1:
}
Representation: ACO IR (The ACO IR after some optimizations)
After Spilling:
ACO shader stage: fragment_fs
BB0
/* logical preds: / linear preds: / kind: uniform, top-level, export_end, */
s2: %20:s[0-1], s1: %21:s[2], s1: %22:s[3], s1: %23:s[4], v2: %24:v[0-1] = p_startpgm
s2: (kill)%26, s1: (kill)%25:scc = p_init_scratch (kill)%20, (latekill)(kill)%23
s2: %0:exec, s1: (kill)%49:scc = s_wqm_b64 %0:exec
p_logical_start
v1: %27, v1: %28 = p_split_vector (kill)%24
s2: %4 = p_create_vector (kill)%21, 0xffff8000
s8: %6 = s_load_dwordx8 %4, 0
s4: %8 = s_load_dwordx4 (kill)%4, 64
v1: %43 = v_interp_p1_f32 %27, %22:m0 attr0.y
v1: %9 = v_interp_p2_f32 %28, %22:m0, (kill)%43 attr0.y
v1: %44 = v_interp_p1_f32 (kill)%27, %22:m0 attr0.x
v1: %10 = v_interp_p2_f32 (kill)%28, (kill)%22:m0, (kill)%44 attr0.x
v1: %46 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d
v1: %45 = p_wqm (kill)%46
s1: %47 = v_readfirstlane_b32 %45
s1: %14 = p_wqm (kill)%47
v1: %15 = v_add_f32 (kill)%14, (kill)%45
v1: %17 = v_cvt_pkrtz_f16_f32 (kill)%15, 0
exp (kill)%17, v1: undef, v1: undef, v1: undef en:rg** compr mrt0
p_logical_end
s_endpgm
Representation: Assembly (Final Assembly)
BB0:
s_wqm_b64 exec, exec ; befe0a7e
s_mov_b32 s0, s3 ; be800303
s_movk_i32 s3, 0x8000 ; b0038000
s_clause 0x1 ; bfa10001
s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000
s_load_dwordx4 s[4:7], s[2:3], 0x40 ; f4080101 fa000040
s_mov_b32 m0, s0 ; befc0300
v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100
v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101
v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000
v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001
s_waitcnt lgkmcnt(0) ; bf8cc07f
image_sample v0, [v0, v2], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00220000 00000002
s_waitcnt vmcnt(0) ; bf8c3f70
v_readfirstlane_b32 s0, v0 ; 7e000500
v_add_f32_e32 v0, s0, v0 ; 06000000
v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100
exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000
s_endpgm ; bf810000
Driver pipeline hash (Driver pipeline hash used by RGP): 9810138321964705697
SGPRs (Number of SGPR registers allocated per subgroup): 128
VGPRs (Number of VGPR registers allocated per subgroup): 8
Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0
Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0
Code size (Code size in bytes): 100
LDS size (LDS size in bytes per workgroup): 0
Scratch size (Private memory in bytes per subgroup): 0
Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32
Hash (CRC32 hash of code and constant data): 1513780926
Instructions (Instruction count): 19
Copies (Copy instructions created for pseudo-instructions): 3
Branches (Branch instructions): 0
Latency (Issue cycles plus stall cycles): 397
Inverse Throughput (Estimated busy cycles to execute one wave): 25
VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 1
SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 1
Pre-Sched SGPRs (SGPR usage before scheduling): 13
Pre-Sched VGPRs (VGPR usage before scheduling): 3
== Without helper lanes ==
Representation: NIR Shader(s) (The optimized NIR shader(s))
shader: MESA_SHADER_FRAGMENT
source_sha1: {0x7549a71c, 0x220294d6, 0xcdfed24c, 0x4c8bf6ea, 0x99e0b609}
stage: 4
next_stage: 0
num_textures: 2
inputs_read: 32
outputs_written: 4
system_values_read: 0x00000000'00000000'08000000
subgroup_size: 0
uses_wide_subgroup_intrinsics: true
divergence_analysis_run: true
bit_sizes_float: 0x20
bit_sizes_int: 0x20
separate_shader: true
needs_quad_helper_invocations: true
needs_all_helper_invocations: true
origin_upper_left: true
inputs: 1
outputs: 0
uniforms: 0
decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0)
decl_var uniform INTERP_MODE_NONE restrict sampler @1 (~0, 0, 1)
decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0)
decl_var shader_in INTERP_MODE_NONE float TEXCOORD@2 (VARYING_SLOT_VAR0.y, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0)
vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0)
vec1 32 con ssa_2 = load_const (0xffff8000 = -nan)
vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2
vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000)
vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0)
vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000)
vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0)
vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */
vec2 32 div ssa_10 = vec2 ssa_9, ssa_8
vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler)
vec1 1 div ssa_12 = intrinsic load_helper_invocation () ()
/* succs: block_1 block_2 */
if ssa_12 {
block block_1:
/* preds: block_0 */
vec1 32 con ssa_13 = undefined
/* succs: block_3 */
} else {
block block_2:
/* preds: block_0 */
vec1 32 div ssa_14 = mov ssa_11.x
vec1 32 con ssa_15 = intrinsic read_first_invocation (ssa_14) ()
/* succs: block_3 */
}
block block_3:
/* preds: block_1 block_2 */
vec1 32 con ssa_16 = phi block_1: ssa_13, block_2: ssa_15
vec1 32 div ssa_17 = fadd ssa_11.x, ssa_16
vec1 32 con ssa_18 = undefined
vec1 32 div ssa_19 = pack_half_2x16_split ssa_17, ssa_18
vec1 32 con ssa_20 = undefined
vec4 32 div ssa_21 = vec4 ssa_19, ssa_20, ssa_20, ssa_20
intrinsic export_amd (ssa_21) (base=0, wrmask=xy, flags=7)
/* succs: block_4 */
block block_4:
}
Representation: ACO IR (The ACO IR after some optimizations)
After Spilling:
ACO shader stage: fragment_fs
BB0
/* logical preds: / linear preds: / kind: top-level, branch, needs_lowering, */
s2: %23:s[0-1], s1: %24:s[2], s1: %25:s[3], s1: %26:s[4], v2: %27:v[0-1] = p_startpgm
s2: (kill)%29, s1: (kill)%28:scc = p_init_scratch (kill)%23, (latekill)(kill)%26
s2: %58 = p_parallelcopy %0:exec
s2: %0:exec, s1: (kill)%59:scc = s_wqm_b64 %58
p_logical_start
v1: %30, v1: %31 = p_split_vector (kill)%27
s2: %4 = p_create_vector (kill)%24, 0xffff8000
s8: %6 = s_load_dwordx8 %4, 0
s4: %8 = s_load_dwordx4 (kill)%4, 64
v1: %46 = v_interp_p1_f32 %30, %25:m0 attr0.y
v1: %9 = v_interp_p2_f32 %31, %25:m0, (kill)%46 attr0.y
v1: %47 = v_interp_p1_f32 (kill)%30, %25:m0 attr0.x
v1: %10 = v_interp_p2_f32 (kill)%31, (kill)%25:m0, (kill)%47 attr0.x
v1: %49 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d
v1: %48 = p_wqm (kill)%49
s2: %13, s1: (kill)%60:scc = s_andn2_b64 %0:exec, %58
p_logical_end
s2: %62, s1: (kill)%61:scc, s2: %0:exec = s_and_saveexec_b64 (kill)%13, %0:exec
s2: (kill)%63 = p_cbranch_z %0:exec BB2, BB1
BB1
/* logical preds: BB0, / linear preds: BB0, / kind: uniform, */
p_logical_start
p_logical_end
s2: (kill)%51 = p_branch BB3
BB2
/* logical preds: / linear preds: BB0, / kind: uniform, */
s2: (kill)%52 = p_branch BB3
BB3
/* logical preds: / linear preds: BB1, BB2, / kind: invert, */
s2: %0:exec, s1: (kill)%64:scc = s_andn2_b64 (kill)%62, %0:exec
s2: (kill)%65 = p_cbranch_z %0:exec BB5, BB4
BB4
/* logical preds: BB0, / linear preds: BB3, / kind: uniform, */
p_logical_start
s1: %54 = v_readfirstlane_b32 %48
s1: %16 = p_wqm (kill)%54
p_logical_end
s2: (kill)%55 = p_branch BB6
BB5
/* logical preds: / linear preds: BB3, / kind: uniform, */
s2: (kill)%56 = p_branch BB6
BB6
/* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, export_end, */
s1: %17 = p_linear_phi (kill)%16, s1: undef
s2: %0:exec = p_parallelcopy (kill)%58
p_logical_start
v1: %18 = v_add_f32 (kill)%17, (kill)%48
v1: %20 = v_cvt_pkrtz_f16_f32 (kill)%18, 0
exp (kill)%20, v1: undef, v1: undef, v1: undef en:rg** compr mrt0
p_logical_end
s_endpgm
Representation: Assembly (Final Assembly)
BB0:
s_mov_b64 s[0:1], exec ; be80047e
s_wqm_b64 exec, s[0:1] ; befe0a00
s_mov_b32 s4, s3 ; be840303
s_movk_i32 s3, 0x8000 ; b0038000
s_clause 0x1 ; bfa10001
s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000
s_load_dwordx4 s[16:19], s[2:3], 0x40 ; f4080401 fa000040
s_mov_b32 m0, s4 ; befc0304
v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100
v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101
v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000
v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001
s_waitcnt lgkmcnt(0) ; bf8cc07f
image_sample v0, [v0, v2], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00820000 00000002
s_andn2_b64 s[2:3], exec, s[0:1] ; 8a82007e
s_and_saveexec_b64 s[2:3], s[2:3] ; be822402
BB3:
s_andn2_b64 exec, s[2:3], exec ; 8afe7e02
BB4:
s_waitcnt vmcnt(0) ; bf8c3f70
v_readfirstlane_b32 s2, v0 ; 7e040500
BB6:
s_mov_b64 exec, s[0:1] ; befe0400
s_waitcnt vmcnt(0) ; bf8c3f70
v_add_f32_e32 v0, s2, v0 ; 06000002
v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100
exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000
s_endpgm ; bf810000
Driver pipeline hash (Driver pipeline hash used by RGP): 16198948385471629174
SGPRs (Number of SGPR registers allocated per subgroup): 128
VGPRs (Number of VGPR registers allocated per subgroup): 8
Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0
Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0
Code size (Code size in bytes): 124
LDS size (LDS size in bytes per workgroup): 0
Scratch size (Private memory in bytes per subgroup): 0
Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32
Hash (CRC32 hash of code and constant data): 144664052
Instructions (Instruction count): 25
Copies (Copy instructions created for pseudo-instructions): 5
Branches (Branch instructions): 0
Latency (Issue cycles plus stall cycles): 632
Inverse Throughput (Estimated busy cycles to execute one wave): 39
VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 1
SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 1
Pre-Sched SGPRs (SGPR usage before scheduling): 15
Pre-Sched VGPRs (VGPR usage before scheduling): 3
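For WaveReadLaneFirst the contrast is starker: with [WaveOpsIncludeHelperLanes] the op folds into a single v_readfirstlane_b32, while the strict SM 6.0 build branches so that only non-helper lanes perform the readfirstlane, and helper lanes are left with an undefined value (the undefined/phi pair in the NIR above). A hedged HLSL sketch of the strict semantics, with IsHelperLane() (SM 6.6) again standing in for the compiler-internal helper-lane test:

float strict_read_lane_first(float s)
{
    float first = 0.0;                     // helper lanes get an undefined value in the real lowering
    if (!IsHelperLane())
        first = WaveReadLaneFirst(s);      // broadcast from the first non-helper lane
    return s + first;
}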