Created
June 16, 2023 14:38
-
-
Save HansKristian-Work/611f7cdfb1d15c0b34fd1ada512af369 to your computer and use it in GitHub Desktop.
SM 6.0 strict wave op tests
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Vertex shader output / pixel shader input: the position (SV_Position) and a UV | |
| // (TEXCOORD) that the pixel-side helpers pass to T.Sample. | |
| struct VSOut | |
| { | |
| float4 pos : SV_Position; | |
| float2 uv : TEXCOORD; | |
| }; | |
| // Vertex entry point (the build script compiles it with -Tvs_6_0 -E vs_main). | |
| // Passes the input position through unchanged and derives uv as pos.xy * 0.5 + 0.5. | |
| VSOut vs_main(float4 pos : POSITION) | |
| { | |
| VSOut vout; | |
| vout.pos = pos; | |
| vout.uv = pos.xy * 0.5 + 0.5; | |
| return vout; | |
| } | |
| // Resources shared by the pixel tests: texture at t0, sampler at s1, UAV at u2. | |
| // Every helper samples T through S; U is written only by _WaveIsFirstLane. | |
| Texture2D<float> T : register(t0); | |
| SamplerState S : register(s1); | |
| RWStructuredBuffer<uint> U : register(u2); | |
| // WaveReadLaneFirst | |
| // Shared body for the sm60_/sm67_ entry points: samples T at vin.uv and returns | |
| // the sample plus WaveReadLaneFirst() of that sample. | |
| float _WaveReadLaneFirst(VSOut vin) | |
| { | |
| float s = T.Sample(S, vin.uv); | |
| s += WaveReadLaneFirst(s); | |
| return s; | |
| } | |
| // Pixel entry point the build script compiles with -Tps_6_0 (-E sm60_<op>). | |
| // No WaveOpsIncludeHelperLanes attribute; forwards to the shared helper. | |
| float sm60_WaveReadLaneFirst(VSOut vin) : SV_Target | |
| { | |
| return _WaveReadLaneFirst(vin); | |
| } | |
| // Pixel entry point the build script compiles with -Tps_6_7 (-E sm67_<op>). | |
| // Same body as the sm60 variant, but tagged [WaveOpsIncludeHelperLanes]; its | |
| // disassembly is the one printed under "== With helper lanes ==". | |
| [WaveOpsIncludeHelperLanes] | |
| float sm67_WaveReadLaneFirst(VSOut vin) : SV_Target | |
| { | |
| return _WaveReadLaneFirst(vin); | |
| } | |
| // WaveActiveAllEqual | |
| // Shared body for the sm60_/sm67_ entry points: samples T at vin.uv and adds the | |
| // result of WaveActiveAllEqual() on that sample, converted to float. | |
| float _WaveActiveAllEqual(VSOut vin) | |
| { | |
| float s = T.Sample(S, vin.uv); | |
| s += float(WaveActiveAllEqual(s)); | |
| return s; | |
| } | |
| // Pixel entry point the build script compiles with -Tps_6_0 (-E sm60_<op>). | |
| // No WaveOpsIncludeHelperLanes attribute; forwards to the shared helper. | |
| float sm60_WaveActiveAllEqual(VSOut vin) : SV_Target | |
| { | |
| return _WaveActiveAllEqual(vin); | |
| } | |
| // Pixel entry point the build script compiles with -Tps_6_7 (-E sm67_<op>). | |
| // Same body as the sm60 variant, but tagged [WaveOpsIncludeHelperLanes]; its | |
| // disassembly is the one printed under "== With helper lanes ==". | |
| [WaveOpsIncludeHelperLanes] | |
| float sm67_WaveActiveAllEqual(VSOut vin) : SV_Target | |
| { | |
| return _WaveActiveAllEqual(vin); | |
| } | |
| // WaveActiveCountBits | |
| // Shared body for the sm60_/sm67_ entry points: samples T at vin.uv and adds the | |
| // WaveActiveCountBits() result, converted to float. | |
| // The float sample is passed to the intrinsic directly; the NIR dump recorded for | |
| // this test shows it lowered as a "not equal to 0" compare feeding a ballot. | |
| float _WaveActiveCountBits(VSOut vin) | |
| { | |
| float s = T.Sample(S, vin.uv); | |
| s += float(WaveActiveCountBits(s)); | |
| return s; | |
| } | |
| // Pixel entry point the build script compiles with -Tps_6_0 (-E sm60_<op>). | |
| // No WaveOpsIncludeHelperLanes attribute; forwards to the shared helper. | |
| float sm60_WaveActiveCountBits(VSOut vin) : SV_Target | |
| { | |
| return _WaveActiveCountBits(vin); | |
| } | |
| // Pixel entry point the build script compiles with -Tps_6_7 (-E sm67_<op>). | |
| // Same body as the sm60 variant, but tagged [WaveOpsIncludeHelperLanes]; its | |
| // disassembly is the one printed under "== With helper lanes ==". | |
| [WaveOpsIncludeHelperLanes] | |
| float sm67_WaveActiveCountBits(VSOut vin) : SV_Target | |
| { | |
| return _WaveActiveCountBits(vin); | |
| } | |
| // WaveActiveMin | |
| // Shared body for the sm60_/sm67_ entry points: samples T at vin.uv and returns | |
| // the sample plus WaveActiveMin() of that sample. | |
| float _WaveActiveMin(VSOut vin) | |
| { | |
| float s = T.Sample(S, vin.uv); | |
| s += WaveActiveMin(s); | |
| return s; | |
| } | |
| // Pixel entry point the build script compiles with -Tps_6_0 (-E sm60_<op>). | |
| // No WaveOpsIncludeHelperLanes attribute; forwards to the shared helper. | |
| float sm60_WaveActiveMin(VSOut vin) : SV_Target | |
| { | |
| return _WaveActiveMin(vin); | |
| } | |
| // Pixel entry point the build script compiles with -Tps_6_7 (-E sm67_<op>). | |
| // Same body as the sm60 variant, but tagged [WaveOpsIncludeHelperLanes]; its | |
| // disassembly is the one printed under "== With helper lanes ==". | |
| [WaveOpsIncludeHelperLanes] | |
| float sm67_WaveActiveMin(VSOut vin) : SV_Target | |
| { | |
| return _WaveActiveMin(vin); | |
| } | |
| // WavePrefixProduct | |
| // Shared body for the sm60_/sm67_ entry points: samples T at vin.uv and returns | |
| // the sample plus WavePrefixProduct() of that sample. | |
| float _WavePrefixProduct(VSOut vin) | |
| { | |
| float s = T.Sample(S, vin.uv); | |
| s += WavePrefixProduct(s); | |
| return s; | |
| } | |
| // Pixel entry point the build script compiles with -Tps_6_0 (-E sm60_<op>). | |
| // No WaveOpsIncludeHelperLanes attribute; forwards to the shared helper. | |
| float sm60_WavePrefixProduct(VSOut vin) : SV_Target | |
| { | |
| return _WavePrefixProduct(vin); | |
| } | |
| // Pixel entry point the build script compiles with -Tps_6_7 (-E sm67_<op>). | |
| // Same body as the sm60 variant, but tagged [WaveOpsIncludeHelperLanes]; its | |
| // disassembly is the one printed under "== With helper lanes ==". | |
| [WaveOpsIncludeHelperLanes] | |
| float sm67_WavePrefixProduct(VSOut vin) : SV_Target | |
| { | |
| return _WavePrefixProduct(vin); | |
| } | |
| // WaveIsFirstLane | |
| // Shared body for the sm60_/sm67_ entry points: samples T at vin.uv and returns | |
| // the sample unchanged. The lane for which WaveIsFirstLane() is true also writes | |
| // 1000 to U[0], so this is the only test with a UAV side effect. | |
| float _WaveIsFirstLane(VSOut vin) | |
| { | |
| float s = T.Sample(S, vin.uv); | |
| if (WaveIsFirstLane()) | |
| U[0] = 1000; | |
| return s; | |
| } | |
| // Pixel entry point the build script compiles with -Tps_6_0 (-E sm60_<op>). | |
| // No WaveOpsIncludeHelperLanes attribute; forwards to the shared helper. | |
| float sm60_WaveIsFirstLane(VSOut vin) : SV_Target | |
| { | |
| return _WaveIsFirstLane(vin); | |
| } | |
| // Pixel entry point the build script compiles with -Tps_6_7 (-E sm67_<op>). | |
| // Same body as the sm60 variant, but tagged [WaveOpsIncludeHelperLanes]; its | |
| // disassembly is the one printed under "== With helper lanes ==". | |
| [WaveOpsIncludeHelperLanes] | |
| float sm67_WaveIsFirstLane(VSOut vin) : SV_Target | |
| { | |
| return _WaveIsFirstLane(vin); | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
| set -e | |
| # run_test <op>: builds vs_main plus the sm60_<op> / sm67_<op> pixel entry points from | |
| # test.hlsl, converts them with dxil-spirv, wraps them in Fossilize archives and prints the | |
| # disassembly of both variants. Everything goes to stdout (banner included), so the caller's | |
| # redirect captures it all. | |
| run_test() { | |
| local op="$1" | |
| echo "Building $op" | |
| local DXC=./external/dxc-build/bin/dxc | |
| local DXIL_SPIRV=./cmake-build-release/dxil-spirv | |
| "$DXC" -Tvs_6_0 -Fo vert.dxil test.hlsl -E vs_main | |
| "$DXC" -Tps_6_0 -Fo frag60.dxil test.hlsl -E "sm60_$op" | |
| "$DXC" -Tps_6_7 -Fo frag67.dxil test.hlsl -E "sm67_$op" | |
| "$DXIL_SPIRV" --output vert.spv vert.dxil | |
| "$DXIL_SPIRV" --output frag60.spv frag60.dxil | |
| "$DXIL_SPIRV" --output frag67.spv frag67.dxil | |
| fossilize-synth --vert vert.spv --frag frag60.spv --output sm60.foz | |
| fossilize-synth --vert vert.spv --frag frag67.spv --output sm67.foz | |
| mkdir -p /tmp/sm60-output | |
| mkdir -p /tmp/sm67-output | |
| # -f: on a first run the directories are empty, the glob stays unexpanded and a plain rm | |
| # fails on the literal "*" path, which aborts the whole script under `set -e`. | |
| rm -f /tmp/sm60-output/* | |
| rm -f /tmp/sm67-output/* | |
| fossilize-disasm --target isa --output /tmp/sm60-output sm60.foz | |
| fossilize-disasm --target isa --output /tmp/sm67-output sm67.foz | |
| echo "== With helper lanes ==" | |
| cat /tmp/sm67-output/*.frag | |
| echo "== Without helper lanes ==" | |
| cat /tmp/sm60-output/*.frag | |
| } | |
| # Run each wave-op test in turn, capturing its output in <op>.txt. | |
| for op in WaveReadLaneFirst WaveActiveAllEqual WaveActiveCountBits WaveActiveMin WavePrefixProduct WaveIsFirstLane; do | |
| run_test "$op" > "$op.txt" | |
| done |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| == With helper lanes == | |
| Representation: NIR Shader(s) (The optimized NIR shader(s)) | |
| shader: MESA_SHADER_FRAGMENT | |
| source_sha1: {0xac104c1f, 0x83f461d4, 0x782a7d10, 0x760c676a, 0x9f393254} | |
| stage: 4 | |
| next_stage: 0 | |
| num_textures: 2 | |
| inputs_read: 32 | |
| outputs_written: 4 | |
| subgroup_size: 0 | |
| uses_wide_subgroup_intrinsics: true | |
| divergence_analysis_run: true | |
| bit_sizes_float: 0x20 | |
| bit_sizes_int: 0x21 | |
| separate_shader: true | |
| needs_quad_helper_invocations: true | |
| needs_all_helper_invocations: true | |
| origin_upper_left: true | |
| inputs: 1 | |
| outputs: 0 | |
| uniforms: 0 | |
| decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0) | |
| decl_var uniform INTERP_MODE_NONE restrict sampler @1 (~0, 0, 1) | |
| decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD@2 (VARYING_SLOT_VAR0.y, 0, 0) | |
| decl_function main (0 params) | |
| impl main { | |
| block block_0: | |
| /* preds: */ | |
| vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0) | |
| vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0) | |
| vec1 32 con ssa_2 = load_const (0xffff8000 = -nan) | |
| vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2 | |
| vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000) | |
| vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0) | |
| vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000) | |
| vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0) | |
| vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec2 32 div ssa_10 = vec2 ssa_9, ssa_8 | |
| vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler) | |
| vec1 32 div ssa_12 = mov ssa_11.x | |
| vec1 32 con ssa_13 = intrinsic read_first_invocation (ssa_12) () | |
| vec1 1 div ssa_14 = feq ssa_13, ssa_11.x | |
| vec1 1 con ssa_15 = intrinsic vote_all (ssa_14) () | |
| vec1 32 con ssa_16 = b2f32 ssa_15 | |
| vec1 32 div ssa_17 = fadd ssa_16, ssa_11.x | |
| vec1 32 con ssa_18 = undefined | |
| vec1 32 div ssa_19 = pack_half_2x16_split ssa_17, ssa_18 | |
| vec1 32 con ssa_20 = undefined | |
| vec4 32 div ssa_21 = vec4 ssa_19, ssa_20, ssa_20, ssa_20 | |
| intrinsic export_amd (ssa_21) (base=0, wrmask=xy, flags=7) | |
| /* succs: block_1 */ | |
| block block_1: | |
| } | |
| Representation: ACO IR (The ACO IR after some optimizations) | |
| After Spilling: | |
| ACO shader stage: fragment_fs | |
| BB0 | |
| /* logical preds: / linear preds: / kind: uniform, top-level, export_end, */ | |
| s2: %23:s[0-1], s1: %24:s[2], s1: %25:s[3], s1: %26:s[4], v2: %27:v[0-1] = p_startpgm | |
| s2: (kill)%29, s1: (kill)%28:scc = p_init_scratch (kill)%23, (latekill)(kill)%26 | |
| s2: %0:exec, s1: (kill)%61:scc = s_wqm_b64 %0:exec | |
| p_logical_start | |
| v1: %30, v1: %31 = p_split_vector (kill)%27 | |
| s2: %4 = p_create_vector (kill)%24, 0xffff8000 | |
| s8: %6 = s_load_dwordx8 %4, 0 | |
| s4: %8 = s_load_dwordx4 (kill)%4, 64 | |
| v1: %46 = v_interp_p1_f32 %30, %25:m0 attr0.y | |
| v1: %9 = v_interp_p2_f32 %31, %25:m0, (kill)%46 attr0.y | |
| v1: %47 = v_interp_p1_f32 (kill)%30, %25:m0 attr0.x | |
| v1: %10 = v_interp_p2_f32 (kill)%31, (kill)%25:m0, (kill)%47 attr0.x | |
| v1: %49 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d | |
| v1: %48 = p_wqm (kill)%49 | |
| s1: %50 = v_readfirstlane_b32 %48 | |
| s1: %14 = p_wqm (kill)%50 | |
| s2: %52 = v_cmp_neq_f32 (kill)%14, %48 | |
| s2: (kill)%54, s1: %53:scc = s_and_b64 (kill)%52, %0:exec | |
| s1: %55:scc = p_wqm (kill)%53 | |
| s2: %56 = s_cselect_b64 -1, 0, (kill)%55:scc | |
| s2: (kill)%16, s1: %57:scc = s_not_b64 (kill)%56 | |
| s1: %17 = s_mul_i32 1.0, (kill)%57 | |
| v1: %18 = v_add_f32 (kill)%17, (kill)%48 | |
| v1: %20 = v_cvt_pkrtz_f16_f32 (kill)%18, 0 | |
| exp (kill)%20, v1: undef, v1: undef, v1: undef en:rg** compr mrt0 | |
| p_logical_end | |
| s_endpgm | |
| Representation: Assembly (Final Assembly) | |
| BB0: | |
| s_wqm_b64 exec, exec ; befe0a7e | |
| s_mov_b32 s0, s3 ; be800303 | |
| s_movk_i32 s3, 0x8000 ; b0038000 | |
| s_clause 0x1 ; bfa10001 | |
| s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000 | |
| s_load_dwordx4 s[4:7], s[2:3], 0x40 ; f4080101 fa000040 | |
| s_mov_b32 m0, s0 ; befc0300 | |
| v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100 | |
| v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101 | |
| v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000 | |
| v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001 | |
| s_waitcnt lgkmcnt(0) ; bf8cc07f | |
| image_sample v0, [v0, v2], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00220000 00000002 | |
| s_waitcnt vmcnt(0) ; bf8c3f70 | |
| v_readfirstlane_b32 s0, v0 ; 7e000500 | |
| v_cmp_neq_f32_e32 vcc, s0, v0 ; 7c1a0000 | |
| s_and_b64 s[0:1], vcc, exec ; 87807e6a | |
| s_cselect_b64 s[0:1], -1, 0 ; 858080c1 | |
| s_not_b64 s[0:1], s[0:1] ; be800800 | |
| s_mul_i32 s0, 1.0, src_scc ; 9300fdf2 | |
| v_add_f32_e32 v0, s0, v0 ; 06000000 | |
| v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100 | |
| exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000 | |
| s_endpgm ; bf810000 | |
| Driver pipeline hash (Driver pipeline hash used by RGP): 13511583485773036569 | |
| SGPRs (Number of SGPR registers allocated per subgroup): 128 | |
| VGPRs (Number of VGPR registers allocated per subgroup): 8 | |
| Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0 | |
| Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0 | |
| Code size (Code size in bytes): 120 | |
| LDS size (LDS size in bytes per workgroup): 0 | |
| Scratch size (Private memory in bytes per subgroup): 0 | |
| Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32 | |
| Hash (CRC32 hash of code and constant data): 2542461091 | |
| Instructions (Instruction count): 24 | |
| Copies (Copy instructions created for pseudo-instructions): 3 | |
| Branches (Branch instructions): 0 | |
| Latency (Issue cycles plus stall cycles): 411 | |
| Inverse Throughput (Estimated busy cycles to execute one wave): 26 | |
| VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 1 | |
| SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 1 | |
| Pre-Sched SGPRs (SGPR usage before scheduling): 13 | |
| Pre-Sched VGPRs (VGPR usage before scheduling): 3 | |
| == Without helper lanes == | |
| Representation: NIR Shader(s) (The optimized NIR shader(s)) | |
| shader: MESA_SHADER_FRAGMENT | |
| source_sha1: {0xf4e791ce, 0x809f4864, 0xda6780d1, 0xf4febaa1, 0x1a634f83} | |
| stage: 4 | |
| next_stage: 0 | |
| num_textures: 2 | |
| inputs_read: 32 | |
| outputs_written: 4 | |
| system_values_read: 0x00000000'00000000'08000000 | |
| subgroup_size: 0 | |
| uses_wide_subgroup_intrinsics: true | |
| divergence_analysis_run: true | |
| bit_sizes_float: 0x20 | |
| bit_sizes_int: 0x21 | |
| separate_shader: true | |
| needs_quad_helper_invocations: true | |
| needs_all_helper_invocations: true | |
| origin_upper_left: true | |
| inputs: 1 | |
| outputs: 0 | |
| uniforms: 0 | |
| decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0) | |
| decl_var uniform INTERP_MODE_NONE restrict sampler @1 (~0, 0, 1) | |
| decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD@2 (VARYING_SLOT_VAR0.y, 0, 0) | |
| decl_function main (0 params) | |
| impl main { | |
| block block_0: | |
| /* preds: */ | |
| vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0) | |
| vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0) | |
| vec1 32 con ssa_2 = load_const (0xffff8000 = -nan) | |
| vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2 | |
| vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000) | |
| vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0) | |
| vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000) | |
| vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0) | |
| vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec2 32 div ssa_10 = vec2 ssa_9, ssa_8 | |
| vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler) | |
| vec1 1 div ssa_12 = intrinsic load_helper_invocation () () | |
| /* succs: block_1 block_2 */ | |
| if ssa_12 { | |
| block block_1: | |
| /* preds: block_0 */ | |
| vec1 1 con ssa_13 = undefined | |
| /* succs: block_3 */ | |
| } else { | |
| block block_2: | |
| /* preds: block_0 */ | |
| vec1 32 div ssa_14 = mov ssa_11.x | |
| vec1 32 con ssa_15 = intrinsic read_first_invocation (ssa_14) () | |
| vec1 1 div ssa_16 = feq ssa_15, ssa_11.x | |
| vec1 1 con ssa_17 = intrinsic vote_all (ssa_16) () | |
| /* succs: block_3 */ | |
| } | |
| block block_3: | |
| /* preds: block_1 block_2 */ | |
| vec1 1 con ssa_18 = phi block_1: ssa_13, block_2: ssa_17 | |
| vec1 32 con ssa_19 = b2f32 ssa_18 | |
| vec1 32 div ssa_20 = fadd ssa_19, ssa_11.x | |
| vec1 32 con ssa_21 = undefined | |
| vec1 32 div ssa_22 = pack_half_2x16_split ssa_20, ssa_21 | |
| vec1 32 con ssa_23 = undefined | |
| vec4 32 div ssa_24 = vec4 ssa_22, ssa_23, ssa_23, ssa_23 | |
| intrinsic export_amd (ssa_24) (base=0, wrmask=xy, flags=7) | |
| /* succs: block_4 */ | |
| block block_4: | |
| } | |
| Representation: ACO IR (The ACO IR after some optimizations) | |
| After Spilling: | |
| ACO shader stage: fragment_fs | |
| BB0 | |
| /* logical preds: / linear preds: / kind: top-level, branch, needs_lowering, */ | |
| s2: %26:s[0-1], s1: %27:s[2], s1: %28:s[3], s1: %29:s[4], v2: %30:v[0-1] = p_startpgm | |
| s2: (kill)%32, s1: (kill)%31:scc = p_init_scratch (kill)%26, (latekill)(kill)%29 | |
| s2: %70 = p_parallelcopy %0:exec | |
| s2: %0:exec, s1: (kill)%71:scc = s_wqm_b64 %70 | |
| p_logical_start | |
| v1: %33, v1: %34 = p_split_vector (kill)%30 | |
| s2: %4 = p_create_vector (kill)%27, 0xffff8000 | |
| s8: %6 = s_load_dwordx8 %4, 0 | |
| s4: %8 = s_load_dwordx4 (kill)%4, 64 | |
| v1: %49 = v_interp_p1_f32 %33, %28:m0 attr0.y | |
| v1: %9 = v_interp_p2_f32 %34, %28:m0, (kill)%49 attr0.y | |
| v1: %50 = v_interp_p1_f32 (kill)%33, %28:m0 attr0.x | |
| v1: %10 = v_interp_p2_f32 (kill)%34, (kill)%28:m0, (kill)%50 attr0.x | |
| v1: %52 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d | |
| v1: %51 = p_wqm (kill)%52 | |
| s2: %13, s1: (kill)%72:scc = s_andn2_b64 %0:exec, %70 | |
| p_logical_end | |
| s2: %74, s1: (kill)%73:scc, s2: %0:exec = s_and_saveexec_b64 (kill)%13, %0:exec | |
| s2: (kill)%75 = p_cbranch_z %0:exec BB2, BB1 | |
| BB1 | |
| /* logical preds: BB0, / linear preds: BB0, / kind: uniform, */ | |
| p_logical_start | |
| p_logical_end | |
| s2: (kill)%54 = p_branch BB3 | |
| BB2 | |
| /* logical preds: / linear preds: BB0, / kind: uniform, */ | |
| s2: (kill)%55 = p_branch BB3 | |
| BB3 | |
| /* logical preds: / linear preds: BB1, BB2, / kind: invert, */ | |
| s2: %0:exec, s1: (kill)%76:scc = s_andn2_b64 (kill)%74, %0:exec | |
| s2: (kill)%77 = p_cbranch_z %0:exec BB5, BB4 | |
| BB4 | |
| /* logical preds: BB0, / linear preds: BB3, / kind: uniform, */ | |
| p_logical_start | |
| s1: %57 = v_readfirstlane_b32 %51 | |
| s1: %16 = p_wqm (kill)%57 | |
| s2: %59 = v_cmp_neq_f32 (kill)%16, %51 | |
| s2: (kill)%61, s1: %60:scc = s_and_b64 (kill)%59, %0:exec | |
| s1: %62:scc = p_wqm (kill)%60 | |
| s2: %63 = s_cselect_b64 -1, 0, (kill)%62:scc | |
| s2: %18, s1: (kill)%64:scc = s_not_b64 (kill)%63 | |
| p_logical_end | |
| s2: (kill)%65 = p_branch BB6 | |
| BB5 | |
| /* logical preds: / linear preds: BB3, / kind: uniform, */ | |
| s2: (kill)%66 = p_branch BB6 | |
| BB6 | |
| /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, export_end, */ | |
| s2: %19 = p_linear_phi (kill)%18, s2: undef | |
| s2: %0:exec = p_parallelcopy (kill)%70 | |
| p_logical_start | |
| s2: (kill)%68, s1: %67:scc = s_and_b64 (kill)%19, %0:exec | |
| s1: %20 = s_mul_i32 1.0, (kill)%67 | |
| v1: %21 = v_add_f32 (kill)%20, (kill)%51 | |
| v1: %23 = v_cvt_pkrtz_f16_f32 (kill)%21, 0 | |
| exp (kill)%23, v1: undef, v1: undef, v1: undef en:rg** compr mrt0 | |
| p_logical_end | |
| s_endpgm | |
| Representation: Assembly (Final Assembly) | |
| BB0: | |
| s_mov_b64 s[0:1], exec ; be80047e | |
| s_wqm_b64 exec, s[0:1] ; befe0a00 | |
| s_mov_b32 s4, s3 ; be840303 | |
| s_movk_i32 s3, 0x8000 ; b0038000 | |
| s_clause 0x1 ; bfa10001 | |
| s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000 | |
| s_load_dwordx4 s[16:19], s[2:3], 0x40 ; f4080401 fa000040 | |
| s_mov_b32 m0, s4 ; befc0304 | |
| v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100 | |
| v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101 | |
| v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000 | |
| v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001 | |
| s_waitcnt lgkmcnt(0) ; bf8cc07f | |
| image_sample v0, [v0, v2], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00820000 00000002 | |
| s_andn2_b64 s[2:3], exec, s[0:1] ; 8a82007e | |
| s_and_saveexec_b64 s[2:3], s[2:3] ; be822402 | |
| BB3: | |
| s_andn2_b64 exec, s[2:3], exec ; 8afe7e02 | |
| BB4: | |
| s_waitcnt vmcnt(0) ; bf8c3f70 | |
| v_readfirstlane_b32 s2, v0 ; 7e040500 | |
| v_cmp_neq_f32_e32 vcc, s2, v0 ; 7c1a0002 | |
| s_and_b64 s[2:3], vcc, exec ; 87827e6a | |
| s_cselect_b64 s[2:3], -1, 0 ; 858280c1 | |
| s_not_b64 s[2:3], s[2:3] ; be820802 | |
| BB6: | |
| s_mov_b64 exec, s[0:1] ; befe0400 | |
| s_and_b64 s[0:1], s[2:3], exec ; 87807e02 | |
| s_mul_i32 s0, 1.0, src_scc ; 9300fdf2 | |
| s_waitcnt vmcnt(0) ; bf8c3f70 | |
| v_add_f32_e32 v0, s0, v0 ; 06000000 | |
| v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100 | |
| exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000 | |
| s_endpgm ; bf810000 | |
| Driver pipeline hash (Driver pipeline hash used by RGP): 8436903613772059513 | |
| SGPRs (Number of SGPR registers allocated per subgroup): 128 | |
| VGPRs (Number of VGPR registers allocated per subgroup): 8 | |
| Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0 | |
| Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0 | |
| Code size (Code size in bytes): 148 | |
| LDS size (LDS size in bytes per workgroup): 0 | |
| Scratch size (Private memory in bytes per subgroup): 0 | |
| Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32 | |
| Hash (CRC32 hash of code and constant data): 989438680 | |
| Instructions (Instruction count): 31 | |
| Copies (Copy instructions created for pseudo-instructions): 5 | |
| Branches (Branch instructions): 0 | |
| Latency (Issue cycles plus stall cycles): 643 | |
| Inverse Throughput (Estimated busy cycles to execute one wave): 40 | |
| VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 1 | |
| SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 1 | |
| Pre-Sched SGPRs (SGPR usage before scheduling): 15 | |
| Pre-Sched VGPRs (VGPR usage before scheduling): 3 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Building WaveActiveCountBits | |
| == With helper lanes == | |
| Representation: NIR Shader(s) (The optimized NIR shader(s)) | |
| shader: MESA_SHADER_FRAGMENT | |
| source_sha1: {0xce583982, 0xdf4de895, 0x64f1e9e3, 0xafc370e1, 0x3389e093} | |
| stage: 4 | |
| next_stage: 0 | |
| num_textures: 2 | |
| inputs_read: 32 | |
| outputs_written: 4 | |
| subgroup_size: 0 | |
| uses_wide_subgroup_intrinsics: true | |
| divergence_analysis_run: true | |
| bit_sizes_float: 0x20 | |
| bit_sizes_int: 0x61 | |
| separate_shader: true | |
| needs_quad_helper_invocations: true | |
| needs_all_helper_invocations: true | |
| origin_upper_left: true | |
| inputs: 1 | |
| outputs: 0 | |
| uniforms: 0 | |
| decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0) | |
| decl_var uniform INTERP_MODE_NONE restrict sampler @1 (~0, 0, 1) | |
| decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD@2 (VARYING_SLOT_VAR0.y, 0, 0) | |
| decl_function main (0 params) | |
| impl main { | |
| block block_0: | |
| /* preds: */ | |
| vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0) | |
| vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0) | |
| vec1 32 con ssa_2 = load_const (0xffff8000 = -nan) | |
| vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2 | |
| vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000) | |
| vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0) | |
| vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000) | |
| vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0) | |
| vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec2 32 div ssa_10 = vec2 ssa_9, ssa_8 | |
| vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler) | |
| vec1 1 div ssa_12 = fneu! ssa_11.x, ssa_4 | |
| vec1 64 con ssa_13 = intrinsic ballot (ssa_12) () | |
| vec2 32 con ssa_14 = unpack_64_2x32 ssa_13 | |
| vec1 64 con ssa_15 = pack_64_2x32_split ssa_14.x, ssa_14.y | |
| vec1 32 con ssa_16 = bit_count ssa_15 | |
| vec1 32 con ssa_17 = u2f32 ssa_16 | |
| vec1 32 div ssa_18 = fadd ssa_17, ssa_11.x | |
| vec1 32 con ssa_19 = undefined | |
| vec1 32 div ssa_20 = pack_half_2x16_split ssa_18, ssa_19 | |
| vec1 32 con ssa_21 = undefined | |
| vec4 32 div ssa_22 = vec4 ssa_20, ssa_21, ssa_21, ssa_21 | |
| intrinsic export_amd (ssa_22) (base=0, wrmask=xy, flags=7) | |
| /* succs: block_1 */ | |
| block block_1: | |
| } | |
| Representation: ACO IR (The ACO IR after some optimizations) | |
| After Spilling: | |
| ACO shader stage: fragment_fs | |
| BB0 | |
| /* logical preds: / linear preds: / kind: uniform, top-level, export_end, */ | |
| s2: %24:s[0-1], s1: %25:s[2], s1: %26:s[3], s1: %27:s[4], v2: %28:v[0-1] = p_startpgm | |
| s2: (kill)%30, s1: (kill)%29:scc = p_init_scratch (kill)%24, (latekill)(kill)%27 | |
| s2: %0:exec, s1: (kill)%57:scc = s_wqm_b64 %0:exec | |
| p_logical_start | |
| v1: %31, v1: %32 = p_split_vector (kill)%28 | |
| s2: %4 = p_create_vector (kill)%25, 0xffff8000 | |
| s8: %6 = s_load_dwordx8 %4, 0 | |
| s4: %8 = s_load_dwordx4 (kill)%4, 64 | |
| v1: %47 = v_interp_p1_f32 %31, %26:m0 attr0.y | |
| v1: %9 = v_interp_p2_f32 %32, %26:m0, (kill)%47 attr0.y | |
| v1: %48 = v_interp_p1_f32 (kill)%31, %26:m0 attr0.x | |
| v1: %10 = v_interp_p2_f32 (kill)%32, (kill)%26:m0, (kill)%48 attr0.x | |
| v1: %50 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d | |
| v1: %49 = p_wqm (kill)%50 | |
| s2: %52 = v_cmp_neq_f32 0, %49 | |
| s2: %14 = p_wqm (kill)%52 | |
| s1: %17, s1: (kill)%55:scc = s_bcnt1_i32_b64 (kill)%14 | |
| v1: %18 = v_cvt_f32_u32 (kill)%17 | |
| v1: %19 = v_add_f32 (kill)%18, (kill)%49 | |
| v1: %21 = v_cvt_pkrtz_f16_f32 (kill)%19, 0 | |
| exp (kill)%21, v1: undef, v1: undef, v1: undef en:rg** compr mrt0 | |
| p_logical_end | |
| s_endpgm | |
| Representation: Assembly (Final Assembly) | |
| BB0: | |
| s_wqm_b64 exec, exec ; befe0a7e | |
| s_mov_b32 s0, s3 ; be800303 | |
| s_movk_i32 s3, 0x8000 ; b0038000 | |
| s_clause 0x1 ; bfa10001 | |
| s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000 | |
| s_load_dwordx4 s[4:7], s[2:3], 0x40 ; f4080101 fa000040 | |
| s_mov_b32 m0, s0 ; befc0300 | |
| v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100 | |
| v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101 | |
| v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000 | |
| v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001 | |
| s_waitcnt lgkmcnt(0) ; bf8cc07f | |
| image_sample v0, [v0, v2], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00220000 00000002 | |
| s_waitcnt vmcnt(0) ; bf8c3f70 | |
| v_cmp_neq_f32_e32 vcc, 0, v0 ; 7c1a0080 | |
| s_bcnt1_i32_b64 s0, vcc ; be80106a | |
| v_cvt_f32_u32_e32 v1, s0 ; 7e020c00 | |
| v_add_f32_e32 v0, v1, v0 ; 06000101 | |
| v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100 | |
| exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000 | |
| s_endpgm ; bf810000 | |
| Driver pipeline hash (Driver pipeline hash used by RGP): 18019751637580729524 | |
| SGPRs (Number of SGPR registers allocated per subgroup): 128 | |
| VGPRs (Number of VGPR registers allocated per subgroup): 8 | |
| Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0 | |
| Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0 | |
| Code size (Code size in bytes): 108 | |
| LDS size (LDS size in bytes per workgroup): 0 | |
| Scratch size (Private memory in bytes per subgroup): 0 | |
| Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32 | |
| Hash (CRC32 hash of code and constant data): 703231431 | |
| Instructions (Instruction count): 21 | |
| Copies (Copy instructions created for pseudo-instructions): 3 | |
| Branches (Branch instructions): 0 | |
| Latency (Issue cycles plus stall cycles): 405 | |
| Inverse Throughput (Estimated busy cycles to execute one wave): 25 | |
| VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 1 | |
| SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 1 | |
| Pre-Sched SGPRs (SGPR usage before scheduling): 13 | |
| Pre-Sched VGPRs (VGPR usage before scheduling): 3 | |
| == Without helper lanes == | |
| Representation: NIR Shader(s) (The optimized NIR shader(s)) | |
| shader: MESA_SHADER_FRAGMENT | |
| source_sha1: {0x3e248076, 0x7c0b38d3, 0x7724c093, 0xba94b689, 0x075cef04} | |
| stage: 4 | |
| next_stage: 0 | |
| num_textures: 2 | |
| inputs_read: 32 | |
| outputs_written: 4 | |
| system_values_read: 0x00000000'00000000'08000000 | |
| subgroup_size: 0 | |
| uses_wide_subgroup_intrinsics: true | |
| divergence_analysis_run: true | |
| bit_sizes_float: 0x20 | |
| bit_sizes_int: 0x61 | |
| separate_shader: true | |
| needs_quad_helper_invocations: true | |
| needs_all_helper_invocations: true | |
| origin_upper_left: true | |
| inputs: 1 | |
| outputs: 0 | |
| uniforms: 0 | |
| decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0) | |
| decl_var uniform INTERP_MODE_NONE restrict sampler @1 (~0, 0, 1) | |
| decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD@2 (VARYING_SLOT_VAR0.y, 0, 0) | |
| decl_function main (0 params) | |
| impl main { | |
| block block_0: | |
| /* preds: */ | |
| vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0) | |
| vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0) | |
| vec1 32 con ssa_2 = load_const (0xffff8000 = -nan) | |
| vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2 | |
| vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000) | |
| vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0) | |
| vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000) | |
| vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0) | |
| vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec2 32 div ssa_10 = vec2 ssa_9, ssa_8 | |
| vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler) | |
| vec1 1 div ssa_12 = intrinsic load_helper_invocation () () | |
| vec1 1 div ssa_13 = inot ssa_12 | |
| vec1 1 div ssa_14 = fneu! ssa_11.x, ssa_4 | |
| vec1 1 div ssa_15 = iand ssa_14, ssa_13 | |
| vec1 64 con ssa_16 = intrinsic ballot (ssa_15) () | |
| vec2 32 con ssa_17 = unpack_64_2x32 ssa_16 | |
| vec1 64 con ssa_18 = pack_64_2x32_split ssa_17.x, ssa_17.y | |
| vec1 32 con ssa_19 = bit_count ssa_18 | |
| vec1 32 con ssa_20 = u2f32 ssa_19 | |
| vec1 32 div ssa_21 = fadd ssa_20, ssa_11.x | |
| vec1 32 con ssa_22 = undefined | |
| vec1 32 div ssa_23 = pack_half_2x16_split ssa_21, ssa_22 | |
| vec1 32 con ssa_24 = undefined | |
| vec4 32 div ssa_25 = vec4 ssa_23, ssa_24, ssa_24, ssa_24 | |
| intrinsic export_amd (ssa_25) (base=0, wrmask=xy, flags=7) | |
| /* succs: block_1 */ | |
| block block_1: | |
| } | |
| Representation: ACO IR (The ACO IR after some optimizations) | |
| After Spilling: | |
| ACO shader stage: fragment_fs | |
| BB0 | |
| /* logical preds: / linear preds: / kind: uniform, top-level, needs_lowering, export_end, */ | |
| s2: %27:s[0-1], s1: %28:s[2], s1: %29:s[3], s1: %30:s[4], v2: %31:v[0-1] = p_startpgm | |
| s2: (kill)%33, s1: (kill)%32:scc = p_init_scratch (kill)%27, (latekill)(kill)%30 | |
| s2: %62 = p_parallelcopy %0:exec | |
| s2: %0:exec, s1: (kill)%63:scc = s_wqm_b64 %62 | |
| p_logical_start | |
| v1: %34, v1: %35 = p_split_vector (kill)%31 | |
| s2: %4 = p_create_vector (kill)%28, 0xffff8000 | |
| s8: %6 = s_load_dwordx8 %4, 0 | |
| s4: %8 = s_load_dwordx4 (kill)%4, 64 | |
| v1: %50 = v_interp_p1_f32 %34, %29:m0 attr0.y | |
| v1: %9 = v_interp_p2_f32 %35, %29:m0, (kill)%50 attr0.y | |
| v1: %51 = v_interp_p1_f32 (kill)%34, %29:m0 attr0.x | |
| v1: %10 = v_interp_p2_f32 (kill)%35, (kill)%29:m0, (kill)%51 attr0.x | |
| v1: %53 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d | |
| v1: %52 = p_wqm (kill)%53 | |
| s2: %13, s1: (kill)%64:scc = s_andn2_b64 %0:exec, %62 | |
| s2: %15 = v_cmp_neq_f32 0, %52 | |
| s2: %16, s1: (kill)%55:scc = s_andn2_b64 (kill)%15, (kill)%13 | |
| s2: %57, s1: (kill)%56:scc = s_and_b64 (kill)%16, %0:exec | |
| s2: %17 = p_wqm (kill)%57 | |
| s1: %20, s1: (kill)%60:scc = s_bcnt1_i32_b64 (kill)%17 | |
| v1: %21 = v_cvt_f32_u32 (kill)%20 | |
| v1: %22 = v_add_f32 (kill)%21, (kill)%52 | |
| v1: %24 = v_cvt_pkrtz_f16_f32 (kill)%22, 0 | |
| s2: %0:exec = p_parallelcopy (kill)%62 | |
| exp (kill)%24, v1: undef, v1: undef, v1: undef en:rg** compr mrt0 | |
| p_logical_end | |
| s_endpgm | |
| Representation: Assembly (Final Assembly) | |
| BB0: | |
| s_mov_b64 s[0:1], exec ; be80047e | |
| s_wqm_b64 exec, s[0:1] ; befe0a00 | |
| s_mov_b32 s4, s3 ; be840303 | |
| s_movk_i32 s3, 0x8000 ; b0038000 | |
| s_clause 0x1 ; bfa10001 | |
| s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000 | |
| s_load_dwordx4 s[16:19], s[2:3], 0x40 ; f4080401 fa000040 | |
| s_mov_b32 m0, s4 ; befc0304 | |
| v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100 | |
| v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101 | |
| v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000 | |
| v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001 | |
| s_waitcnt lgkmcnt(0) ; bf8cc07f | |
| image_sample v0, [v0, v2], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00820000 00000002 | |
| s_andn2_b64 s[2:3], exec, s[0:1] ; 8a82007e | |
| s_waitcnt vmcnt(0) ; bf8c3f70 | |
| v_cmp_neq_f32_e32 vcc, 0, v0 ; 7c1a0080 | |
| s_andn2_b64 s[2:3], vcc, s[2:3] ; 8a82026a | |
| s_and_b64 s[2:3], s[2:3], exec ; 87827e02 | |
| s_bcnt1_i32_b64 s2, s[2:3] ; be821002 | |
| v_cvt_f32_u32_e32 v1, s2 ; 7e020c02 | |
| v_add_f32_e32 v0, v1, v0 ; 06000101 | |
| v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100 | |
| s_mov_b64 exec, s[0:1] ; befe0400 | |
| exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000 | |
| s_endpgm ; bf810000 | |
| Driver pipeline hash (Driver pipeline hash used by RGP): 6386560257104097910 | |
| SGPRs (Number of SGPR registers allocated per subgroup): 128 | |
| VGPRs (Number of VGPR registers allocated per subgroup): 8 | |
| Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0 | |
| Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0 | |
| Code size (Code size in bytes): 128 | |
| LDS size (LDS size in bytes per workgroup): 0 | |
| Scratch size (Private memory in bytes per subgroup): 0 | |
| Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32 | |
| Hash (CRC32 hash of code and constant data): 4280024007 | |
| Instructions (Instruction count): 26 | |
| Copies (Copy instructions created for pseudo-instructions): 5 | |
| Branches (Branch instructions): 0 | |
| Latency (Issue cycles plus stall cycles): 411 | |
| Inverse Throughput (Estimated busy cycles to execute one wave): 26 | |
| VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 1 | |
| SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 1 | |
| Pre-Sched SGPRs (SGPR usage before scheduling): 15 | |
| Pre-Sched VGPRs (VGPR usage before scheduling): 3 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Building WaveActiveMin | |
| == With helper lanes == | |
| Representation: NIR Shader(s) (The optimized NIR shader(s)) | |
| shader: MESA_SHADER_FRAGMENT | |
| source_sha1: {0xf1bea900, 0xc5951ac5, 0x5e86f07e, 0x460754e6, 0xef2be9aa} | |
| stage: 4 | |
| next_stage: 0 | |
| num_textures: 2 | |
| inputs_read: 32 | |
| outputs_written: 4 | |
| subgroup_size: 0 | |
| uses_wide_subgroup_intrinsics: true | |
| divergence_analysis_run: true | |
| bit_sizes_float: 0x20 | |
| bit_sizes_int: 0x20 | |
| separate_shader: true | |
| needs_quad_helper_invocations: true | |
| needs_all_helper_invocations: true | |
| origin_upper_left: true | |
| inputs: 1 | |
| outputs: 0 | |
| uniforms: 0 | |
| decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0) | |
| decl_var uniform INTERP_MODE_NONE restrict sampler @1 (~0, 0, 1) | |
| decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD@2 (VARYING_SLOT_VAR0.y, 0, 0) | |
| decl_function main (0 params) | |
| impl main { | |
| block block_0: | |
| /* preds: */ | |
| vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0) | |
| vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0) | |
| vec1 32 con ssa_2 = load_const (0xffff8000 = -nan) | |
| vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2 | |
| vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000) | |
| vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0) | |
| vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000) | |
| vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0) | |
| vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec2 32 div ssa_10 = vec2 ssa_9, ssa_8 | |
| vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler) | |
| vec1 32 div ssa_12 = mov ssa_11.x | |
| vec1 32 con ssa_13 = intrinsic reduce (ssa_12) (reduction_op=fmin, cluster_size=0) | |
| vec1 32 div ssa_14 = fadd ssa_11.x, ssa_13 | |
| vec1 32 con ssa_15 = undefined | |
| vec1 32 div ssa_16 = pack_half_2x16_split ssa_14, ssa_15 | |
| vec1 32 con ssa_17 = undefined | |
| vec4 32 div ssa_18 = vec4 ssa_16, ssa_17, ssa_17, ssa_17 | |
| intrinsic export_amd (ssa_18) (base=0, wrmask=xy, flags=7) | |
| /* succs: block_1 */ | |
| block block_1: | |
| } | |
| Representation: ACO IR (The ACO IR after some optimizations) | |
| After Spilling: | |
| ACO shader stage: fragment_fs | |
| BB0 | |
| /* logical preds: / linear preds: / kind: uniform, top-level, export_end, */ | |
| s2: %20:s[0-1], s1: %21:s[2], s1: %22:s[3], s1: %23:s[4], v2: %24:v[0-1] = p_startpgm | |
| s2: (kill)%26, s1: (kill)%25:scc = p_init_scratch (kill)%20, (latekill)(kill)%23 | |
| s2: %0:exec, s1: (kill)%53:scc = s_wqm_b64 %0:exec | |
| p_logical_start | |
| v1: %27, v1: %28 = p_split_vector (kill)%24 | |
| s2: %4 = p_create_vector (kill)%21, 0xffff8000 | |
| s8: %6 = s_load_dwordx8 %4, 0 | |
| s4: %8 = s_load_dwordx4 (kill)%4, 64 | |
| v1: %43 = v_interp_p1_f32 %27, %22:m0 attr0.y | |
| v1: %9 = v_interp_p2_f32 %28, %22:m0, (kill)%43 attr0.y | |
| v1: %44 = v_interp_p1_f32 (kill)%27, %22:m0 attr0.x | |
| v1: %10 = v_interp_p2_f32 (kill)%28, (kill)%22:m0, (kill)%44 attr0.x | |
| v1: %46 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d | |
| v1: %45 = p_wqm (kill)%46 | |
| lv1: %51 = p_start_linear_vgpr | |
| lv1: %52 = p_start_linear_vgpr | |
| s1: %47, s2: (kill)%48, s1: (kill)%49:scc = p_reduce %45, (kill)%51, (kill)%52 op:fmin32 cluster_size:64 | |
| s1: %14 = p_wqm (kill)%47 | |
| v1: %15 = v_add_f32 (kill)%14, (kill)%45 | |
| v1: %17 = v_cvt_pkrtz_f16_f32 (kill)%15, 0 | |
| exp (kill)%17, v1: undef, v1: undef, v1: undef en:rg** compr mrt0 | |
| p_logical_end | |
| s_endpgm | |
| Representation: Assembly (Final Assembly) | |
| BB0: | |
| s_wqm_b64 exec, exec ; befe0a7e | |
| s_mov_b32 s0, s3 ; be800303 | |
| s_movk_i32 s3, 0x8000 ; b0038000 | |
| s_clause 0x1 ; bfa10001 | |
| s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000 | |
| s_load_dwordx4 s[4:7], s[2:3], 0x40 ; f4080101 fa000040 | |
| s_mov_b32 m0, s0 ; befc0300 | |
| v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100 | |
| v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101 | |
| v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000 | |
| v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001 | |
| s_waitcnt lgkmcnt(0) ; bf8cc07f | |
| image_sample v0, [v0, v2], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00220000 00000002 | |
| s_or_saveexec_b64 s[2:3], -1 ; be8225c1 | |
| v_mov_b32_e32 v1, 0x7f800000 ; 7e0202ff 7f800000 | |
| s_waitcnt vmcnt(0) ; bf8c3f70 | |
| v_cndmask_b32_e64 v1, v1, v0, s[2:3] ; d5010001 000a0101 | |
| v_min_f32_dpp v1, v1, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf fi:1 ; 1e0202fa ff04b101 | |
| v_min_f32_dpp v1, v1, v1 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xf fi:1 ; 1e0202fa ff044e01 | |
| v_min_f32_dpp v1, v1, v1 row_half_mirror row_mask:0xf bank_mask:0xf fi:1 ; 1e0202fa ff054101 | |
| v_min_f32_dpp v1, v1, v1 row_mirror row_mask:0xf bank_mask:0xf fi:1 ; 1e0202fa ff054001 | |
| v_permlanex16_b32 v2, v1, 0, 0 ; d7780002 02010101 | |
| v_min_f32_e32 v1, v1, v2 ; 1e020501 | |
| v_readlane_b32 s0, v1, 0 ; d7600000 00010101 | |
| v_min_f32_e32 v1, s0, v1 ; 1e020200 | |
| s_mov_b64 exec, s[2:3] ; befe0402 | |
| v_readlane_b32 s0, v1, 63 ; d7600000 00017f01 | |
| v_add_f32_e32 v0, s0, v0 ; 06000000 | |
| v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100 | |
| exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000 | |
| s_endpgm ; bf810000 | |
| Driver pipeline hash (Driver pipeline hash used by RGP): 17299820867343601478 | |
| SGPRs (Number of SGPR registers allocated per subgroup): 128 | |
| VGPRs (Number of VGPR registers allocated per subgroup): 8 | |
| Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0 | |
| Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0 | |
| Code size (Code size in bytes): 184 | |
| LDS size (LDS size in bytes per workgroup): 0 | |
| Scratch size (Private memory in bytes per subgroup): 0 | |
| Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32 | |
| Hash (CRC32 hash of code and constant data): 1489409235 | |
| Instructions (Instruction count): 31 | |
| Copies (Copy instructions created for pseudo-instructions): 3 | |
| Branches (Branch instructions): 0 | |
| Latency (Issue cycles plus stall cycles): 451 | |
| Inverse Throughput (Estimated busy cycles to execute one wave): 34 | |
| VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 1 | |
| SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 1 | |
| Pre-Sched SGPRs (SGPR usage before scheduling): 13 | |
| Pre-Sched VGPRs (VGPR usage before scheduling): 3 | |
| == Without helper lanes == | |
| Representation: NIR Shader(s) (The optimized NIR shader(s)) | |
| shader: MESA_SHADER_FRAGMENT | |
| source_sha1: {0x1ab71855, 0x6f50287d, 0xabdc20b6, 0x81c20f8c, 0xa55f34d4} | |
| stage: 4 | |
| next_stage: 0 | |
| num_textures: 2 | |
| inputs_read: 32 | |
| outputs_written: 4 | |
| system_values_read: 0x00000000'00000000'08000000 | |
| subgroup_size: 0 | |
| uses_wide_subgroup_intrinsics: true | |
| divergence_analysis_run: true | |
| bit_sizes_float: 0x20 | |
| bit_sizes_int: 0x21 | |
| separate_shader: true | |
| needs_quad_helper_invocations: true | |
| needs_all_helper_invocations: true | |
| origin_upper_left: true | |
| inputs: 1 | |
| outputs: 0 | |
| uniforms: 0 | |
| decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0) | |
| decl_var uniform INTERP_MODE_NONE restrict sampler @1 (~0, 0, 1) | |
| decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD@2 (VARYING_SLOT_VAR0.y, 0, 0) | |
| decl_function main (0 params) | |
| impl main { | |
| block block_0: | |
| /* preds: */ | |
| vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0) | |
| vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0) | |
| vec1 32 con ssa_2 = load_const (0xffff8000 = -nan) | |
| vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2 | |
| vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000) | |
| vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0) | |
| vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000) | |
| vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0) | |
| vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec2 32 div ssa_10 = vec2 ssa_9, ssa_8 | |
| vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler) | |
| vec1 1 div ssa_12 = intrinsic load_helper_invocation () () | |
| vec1 32 con ssa_13 = load_const (0x7f800000 = inf) | |
| vec1 32 div ssa_14 = bcsel ssa_12, ssa_13, ssa_11.x | |
| vec1 32 con ssa_15 = intrinsic reduce (ssa_14) (reduction_op=fmin, cluster_size=0) | |
| vec1 32 div ssa_16 = fadd ssa_11.x, ssa_15 | |
| vec1 32 con ssa_17 = undefined | |
| vec1 32 div ssa_18 = pack_half_2x16_split ssa_16, ssa_17 | |
| vec1 32 con ssa_19 = undefined | |
| vec4 32 div ssa_20 = vec4 ssa_18, ssa_19, ssa_19, ssa_19 | |
| intrinsic export_amd (ssa_20) (base=0, wrmask=xy, flags=7) | |
| /* succs: block_1 */ | |
| block block_1: | |
| } | |
| Representation: ACO IR (The ACO IR after some optimizations) | |
| After Spilling: | |
| ACO shader stage: fragment_fs | |
| BB0 | |
| /* logical preds: / linear preds: / kind: uniform, top-level, needs_lowering, export_end, */ | |
| s2: %22:s[0-1], s1: %23:s[2], s1: %24:s[3], s1: %25:s[4], v2: %26:v[0-1] = p_startpgm | |
| s2: (kill)%28, s1: (kill)%27:scc = p_init_scratch (kill)%22, (latekill)(kill)%25 | |
| s2: %56 = p_parallelcopy %0:exec | |
| s2: %0:exec, s1: (kill)%57:scc = s_wqm_b64 %56 | |
| p_logical_start | |
| v1: %29, v1: %30 = p_split_vector (kill)%26 | |
| s2: %4 = p_create_vector (kill)%23, 0xffff8000 | |
| s8: %6 = s_load_dwordx8 %4, 0 | |
| s4: %8 = s_load_dwordx4 (kill)%4, 64 | |
| v1: %45 = v_interp_p1_f32 %29, %24:m0 attr0.y | |
| v1: %9 = v_interp_p2_f32 %30, %24:m0, (kill)%45 attr0.y | |
| v1: %46 = v_interp_p1_f32 (kill)%29, %24:m0 attr0.x | |
| v1: %10 = v_interp_p2_f32 (kill)%30, (kill)%24:m0, (kill)%46 attr0.x | |
| v1: %48 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d | |
| v1: %47 = p_wqm (kill)%48 | |
| s2: %13, s1: (kill)%58:scc = s_andn2_b64 %0:exec, %56 | |
| v1: %15 = v_cndmask_b32 %47, 0x7f800000, (kill)%13 | |
| lv1: %54 = p_start_linear_vgpr | |
| lv1: %55 = p_start_linear_vgpr | |
| s1: %50, s2: (kill)%51, s1: (kill)%52:scc = p_reduce (kill)%15, (kill)%54, (kill)%55 op:fmin32 cluster_size:64 | |
| s1: %16 = p_wqm (kill)%50 | |
| v1: %17 = v_add_f32 (kill)%16, (kill)%47 | |
| v1: %19 = v_cvt_pkrtz_f16_f32 (kill)%17, 0 | |
| s2: %0:exec = p_parallelcopy (kill)%56 | |
| exp (kill)%19, v1: undef, v1: undef, v1: undef en:rg** compr mrt0 | |
| p_logical_end | |
| s_endpgm | |
| Representation: Assembly (Final Assembly) | |
| BB0: | |
| s_mov_b64 s[0:1], exec ; be80047e | |
| s_wqm_b64 exec, s[0:1] ; befe0a00 | |
| s_mov_b32 s4, s3 ; be840303 | |
| s_movk_i32 s3, 0x8000 ; b0038000 | |
| s_clause 0x1 ; bfa10001 | |
| s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000 | |
| s_load_dwordx4 s[16:19], s[2:3], 0x40 ; f4080401 fa000040 | |
| s_mov_b32 m0, s4 ; befc0304 | |
| v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100 | |
| v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101 | |
| v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000 | |
| v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001 | |
| s_waitcnt lgkmcnt(0) ; bf8cc07f | |
| image_sample v0, [v0, v2], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00820000 00000002 | |
| s_andn2_b64 s[2:3], exec, s[0:1] ; 8a82007e | |
| s_waitcnt vmcnt(0) ; bf8c3f70 | |
| v_cndmask_b32_e64 v3, v0, 0x7f800000, s[2:3] ; d5010003 0009ff00 7f800000 | |
| s_or_saveexec_b64 s[4:5], -1 ; be8425c1 | |
| v_mov_b32_e32 v1, 0x7f800000 ; 7e0202ff 7f800000 | |
| v_cndmask_b32_e64 v1, v1, v3, s[4:5] ; d5010001 00120701 | |
| v_min_f32_dpp v1, v1, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf fi:1 ; 1e0202fa ff04b101 | |
| v_min_f32_dpp v1, v1, v1 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xf fi:1 ; 1e0202fa ff044e01 | |
| v_min_f32_dpp v1, v1, v1 row_half_mirror row_mask:0xf bank_mask:0xf fi:1 ; 1e0202fa ff054101 | |
| v_min_f32_dpp v1, v1, v1 row_mirror row_mask:0xf bank_mask:0xf fi:1 ; 1e0202fa ff054001 | |
| v_permlanex16_b32 v2, v1, 0, 0 ; d7780002 02010101 | |
| v_min_f32_e32 v1, v1, v2 ; 1e020501 | |
| v_readlane_b32 s2, v1, 0 ; d7600002 00010101 | |
| v_min_f32_e32 v1, s2, v1 ; 1e020202 | |
| s_mov_b64 exec, s[4:5] ; befe0404 | |
| v_readlane_b32 s2, v1, 63 ; d7600002 00017f01 | |
| v_add_f32_e32 v0, s2, v0 ; 06000002 | |
| v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100 | |
| s_mov_b64 exec, s[0:1] ; befe0400 | |
| exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000 | |
| s_endpgm ; bf810000 | |
| Driver pipeline hash (Driver pipeline hash used by RGP): 14036544613523326444 | |
| SGPRs (Number of SGPR registers allocated per subgroup): 128 | |
| VGPRs (Number of VGPR registers allocated per subgroup): 8 | |
| Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0 | |
| Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0 | |
| Code size (Code size in bytes): 208 | |
| LDS size (LDS size in bytes per workgroup): 0 | |
| Scratch size (Private memory in bytes per subgroup): 0 | |
| Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32 | |
| Hash (CRC32 hash of code and constant data): 1465341766 | |
| Instructions (Instruction count): 35 | |
| Copies (Copy instructions created for pseudo-instructions): 5 | |
| Branches (Branch instructions): 0 | |
| Latency (Issue cycles plus stall cycles): 462 | |
| Inverse Throughput (Estimated busy cycles to execute one wave): 36 | |
| VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 1 | |
| SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 1 | |
| Pre-Sched SGPRs (SGPR usage before scheduling): 15 | |
| Pre-Sched VGPRs (VGPR usage before scheduling): 4 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Building WaveIsFirstLane | |
| == With helper lanes == | |
| Representation: NIR Shader(s) (The optimized NIR shader(s)) | |
| shader: MESA_SHADER_FRAGMENT | |
| source_sha1: {0xb38253d4, 0x2a6b4312, 0xb4b4a853, 0xac7cbb67, 0xc10fe263} | |
| stage: 4 | |
| next_stage: 0 | |
| num_textures: 2 | |
| num_images: 1 | |
| inputs_read: 32 | |
| outputs_written: 4 | |
| subgroup_size: 0 | |
| uses_wide_subgroup_intrinsics: true | |
| divergence_analysis_run: true | |
| bit_sizes_int: 0x20 | |
| separate_shader: true | |
| writes_memory: true | |
| needs_quad_helper_invocations: true | |
| needs_all_helper_invocations: true | |
| origin_upper_left: true | |
| inputs: 1 | |
| outputs: 0 | |
| uniforms: 0 | |
| decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0) | |
| decl_var image INTERP_MODE_NONE restrict writeonly r32_uint uimageBuffer @1 (~0, 0, 2) | |
| decl_var uniform INTERP_MODE_NONE restrict sampler @2 (~0, 0, 1) | |
| decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD@3 (VARYING_SLOT_VAR0.y, 0, 0) | |
| decl_function main (0 params) | |
| impl main { | |
| block block_0: | |
| /* preds: */ | |
| vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0) | |
| vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0) | |
| vec1 32 con ssa_2 = load_const (0xffff8000 = -nan) | |
| vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2 | |
| vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000) | |
| vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0) | |
| vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000) | |
| vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0) | |
| vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec2 32 div ssa_10 = vec2 ssa_9, ssa_8 | |
| vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler) | |
| vec1 1 div ssa_12 = intrinsic elect () () | |
| /* succs: block_1 block_2 */ | |
| if ssa_12 { | |
| block block_1: | |
| /* preds: block_0 */ | |
| vec1 32 con ssa_13 = load_const (0x00000050 = 0.000000) | |
| vec4 32 con ssa_14 = intrinsic load_smem_amd (ssa_3, ssa_13) (align_mul=16, align_offset=0) | |
| vec1 32 con ssa_15 = undefined | |
| vec4 32 con ssa_16 = vec4 ssa_4, ssa_15, ssa_15, ssa_15 | |
| vec1 32 con ssa_17 = undefined | |
| vec1 32 con ssa_18 = load_const (0x000003e8 = 0.000000) | |
| intrinsic bindless_image_store (ssa_14, ssa_16, ssa_17, ssa_18, ssa_4) (image_dim=Buf, image_array=false, format=r32_uint, access=10, src_type=uint32) | |
| /* succs: block_3 */ | |
| } else { | |
| block block_2: | |
| /* preds: block_0 */ | |
| /* succs: block_3 */ | |
| } | |
| block block_3: | |
| /* preds: block_1 block_2 */ | |
| vec1 32 con ssa_19 = undefined | |
| vec1 32 div ssa_20 = pack_half_2x16_split ssa_11.x, ssa_19 | |
| vec1 32 con ssa_21 = undefined | |
| vec4 32 div ssa_22 = vec4 ssa_20, ssa_21, ssa_21, ssa_21 | |
| intrinsic export_amd (ssa_22) (base=0, wrmask=xy, flags=7) | |
| /* succs: block_4 */ | |
| block block_4: | |
| } | |
| Representation: ACO IR (The ACO IR after some optimizations) | |
| After Spilling: | |
| ACO shader stage: fragment_fs | |
| BB0 | |
| /* logical preds: / linear preds: / kind: top-level, branch, needs_lowering, */ | |
| s2: %24:s[0-1], s1: %25:s[2], s1: %26:s[3], s1: %27:s[4], v2: %28:v[0-1] = p_startpgm | |
| s2: (kill)%30, s1: (kill)%29:scc = p_init_scratch (kill)%24, (latekill)(kill)%27 | |
| s2: %65 = p_parallelcopy %0:exec | |
| s2: %0:exec, s1: (kill)%66:scc = s_wqm_b64 %65 | |
| p_logical_start | |
| v1: %31, v1: %32 = p_split_vector (kill)%28 | |
| s2: %4 = p_create_vector (kill)%25, 0xffff8000 | |
| s8: %6 = s_load_dwordx8 %4, 0 | |
| s4: %8 = s_load_dwordx4 %4, 64 | |
| v1: %47 = v_interp_p1_f32 %31, %26:m0 attr0.y | |
| v1: %9 = v_interp_p2_f32 %32, %26:m0, (kill)%47 attr0.y | |
| v1: %48 = v_interp_p1_f32 (kill)%31, %26:m0 attr0.x | |
| v1: %10 = v_interp_p2_f32 (kill)%32, (kill)%26:m0, (kill)%48 attr0.x | |
| v1: %50 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d | |
| v1: %49 = p_wqm (kill)%50 | |
| s1: %67 = s_ff1_i32_b64 %0:exec | |
| s2: %51, s1: (kill)%68:scc = s_lshl_b64 1, (kill)%67 | |
| s2: %13 = p_wqm (kill)%51 | |
| p_logical_end | |
| s2: %0:exec = p_parallelcopy (kill)%65 | |
| s2: %70, s1: (kill)%69:scc, s2: %0:exec = s_and_saveexec_b64 (kill)%13, %0:exec | |
| s2: (kill)%71 = p_cbranch_z %0:exec BB2, BB1 | |
| BB1 | |
| /* logical preds: BB0, / linear preds: BB0, / kind: uniform, */ | |
| p_logical_start | |
| s4: %15 = s_load_dwordx4 (kill)%4, 0x50 | |
| v1: %57 = p_parallelcopy 0x3e8 | |
| v1: %58 = p_parallelcopy 0 | |
| buffer_store_format_x (kill)%15, (kill)%58, 0, (kill)%57 idxen disable_wqm storage:image | |
| p_logical_end | |
| s2: (kill)%59 = p_branch BB3 | |
| BB2 | |
| /* logical preds: / linear preds: BB0, / kind: uniform, */ | |
| s2: (kill)%60 = p_branch BB3 | |
| BB3 | |
| /* logical preds: / linear preds: BB1, BB2, / kind: invert, */ | |
| s2: %0:exec, s1: (kill)%72:scc = s_andn2_b64 %70, %0:exec | |
| s2: (kill)%73 = p_cbranch_z %0:exec BB5, BB4 | |
| BB4 | |
| /* logical preds: BB0, / linear preds: BB3, / kind: uniform, */ | |
| p_logical_start | |
| p_logical_end | |
| s2: (kill)%62 = p_branch BB6 | |
| BB5 | |
| /* logical preds: / linear preds: BB3, / kind: uniform, */ | |
| s2: (kill)%63 = p_branch BB6 | |
| BB6 | |
| /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, export_end, */ | |
| s2: %0:exec = p_parallelcopy (kill)%70 | |
| p_logical_start | |
| v1: %21 = v_cvt_pkrtz_f16_f32 (kill)%49, 0 | |
| exp (kill)%21, v1: undef, v1: undef, v1: undef en:rg** compr mrt0 | |
| p_logical_end | |
| s_endpgm | |
| Representation: Assembly (Final Assembly) | |
| BB0: | |
| s_mov_b64 s[0:1], exec ; be80047e | |
| s_wqm_b64 exec, s[0:1] ; befe0a00 | |
| s_mov_b32 s4, s3 ; be840303 | |
| s_movk_i32 s3, 0x8000 ; b0038000 | |
| s_clause 0x1 ; bfa10001 | |
| s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000 | |
| s_load_dwordx4 s[16:19], s[2:3], 0x40 ; f4080401 fa000040 | |
| s_mov_b32 m0, s4 ; befc0304 | |
| v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100 | |
| v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101 | |
| v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000 | |
| v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001 | |
| s_waitcnt lgkmcnt(0) ; bf8cc07f | |
| image_sample v0, [v0, v2], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00820000 00000002 | |
| s_ff1_i32_b64 s4, exec ; be84147e | |
| s_lshl_b64 s[4:5], 1, s4 ; 8f840481 | |
| s_mov_b64 exec, s[0:1] ; befe0400 | |
| s_and_saveexec_b64 s[0:1], s[4:5] ; be802404 | |
| s_cbranch_execz BB6 ; bf880008 | |
| BB1: | |
| s_load_dwordx4 s[4:7], s[2:3], 0x50 ; f4080101 fa000050 | |
| v_mov_b32_e32 v1, 0x3e8 ; 7e0202ff 000003e8 | |
| v_mov_b32_e32 v2, 0 ; 7e040280 | |
| s_waitcnt lgkmcnt(0) ; bf8cc07f | |
| buffer_store_format_x v1, v2, s[4:7], 0 idxen ; e0102000 80010102 | |
| BB6: | |
| s_mov_b64 exec, s[0:1] ; befe0400 | |
| s_waitcnt vmcnt(0) ; bf8c3f70 | |
| v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100 | |
| exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000 | |
| s_endpgm ; bf810000 | |
| Driver pipeline hash (Driver pipeline hash used by RGP): 10877422663015167528 | |
| SGPRs (Number of SGPR registers allocated per subgroup): 128 | |
| VGPRs (Number of VGPR registers allocated per subgroup): 8 | |
| Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0 | |
| Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0 | |
| Code size (Code size in bytes): 152 | |
| LDS size (LDS size in bytes per workgroup): 0 | |
| Scratch size (Private memory in bytes per subgroup): 0 | |
| Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32 | |
| Hash (CRC32 hash of code and constant data): 1449320740 | |
| Instructions (Instruction count): 29 | |
| Copies (Copy instructions created for pseudo-instructions): 8 | |
| Branches (Branch instructions): 1 | |
| Latency (Issue cycles plus stall cycles): 413 | |
| Inverse Throughput (Estimated busy cycles to execute one wave): 26 | |
| VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 2 | |
| SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 2 | |
| Pre-Sched SGPRs (SGPR usage before scheduling): 17 | |
| Pre-Sched VGPRs (VGPR usage before scheduling): 3 | |
| == Without helper lanes == | |
| Representation: NIR Shader(s) (The optimized NIR shader(s)) | |
| shader: MESA_SHADER_FRAGMENT | |
| source_sha1: {0x3293d9f2, 0x7f34506e, 0x445f4c15, 0x918f9d3e, 0x589000fa} | |
| stage: 4 | |
| next_stage: 0 | |
| num_textures: 2 | |
| num_images: 1 | |
| inputs_read: 32 | |
| outputs_written: 4 | |
| system_values_read: 0x00000000'00000000'08000000 | |
| subgroup_size: 0 | |
| uses_wide_subgroup_intrinsics: true | |
| divergence_analysis_run: true | |
| bit_sizes_int: 0x20 | |
| separate_shader: true | |
| writes_memory: true | |
| needs_quad_helper_invocations: true | |
| needs_all_helper_invocations: true | |
| origin_upper_left: true | |
| inputs: 1 | |
| outputs: 0 | |
| uniforms: 0 | |
| decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0) | |
| decl_var image INTERP_MODE_NONE restrict writeonly r32_uint uimageBuffer @1 (~0, 0, 2) | |
| decl_var uniform INTERP_MODE_NONE restrict sampler @2 (~0, 0, 1) | |
| decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD@3 (VARYING_SLOT_VAR0.y, 0, 0) | |
| decl_function main (0 params) | |
| impl main { | |
| block block_0: | |
| /* preds: */ | |
| vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0) | |
| vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0) | |
| vec1 32 con ssa_2 = load_const (0xffff8000 = -nan) | |
| vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2 | |
| vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000) | |
| vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0) | |
| vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000) | |
| vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0) | |
| vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec2 32 div ssa_10 = vec2 ssa_9, ssa_8 | |
| vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler) | |
| vec1 1 div ssa_12 = intrinsic load_helper_invocation () () | |
| /* succs: block_1 block_2 */ | |
| if ssa_12 { | |
| block block_1: | |
| /* preds: block_0 */ | |
| vec1 1 con ssa_13 = load_const (false) | |
| /* succs: block_3 */ | |
| } else { | |
| block block_2: | |
| /* preds: block_0 */ | |
| vec1 1 div ssa_14 = intrinsic elect () () | |
| /* succs: block_3 */ | |
| } | |
| block block_3: | |
| /* preds: block_1 block_2 */ | |
| vec1 1 div ssa_15 = phi block_1: ssa_13, block_2: ssa_14 | |
| /* succs: block_4 block_5 */ | |
| if ssa_15 { | |
| block block_4: | |
| /* preds: block_3 */ | |
| vec1 32 con ssa_16 = load_const (0x00000050 = 0.000000) | |
| vec4 32 con ssa_17 = intrinsic load_smem_amd (ssa_3, ssa_16) (align_mul=16, align_offset=0) | |
| vec1 32 con ssa_18 = undefined | |
| vec4 32 con ssa_19 = vec4 ssa_4, ssa_18, ssa_18, ssa_18 | |
| vec1 32 con ssa_20 = undefined | |
| vec1 32 con ssa_21 = load_const (0x000003e8 = 0.000000) | |
| intrinsic bindless_image_store (ssa_17, ssa_19, ssa_20, ssa_21, ssa_4) (image_dim=Buf, image_array=false, format=r32_uint, access=10, src_type=uint32) | |
| /* succs: block_6 */ | |
| } else { | |
| block block_5: | |
| /* preds: block_3 */ | |
| /* succs: block_6 */ | |
| } | |
| block block_6: | |
| /* preds: block_4 block_5 */ | |
| vec1 32 con ssa_22 = undefined | |
| vec1 32 div ssa_23 = pack_half_2x16_split ssa_11.x, ssa_22 | |
| vec1 32 con ssa_24 = undefined | |
| vec4 32 div ssa_25 = vec4 ssa_23, ssa_24, ssa_24, ssa_24 | |
| intrinsic export_amd (ssa_25) (base=0, wrmask=xy, flags=7) | |
| /* succs: block_7 */ | |
| block block_7: | |
| } | |
| Representation: ACO IR (The ACO IR after some optimizations) | |
| After Spilling: | |
| ACO shader stage: fragment_fs | |
| BB0 | |
| /* logical preds: / linear preds: / kind: top-level, branch, needs_lowering, */ | |
| s2: %27:s[0-1], s1: %28:s[2], s1: %29:s[3], s1: %30:s[4], v2: %31:v[0-1] = p_startpgm | |
| s2: (kill)%33, s1: (kill)%32:scc = p_init_scratch (kill)%27, (latekill)(kill)%30 | |
| s2: %77 = p_parallelcopy %0:exec | |
| s2: %0:exec, s1: (kill)%78:scc = s_wqm_b64 %77 | |
| p_logical_start | |
| v1: %34, v1: %35 = p_split_vector (kill)%31 | |
| s2: %4 = p_create_vector (kill)%28, 0xffff8000 | |
| s8: %6 = s_load_dwordx8 %4, 0 | |
| s4: %8 = s_load_dwordx4 %4, 64 | |
| v1: %50 = v_interp_p1_f32 %34, %29:m0 attr0.y | |
| v1: %9 = v_interp_p2_f32 %35, %29:m0, (kill)%50 attr0.y | |
| v1: %51 = v_interp_p1_f32 (kill)%34, %29:m0 attr0.x | |
| v1: %10 = v_interp_p2_f32 (kill)%35, (kill)%29:m0, (kill)%51 attr0.x | |
| v1: %53 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d | |
| v1: %52 = p_wqm (kill)%53 | |
| s2: %13, s1: (kill)%79:scc = s_andn2_b64 %0:exec, %77 | |
| p_logical_end | |
| s2: %81, s1: (kill)%80:scc, s2: %0:exec = s_and_saveexec_b64 (kill)%13, %0:exec | |
| s2: (kill)%82 = p_cbranch_z %0:exec BB2, BB1 | |
| BB1 | |
| /* logical preds: BB0, / linear preds: BB0, / kind: uniform, */ | |
| p_logical_start | |
| p_logical_end | |
| s2: (kill)%55 = p_branch BB3 | |
| BB2 | |
| /* logical preds: / linear preds: BB0, / kind: uniform, */ | |
| s2: (kill)%56 = p_branch BB3 | |
| BB3 | |
| /* logical preds: / linear preds: BB1, BB2, / kind: invert, */ | |
| s2: %75 = p_linear_phi 0, s2: undef | |
| s2: %0:exec, s1: (kill)%83:scc = s_andn2_b64 (kill)%81, %0:exec | |
| s2: (kill)%84 = p_cbranch_z %0:exec BB5, BB4 | |
| BB4 | |
| /* logical preds: BB0, / linear preds: BB3, / kind: uniform, needs_lowering, */ | |
| p_logical_start | |
| s1: %85 = s_ff1_i32_b64 %0:exec | |
| s2: %58, s1: (kill)%86:scc = s_lshl_b64 1, (kill)%85 | |
| s2: %15 = p_wqm (kill)%58 | |
| s2: %74, s1: (kill)%76:scc = s_and_b64 (kill)%15, %0:exec | |
| p_logical_end | |
| s2: (kill)%59 = p_branch BB6 | |
| BB5 | |
| /* logical preds: / linear preds: BB3, / kind: uniform, */ | |
| s2: (kill)%60 = p_branch BB6 | |
| BB6 | |
| /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: top-level, branch, merge, */ | |
| s2: %16 = p_linear_phi (kill)%74, (kill)%75 | |
| s2: %0:exec = p_parallelcopy (kill)%77 | |
| p_logical_start | |
| p_logical_end | |
| s2: %88, s1: (kill)%87:scc, s2: %0:exec = s_and_saveexec_b64 (kill)%16, %0:exec | |
| s2: (kill)%89 = p_cbranch_z %0:exec BB8, BB7 | |
| BB7 | |
| /* logical preds: BB6, / linear preds: BB6, / kind: uniform, */ | |
| p_logical_start | |
| s4: %18 = s_load_dwordx4 (kill)%4, 0x50 | |
| v1: %66 = p_parallelcopy 0x3e8 | |
| v1: %67 = p_parallelcopy 0 | |
| buffer_store_format_x (kill)%18, (kill)%67, 0, (kill)%66 idxen disable_wqm storage:image | |
| p_logical_end | |
| s2: (kill)%68 = p_branch BB9 | |
| BB8 | |
| /* logical preds: / linear preds: BB6, / kind: uniform, */ | |
| s2: (kill)%69 = p_branch BB9 | |
| BB9 | |
| /* logical preds: / linear preds: BB7, BB8, / kind: invert, */ | |
| s2: %0:exec, s1: (kill)%90:scc = s_andn2_b64 %88, %0:exec | |
| s2: (kill)%91 = p_cbranch_z %0:exec BB11, BB10 | |
| BB10 | |
| /* logical preds: BB6, / linear preds: BB9, / kind: uniform, */ | |
| p_logical_start | |
| p_logical_end | |
| s2: (kill)%71 = p_branch BB12 | |
| BB11 | |
| /* logical preds: / linear preds: BB9, / kind: uniform, */ | |
| s2: (kill)%72 = p_branch BB12 | |
| BB12 | |
| /* logical preds: BB7, BB10, / linear preds: BB10, BB11, / kind: uniform, top-level, merge, export_end, */ | |
| s2: %0:exec = p_parallelcopy (kill)%88 | |
| p_logical_start | |
| v1: %24 = v_cvt_pkrtz_f16_f32 (kill)%52, 0 | |
| exp (kill)%24, v1: undef, v1: undef, v1: undef en:rg** compr mrt0 | |
| p_logical_end | |
| s_endpgm | |
| Representation: Assembly (Final Assembly) | |
| BB0: | |
| s_mov_b64 s[0:1], exec ; be80047e | |
| s_wqm_b64 exec, s[0:1] ; befe0a00 | |
| s_mov_b32 s4, s3 ; be840303 | |
| s_movk_i32 s3, 0x8000 ; b0038000 | |
| s_clause 0x1 ; bfa10001 | |
| s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000 | |
| s_load_dwordx4 s[16:19], s[2:3], 0x40 ; f4080401 fa000040 | |
| s_mov_b32 m0, s4 ; befc0304 | |
| v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100 | |
| v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101 | |
| v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000 | |
| v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001 | |
| s_waitcnt lgkmcnt(0) ; bf8cc07f | |
| image_sample v0, [v0, v2], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00820000 00000002 | |
| s_andn2_b64 s[4:5], exec, s[0:1] ; 8a84007e | |
| s_and_saveexec_b64 s[4:5], s[4:5] ; be842404 | |
| BB1: | |
| s_mov_b64 s[6:7], 0 ; be860480 | |
| BB3: | |
| s_andn2_b64 exec, s[4:5], exec ; 8afe7e04 | |
| BB4: | |
| s_ff1_i32_b64 s4, exec ; be84147e | |
| s_lshl_b64 s[4:5], 1, s4 ; 8f840481 | |
| s_and_b64 s[6:7], s[4:5], exec ; 87867e04 | |
| BB6: | |
| s_mov_b64 exec, s[0:1] ; befe0400 | |
| s_and_saveexec_b64 s[0:1], s[6:7] ; be802406 | |
| s_cbranch_execz BB12 ; bf880008 | |
| BB7: | |
| s_load_dwordx4 s[4:7], s[2:3], 0x50 ; f4080101 fa000050 | |
| v_mov_b32_e32 v1, 0x3e8 ; 7e0202ff 000003e8 | |
| v_mov_b32_e32 v2, 0 ; 7e040280 | |
| s_waitcnt lgkmcnt(0) ; bf8cc07f | |
| buffer_store_format_x v1, v2, s[4:7], 0 idxen ; e0102000 80010102 | |
| BB12: | |
| s_mov_b64 exec, s[0:1] ; befe0400 | |
| s_waitcnt vmcnt(0) ; bf8c3f70 | |
| v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100 | |
| exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000 | |
| s_endpgm ; bf810000 | |
| Driver pipeline hash (Driver pipeline hash used by RGP): 10816365857869393798 | |
| SGPRs (Number of SGPR registers allocated per subgroup): 128 | |
| VGPRs (Number of VGPR registers allocated per subgroup): 8 | |
| Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0 | |
| Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0 | |
| Code size (Code size in bytes): 172 | |
| LDS size (LDS size in bytes per workgroup): 0 | |
| Scratch size (Private memory in bytes per subgroup): 0 | |
| Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32 | |
| Hash (CRC32 hash of code and constant data): 3977462287 | |
| Instructions (Instruction count): 34 | |
| Copies (Copy instructions created for pseudo-instructions): 9 | |
| Branches (Branch instructions): 1 | |
| Latency (Issue cycles plus stall cycles): 418 | |
| Inverse Throughput (Estimated busy cycles to execute one wave): 26 | |
| VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 2 | |
| SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 2 | |
| Pre-Sched SGPRs (SGPR usage before scheduling): 17 | |
| Pre-Sched VGPRs (VGPR usage before scheduling): 3 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Building WavePrefixProduct | |
| == With helper lanes == | |
| Representation: NIR Shader(s) (The optimized NIR shader(s)) | |
| shader: MESA_SHADER_FRAGMENT | |
| source_sha1: {0x0baabb81, 0x192bff95, 0x98e9ff9b, 0x32e96625, 0x18f79b31} | |
| stage: 4 | |
| next_stage: 0 | |
| num_textures: 2 | |
| inputs_read: 32 | |
| outputs_written: 4 | |
| subgroup_size: 0 | |
| uses_wide_subgroup_intrinsics: true | |
| divergence_analysis_run: true | |
| bit_sizes_float: 0x20 | |
| bit_sizes_int: 0x20 | |
| separate_shader: true | |
| needs_quad_helper_invocations: true | |
| needs_all_helper_invocations: true | |
| origin_upper_left: true | |
| inputs: 1 | |
| outputs: 0 | |
| uniforms: 0 | |
| decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0) | |
| decl_var uniform INTERP_MODE_NONE restrict sampler @1 (~0, 0, 1) | |
| decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD@2 (VARYING_SLOT_VAR0.y, 0, 0) | |
| decl_function main (0 params) | |
| impl main { | |
| block block_0: | |
| /* preds: */ | |
| vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0) | |
| vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0) | |
| vec1 32 con ssa_2 = load_const (0xffff8000 = -nan) | |
| vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2 | |
| vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000) | |
| vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0) | |
| vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000) | |
| vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0) | |
| vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec2 32 div ssa_10 = vec2 ssa_9, ssa_8 | |
| vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler) | |
| vec1 32 div ssa_12 = mov ssa_11.x | |
| vec1 32 div ssa_13 = intrinsic exclusive_scan (ssa_12) (reduction_op=fmul) | |
| vec1 32 div ssa_14 = fadd ssa_11.x, ssa_13 | |
| vec1 32 con ssa_15 = undefined | |
| vec1 32 div ssa_16 = pack_half_2x16_split ssa_14, ssa_15 | |
| vec1 32 con ssa_17 = undefined | |
| vec4 32 div ssa_18 = vec4 ssa_16, ssa_17, ssa_17, ssa_17 | |
| intrinsic export_amd (ssa_18) (base=0, wrmask=xy, flags=7) | |
| /* succs: block_1 */ | |
| block block_1: | |
| } | |
| Representation: ACO IR (The ACO IR after some optimizations) | |
| After Spilling: | |
| ACO shader stage: fragment_fs | |
| BB0 | |
| /* logical preds: / linear preds: / kind: uniform, top-level, export_end, */ | |
| s2: %20:s[0-1], s1: %21:s[2], s1: %22:s[3], s1: %23:s[4], v2: %24:v[0-1] = p_startpgm | |
| s2: (kill)%26, s1: (kill)%25:scc = p_init_scratch (kill)%20, (latekill)(kill)%23 | |
| s2: %0:exec, s1: (kill)%54:scc = s_wqm_b64 %0:exec | |
| p_logical_start | |
| v1: %27, v1: %28 = p_split_vector (kill)%24 | |
| s2: %4 = p_create_vector (kill)%21, 0xffff8000 | |
| s8: %6 = s_load_dwordx8 %4, 0 | |
| s4: %8 = s_load_dwordx4 (kill)%4, 64 | |
| v1: %43 = v_interp_p1_f32 %27, %22:m0 attr0.y | |
| v1: %9 = v_interp_p2_f32 %28, %22:m0, (kill)%43 attr0.y | |
| v1: %44 = v_interp_p1_f32 (kill)%27, %22:m0 attr0.x | |
| v1: %10 = v_interp_p2_f32 (kill)%28, (kill)%22:m0, (kill)%44 attr0.x | |
| v1: %46 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d | |
| v1: %45 = p_wqm (kill)%46 | |
| lv1: %52 = p_start_linear_vgpr | |
| lv1: %53 = p_start_linear_vgpr | |
| v1: %47, s2: (kill)%48, s1: (kill)%49, s1: (kill)%50:scc = p_exclusive_scan %45, (kill)%52, (kill)%53 op:fmul32 cluster_size:64 | |
| v1: %14 = p_wqm (kill)%47 | |
| v1: %15 = v_add_f32 (kill)%45, (kill)%14 | |
| v1: %17 = v_cvt_pkrtz_f16_f32 (kill)%15, 0 | |
| exp (kill)%17, v1: undef, v1: undef, v1: undef en:rg** compr mrt0 | |
| p_logical_end | |
| s_endpgm | |
| Representation: Assembly (Final Assembly) | |
| BB0: | |
| s_wqm_b64 exec, exec ; befe0a7e | |
| s_mov_b32 s0, s3 ; be800303 | |
| s_movk_i32 s3, 0x8000 ; b0038000 | |
| s_clause 0x1 ; bfa10001 | |
| s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000 | |
| s_load_dwordx4 s[4:7], s[2:3], 0x40 ; f4080101 fa000040 | |
| s_mov_b32 m0, s0 ; befc0300 | |
| v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100 | |
| v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101 | |
| v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000 | |
| v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001 | |
| s_waitcnt lgkmcnt(0) ; bf8cc07f | |
| image_sample v0, [v0, v2], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00220000 00000002 | |
| s_or_saveexec_b64 s[0:1], -1 ; be8025c1 | |
| s_waitcnt vmcnt(0) ; bf8c3f70 | |
| v_cndmask_b32_e64 v1, 1.0, v0, s[0:1] ; d5010001 000200f2 | |
| v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; 7e0402fa ff0d1101 | |
| s_mov_b32 exec_lo, 0x10000 ; befe03ff 00010000 | |
| s_mov_b32 exec_hi, 0x10000 ; beff03ff 00010000 | |
| v_permlanex16_b32 v2, v1, -1, -1 op_sel:[1,0] ; d7780802 03058301 | |
| s_mov_b64 exec, -1 ; befe04c1 | |
| v_readlane_b32 s2, v1, 31 ; d7600002 00013f01 | |
| v_writelane_b32 v2, s2, 32 ; d7610002 00014002 | |
| v_writelane_b32 v2, 1.0, 0 ; d7610002 000100f2 | |
| v_mul_f32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf fi:1 ; 100404fa ff051102 | |
| v_mul_f32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf fi:1 ; 100404fa ff051202 | |
| v_mul_f32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf fi:1 ; 100404fa ff051402 | |
| v_mul_f32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf fi:1 ; 100404fa ff051802 | |
| s_bfm_b32 exec_lo, 16, 16 ; 927e9090 | |
| s_bfm_b32 exec_hi, 16, 16 ; 927f9090 | |
| v_permlanex16_b32 v1, v2, -1, -1 op_sel:[1,0] ; d7780801 03058302 | |
| v_mul_f32_e32 v2, v2, v1 ; 10040302 | |
| s_bfm_b64 exec, 32, 32 ; 92fea0a0 | |
| v_readlane_b32 s2, v2, 31 ; d7600002 00013f02 | |
| v_mul_f32_e32 v2, s2, v2 ; 10040402 | |
| s_mov_b64 exec, s[0:1] ; befe0400 | |
| v_mov_b32_e32 v1, v2 ; 7e020302 | |
| v_add_f32_e32 v0, v0, v1 ; 06000300 | |
| v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100 | |
| exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000 | |
| s_endpgm ; bf810000 | |
| Driver pipeline hash (Driver pipeline hash used by RGP): 13476569651227853137 | |
| SGPRs (Number of SGPR registers allocated per subgroup): 128 | |
| VGPRs (Number of VGPR registers allocated per subgroup): 8 | |
| Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0 | |
| Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0 | |
| Code size (Code size in bytes): 244 | |
| LDS size (LDS size in bytes per workgroup): 0 | |
| Scratch size (Private memory in bytes per subgroup): 0 | |
| Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32 | |
| Hash (CRC32 hash of code and constant data): 4082983596 | |
| Instructions (Instruction count): 41 | |
| Copies (Copy instructions created for pseudo-instructions): 3 | |
| Branches (Branch instructions): 0 | |
| Latency (Issue cycles plus stall cycles): 476 | |
| Inverse Throughput (Estimated busy cycles to execute one wave): 42 | |
| VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 1 | |
| SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 1 | |
| Pre-Sched SGPRs (SGPR usage before scheduling): 13 | |
| Pre-Sched VGPRs (VGPR usage before scheduling): 3 | |
| == Without helper lanes == | |
| Representation: NIR Shader(s) (The optimized NIR shader(s)) | |
| shader: MESA_SHADER_FRAGMENT | |
| source_sha1: {0xa73cb651, 0x947d7cfc, 0xbb461f1d, 0xf1ea3766, 0xdd013638} | |
| stage: 4 | |
| next_stage: 0 | |
| num_textures: 2 | |
| inputs_read: 32 | |
| outputs_written: 4 | |
| system_values_read: 0x00000000'00000000'08000000 | |
| subgroup_size: 0 | |
| uses_wide_subgroup_intrinsics: true | |
| divergence_analysis_run: true | |
| bit_sizes_float: 0x20 | |
| bit_sizes_int: 0x21 | |
| separate_shader: true | |
| needs_quad_helper_invocations: true | |
| needs_all_helper_invocations: true | |
| origin_upper_left: true | |
| inputs: 1 | |
| outputs: 0 | |
| uniforms: 0 | |
| decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0) | |
| decl_var uniform INTERP_MODE_NONE restrict sampler @1 (~0, 0, 1) | |
| decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD@2 (VARYING_SLOT_VAR0.y, 0, 0) | |
| decl_function main (0 params) | |
| impl main { | |
| block block_0: | |
| /* preds: */ | |
| vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0) | |
| vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0) | |
| vec1 32 con ssa_2 = load_const (0xffff8000 = -nan) | |
| vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2 | |
| vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000) | |
| vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0) | |
| vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000) | |
| vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0) | |
| vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec2 32 div ssa_10 = vec2 ssa_9, ssa_8 | |
| vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler) | |
| vec1 1 div ssa_12 = intrinsic load_helper_invocation () () | |
| vec1 32 con ssa_13 = load_const (0x3f800000 = 1.000000) | |
| vec1 32 div ssa_14 = bcsel ssa_12, ssa_13, ssa_11.x | |
| vec1 32 div ssa_15 = intrinsic exclusive_scan (ssa_14) (reduction_op=fmul) | |
| vec1 32 div ssa_16 = fadd ssa_11.x, ssa_15 | |
| vec1 32 con ssa_17 = undefined | |
| vec1 32 div ssa_18 = pack_half_2x16_split ssa_16, ssa_17 | |
| vec1 32 con ssa_19 = undefined | |
| vec4 32 div ssa_20 = vec4 ssa_18, ssa_19, ssa_19, ssa_19 | |
| intrinsic export_amd (ssa_20) (base=0, wrmask=xy, flags=7) | |
| /* succs: block_1 */ | |
| block block_1: | |
| } | |
| Representation: ACO IR (The ACO IR after some optimizations) | |
| After Spilling: | |
| ACO shader stage: fragment_fs | |
| BB0 | |
| /* logical preds: / linear preds: / kind: uniform, top-level, needs_lowering, export_end, */ | |
| s2: %22:s[0-1], s1: %23:s[2], s1: %24:s[3], s1: %25:s[4], v2: %26:v[0-1] = p_startpgm | |
| s2: (kill)%28, s1: (kill)%27:scc = p_init_scratch (kill)%22, (latekill)(kill)%25 | |
| s2: %57 = p_parallelcopy %0:exec | |
| s2: %0:exec, s1: (kill)%58:scc = s_wqm_b64 %57 | |
| p_logical_start | |
| v1: %29, v1: %30 = p_split_vector (kill)%26 | |
| s2: %4 = p_create_vector (kill)%23, 0xffff8000 | |
| s8: %6 = s_load_dwordx8 %4, 0 | |
| s4: %8 = s_load_dwordx4 (kill)%4, 64 | |
| v1: %45 = v_interp_p1_f32 %29, %24:m0 attr0.y | |
| v1: %9 = v_interp_p2_f32 %30, %24:m0, (kill)%45 attr0.y | |
| v1: %46 = v_interp_p1_f32 (kill)%29, %24:m0 attr0.x | |
| v1: %10 = v_interp_p2_f32 (kill)%30, (kill)%24:m0, (kill)%46 attr0.x | |
| v1: %48 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d | |
| v1: %47 = p_wqm (kill)%48 | |
| s2: %13, s1: (kill)%59:scc = s_andn2_b64 %0:exec, %57 | |
| v1: %15 = v_cndmask_b32 %47, 1.0, (kill)%13 | |
| lv1: %55 = p_start_linear_vgpr | |
| lv1: %56 = p_start_linear_vgpr | |
| v1: %50, s2: (kill)%51, s1: (kill)%52, s1: (kill)%53:scc = p_exclusive_scan (kill)%15, (kill)%55, (kill)%56 op:fmul32 cluster_size:64 | |
| v1: %16 = p_wqm (kill)%50 | |
| v1: %17 = v_add_f32 (kill)%47, (kill)%16 | |
| v1: %19 = v_cvt_pkrtz_f16_f32 (kill)%17, 0 | |
| s2: %0:exec = p_parallelcopy (kill)%57 | |
| exp (kill)%19, v1: undef, v1: undef, v1: undef en:rg** compr mrt0 | |
| p_logical_end | |
| s_endpgm | |
| Representation: Assembly (Final Assembly) | |
| BB0: | |
| s_mov_b64 s[0:1], exec ; be80047e | |
| s_wqm_b64 exec, s[0:1] ; befe0a00 | |
| s_mov_b32 s4, s3 ; be840303 | |
| s_movk_i32 s3, 0x8000 ; b0038000 | |
| s_clause 0x1 ; bfa10001 | |
| s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000 | |
| s_load_dwordx4 s[16:19], s[2:3], 0x40 ; f4080401 fa000040 | |
| s_mov_b32 m0, s4 ; befc0304 | |
| v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100 | |
| v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101 | |
| v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000 | |
| v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001 | |
| s_waitcnt lgkmcnt(0) ; bf8cc07f | |
| image_sample v0, [v0, v2], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00820000 00000002 | |
| s_andn2_b64 s[2:3], exec, s[0:1] ; 8a82007e | |
| s_waitcnt vmcnt(0) ; bf8c3f70 | |
| v_cndmask_b32_e64 v3, v0, 1.0, s[2:3] ; d5010003 0009e500 | |
| s_or_saveexec_b64 s[2:3], -1 ; be8225c1 | |
| v_cndmask_b32_e64 v1, 1.0, v3, s[2:3] ; d5010001 000a06f2 | |
| v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; 7e0402fa ff0d1101 | |
| s_mov_b32 exec_lo, 0x10000 ; befe03ff 00010000 | |
| s_mov_b32 exec_hi, 0x10000 ; beff03ff 00010000 | |
| v_permlanex16_b32 v2, v1, -1, -1 op_sel:[1,0] ; d7780802 03058301 | |
| s_mov_b64 exec, -1 ; befe04c1 | |
| v_readlane_b32 s4, v1, 31 ; d7600004 00013f01 | |
| v_writelane_b32 v2, s4, 32 ; d7610002 00014004 | |
| v_writelane_b32 v2, 1.0, 0 ; d7610002 000100f2 | |
| v_mul_f32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf fi:1 ; 100404fa ff051102 | |
| v_mul_f32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf fi:1 ; 100404fa ff051202 | |
| v_mul_f32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf fi:1 ; 100404fa ff051402 | |
| v_mul_f32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf fi:1 ; 100404fa ff051802 | |
| s_bfm_b32 exec_lo, 16, 16 ; 927e9090 | |
| s_bfm_b32 exec_hi, 16, 16 ; 927f9090 | |
| v_permlanex16_b32 v1, v2, -1, -1 op_sel:[1,0] ; d7780801 03058302 | |
| v_mul_f32_e32 v2, v2, v1 ; 10040302 | |
| s_bfm_b64 exec, 32, 32 ; 92fea0a0 | |
| v_readlane_b32 s4, v2, 31 ; d7600004 00013f02 | |
| v_mul_f32_e32 v2, s4, v2 ; 10040404 | |
| s_mov_b64 exec, s[2:3] ; befe0402 | |
| v_mov_b32_e32 v1, v2 ; 7e020302 | |
| v_add_f32_e32 v0, v0, v1 ; 06000300 | |
| v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100 | |
| s_mov_b64 exec, s[0:1] ; befe0400 | |
| exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000 | |
| s_endpgm ; bf810000 | |
| Driver pipeline hash (Driver pipeline hash used by RGP): 14392008431707320378 | |
| SGPRs (Number of SGPR registers allocated per subgroup): 128 | |
| VGPRs (Number of VGPR registers allocated per subgroup): 8 | |
| Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0 | |
| Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0 | |
| Code size (Code size in bytes): 264 | |
| LDS size (LDS size in bytes per workgroup): 0 | |
| Scratch size (Private memory in bytes per subgroup): 0 | |
| Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32 | |
| Hash (CRC32 hash of code and constant data): 2093754719 | |
| Instructions (Instruction count): 45 | |
| Copies (Copy instructions created for pseudo-instructions): 5 | |
| Branches (Branch instructions): 0 | |
| Latency (Issue cycles plus stall cycles): 484 | |
| Inverse Throughput (Estimated busy cycles to execute one wave): 44 | |
| VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 1 | |
| SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 1 | |
| Pre-Sched SGPRs (SGPR usage before scheduling): 15 | |
| Pre-Sched VGPRs (VGPR usage before scheduling): 4 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Building WaveReadLaneFirst | |
| == With helper lanes == | |
| Representation: NIR Shader(s) (The optimized NIR shader(s)) | |
| shader: MESA_SHADER_FRAGMENT | |
| source_sha1: {0x4941eebc, 0x7b751fe9, 0x24b16f74, 0x2275d816, 0xfafc5351} | |
| stage: 4 | |
| next_stage: 0 | |
| num_textures: 2 | |
| inputs_read: 32 | |
| outputs_written: 4 | |
| subgroup_size: 0 | |
| uses_wide_subgroup_intrinsics: true | |
| divergence_analysis_run: true | |
| bit_sizes_float: 0x20 | |
| bit_sizes_int: 0x20 | |
| separate_shader: true | |
| needs_quad_helper_invocations: true | |
| needs_all_helper_invocations: true | |
| origin_upper_left: true | |
| inputs: 1 | |
| outputs: 0 | |
| uniforms: 0 | |
| decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0) | |
| decl_var uniform INTERP_MODE_NONE restrict sampler @1 (~0, 0, 1) | |
| decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD@2 (VARYING_SLOT_VAR0.y, 0, 0) | |
| decl_function main (0 params) | |
| impl main { | |
| block block_0: | |
| /* preds: */ | |
| vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0) | |
| vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0) | |
| vec1 32 con ssa_2 = load_const (0xffff8000 = -nan) | |
| vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2 | |
| vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000) | |
| vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0) | |
| vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000) | |
| vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0) | |
| vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec2 32 div ssa_10 = vec2 ssa_9, ssa_8 | |
| vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler) | |
| vec1 32 div ssa_12 = mov ssa_11.x | |
| vec1 32 con ssa_13 = intrinsic read_first_invocation (ssa_12) () | |
| vec1 32 div ssa_14 = fadd ssa_11.x, ssa_13 | |
| vec1 32 con ssa_15 = undefined | |
| vec1 32 div ssa_16 = pack_half_2x16_split ssa_14, ssa_15 | |
| vec1 32 con ssa_17 = undefined | |
| vec4 32 div ssa_18 = vec4 ssa_16, ssa_17, ssa_17, ssa_17 | |
| intrinsic export_amd (ssa_18) (base=0, wrmask=xy, flags=7) | |
| /* succs: block_1 */ | |
| block block_1: | |
| } | |
| Representation: ACO IR (The ACO IR after some optimizations) | |
| After Spilling: | |
| ACO shader stage: fragment_fs | |
| BB0 | |
| /* logical preds: / linear preds: / kind: uniform, top-level, export_end, */ | |
| s2: %20:s[0-1], s1: %21:s[2], s1: %22:s[3], s1: %23:s[4], v2: %24:v[0-1] = p_startpgm | |
| s2: (kill)%26, s1: (kill)%25:scc = p_init_scratch (kill)%20, (latekill)(kill)%23 | |
| s2: %0:exec, s1: (kill)%49:scc = s_wqm_b64 %0:exec | |
| p_logical_start | |
| v1: %27, v1: %28 = p_split_vector (kill)%24 | |
| s2: %4 = p_create_vector (kill)%21, 0xffff8000 | |
| s8: %6 = s_load_dwordx8 %4, 0 | |
| s4: %8 = s_load_dwordx4 (kill)%4, 64 | |
| v1: %43 = v_interp_p1_f32 %27, %22:m0 attr0.y | |
| v1: %9 = v_interp_p2_f32 %28, %22:m0, (kill)%43 attr0.y | |
| v1: %44 = v_interp_p1_f32 (kill)%27, %22:m0 attr0.x | |
| v1: %10 = v_interp_p2_f32 (kill)%28, (kill)%22:m0, (kill)%44 attr0.x | |
| v1: %46 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d | |
| v1: %45 = p_wqm (kill)%46 | |
| s1: %47 = v_readfirstlane_b32 %45 | |
| s1: %14 = p_wqm (kill)%47 | |
| v1: %15 = v_add_f32 (kill)%14, (kill)%45 | |
| v1: %17 = v_cvt_pkrtz_f16_f32 (kill)%15, 0 | |
| exp (kill)%17, v1: undef, v1: undef, v1: undef en:rg** compr mrt0 | |
| p_logical_end | |
| s_endpgm | |
| Representation: Assembly (Final Assembly) | |
| BB0: | |
| s_wqm_b64 exec, exec ; befe0a7e | |
| s_mov_b32 s0, s3 ; be800303 | |
| s_movk_i32 s3, 0x8000 ; b0038000 | |
| s_clause 0x1 ; bfa10001 | |
| s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000 | |
| s_load_dwordx4 s[4:7], s[2:3], 0x40 ; f4080101 fa000040 | |
| s_mov_b32 m0, s0 ; befc0300 | |
| v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100 | |
| v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101 | |
| v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000 | |
| v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001 | |
| s_waitcnt lgkmcnt(0) ; bf8cc07f | |
| image_sample v0, [v0, v2], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00220000 00000002 | |
| s_waitcnt vmcnt(0) ; bf8c3f70 | |
| v_readfirstlane_b32 s0, v0 ; 7e000500 | |
| v_add_f32_e32 v0, s0, v0 ; 06000000 | |
| v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100 | |
| exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000 | |
| s_endpgm ; bf810000 | |
| Driver pipeline hash (Driver pipeline hash used by RGP): 9810138321964705697 | |
| SGPRs (Number of SGPR registers allocated per subgroup): 128 | |
| VGPRs (Number of VGPR registers allocated per subgroup): 8 | |
| Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0 | |
| Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0 | |
| Code size (Code size in bytes): 100 | |
| LDS size (LDS size in bytes per workgroup): 0 | |
| Scratch size (Private memory in bytes per subgroup): 0 | |
| Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32 | |
| Hash (CRC32 hash of code and constant data): 1513780926 | |
| Instructions (Instruction count): 19 | |
| Copies (Copy instructions created for pseudo-instructions): 3 | |
| Branches (Branch instructions): 0 | |
| Latency (Issue cycles plus stall cycles): 397 | |
| Inverse Throughput (Estimated busy cycles to execute one wave): 25 | |
| VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 1 | |
| SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 1 | |
| Pre-Sched SGPRs (SGPR usage before scheduling): 13 | |
| Pre-Sched VGPRs (VGPR usage before scheduling): 3 | |
| == Without helper lanes == | |
| Representation: NIR Shader(s) (The optimized NIR shader(s)) | |
| shader: MESA_SHADER_FRAGMENT | |
| source_sha1: {0x7549a71c, 0x220294d6, 0xcdfed24c, 0x4c8bf6ea, 0x99e0b609} | |
| stage: 4 | |
| next_stage: 0 | |
| num_textures: 2 | |
| inputs_read: 32 | |
| outputs_written: 4 | |
| system_values_read: 0x00000000'00000000'08000000 | |
| subgroup_size: 0 | |
| uses_wide_subgroup_intrinsics: true | |
| divergence_analysis_run: true | |
| bit_sizes_float: 0x20 | |
| bit_sizes_int: 0x20 | |
| separate_shader: true | |
| needs_quad_helper_invocations: true | |
| needs_all_helper_invocations: true | |
| origin_upper_left: true | |
| inputs: 1 | |
| outputs: 0 | |
| uniforms: 0 | |
| decl_var uniform INTERP_MODE_NONE restrict texture2D @0 (~0, 0, 0) | |
| decl_var uniform INTERP_MODE_NONE restrict sampler @1 (~0, 0, 1) | |
| decl_var shader_out INTERP_MODE_NONE float SV_Target (FRAG_RESULT_DATA0.x, 4, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD (VARYING_SLOT_VAR0.x, 0, 0) | |
| decl_var shader_in INTERP_MODE_NONE float TEXCOORD@2 (VARYING_SLOT_VAR0.y, 0, 0) | |
| decl_function main (0 params) | |
| impl main { | |
| block block_0: | |
| /* preds: */ | |
| vec2 32 div ssa_0 = intrinsic load_barycentric_pixel () (interp_mode=0) | |
| vec1 32 con ssa_1 = intrinsic load_scalar_arg_amd () (base=1, arg_upper_bound_u32_amd=0) | |
| vec1 32 con ssa_2 = load_const (0xffff8000 = -nan) | |
| vec1 64 con ssa_3 = pack_64_2x32_split ssa_1, ssa_2 | |
| vec1 32 con ssa_4 = load_const (0x00000000 = 0.000000) | |
| vec8 32 con ssa_5 = intrinsic load_smem_amd (ssa_3, ssa_4) (align_mul=32, align_offset=0) | |
| vec1 32 con ssa_6 = load_const (0x00000040 = 0.000000) | |
| vec4 32 con ssa_7 = intrinsic load_smem_amd (ssa_3, ssa_6) (align_mul=16, align_offset=0) | |
| vec1 32 div ssa_8 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=1, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec1 32 div ssa_9 = intrinsic load_interpolated_input (ssa_0, ssa_4) (base=0, component=0, dest_type=float32, io location=VARYING_SLOT_VAR0 slots=1) /* TEXCOORD */ | |
| vec2 32 div ssa_10 = vec2 ssa_9, ssa_8 | |
| vec4 32 div ssa_11 = (float32)tex ssa_5 (texture_handle), ssa_7 (sampler_handle), ssa_10 (coord), 0 (texture), 0 (sampler) | |
| vec1 1 div ssa_12 = intrinsic load_helper_invocation () () | |
| /* succs: block_1 block_2 */ | |
| if ssa_12 { | |
| block block_1: | |
| /* preds: block_0 */ | |
| vec1 32 con ssa_13 = undefined | |
| /* succs: block_3 */ | |
| } else { | |
| block block_2: | |
| /* preds: block_0 */ | |
| vec1 32 div ssa_14 = mov ssa_11.x | |
| vec1 32 con ssa_15 = intrinsic read_first_invocation (ssa_14) () | |
| /* succs: block_3 */ | |
| } | |
| block block_3: | |
| /* preds: block_1 block_2 */ | |
| vec1 32 con ssa_16 = phi block_1: ssa_13, block_2: ssa_15 | |
| vec1 32 div ssa_17 = fadd ssa_11.x, ssa_16 | |
| vec1 32 con ssa_18 = undefined | |
| vec1 32 div ssa_19 = pack_half_2x16_split ssa_17, ssa_18 | |
| vec1 32 con ssa_20 = undefined | |
| vec4 32 div ssa_21 = vec4 ssa_19, ssa_20, ssa_20, ssa_20 | |
| intrinsic export_amd (ssa_21) (base=0, wrmask=xy, flags=7) | |
| /* succs: block_4 */ | |
| block block_4: | |
| } | |
| Representation: ACO IR (The ACO IR after some optimizations) | |
| After Spilling: | |
| ACO shader stage: fragment_fs | |
| BB0 | |
| /* logical preds: / linear preds: / kind: top-level, branch, needs_lowering, */ | |
| s2: %23:s[0-1], s1: %24:s[2], s1: %25:s[3], s1: %26:s[4], v2: %27:v[0-1] = p_startpgm | |
| s2: (kill)%29, s1: (kill)%28:scc = p_init_scratch (kill)%23, (latekill)(kill)%26 | |
| s2: %58 = p_parallelcopy %0:exec | |
| s2: %0:exec, s1: (kill)%59:scc = s_wqm_b64 %58 | |
| p_logical_start | |
| v1: %30, v1: %31 = p_split_vector (kill)%27 | |
| s2: %4 = p_create_vector (kill)%24, 0xffff8000 | |
| s8: %6 = s_load_dwordx8 %4, 0 | |
| s4: %8 = s_load_dwordx4 (kill)%4, 64 | |
| v1: %46 = v_interp_p1_f32 %30, %25:m0 attr0.y | |
| v1: %9 = v_interp_p2_f32 %31, %25:m0, (kill)%46 attr0.y | |
| v1: %47 = v_interp_p1_f32 (kill)%30, %25:m0 attr0.x | |
| v1: %10 = v_interp_p2_f32 (kill)%31, (kill)%25:m0, (kill)%47 attr0.x | |
| v1: %49 = image_sample (kill)%6, (kill)%8, v1: undef, (kill)%10, (kill)%9 2d | |
| v1: %48 = p_wqm (kill)%49 | |
| s2: %13, s1: (kill)%60:scc = s_andn2_b64 %0:exec, %58 | |
| p_logical_end | |
| s2: %62, s1: (kill)%61:scc, s2: %0:exec = s_and_saveexec_b64 (kill)%13, %0:exec | |
| s2: (kill)%63 = p_cbranch_z %0:exec BB2, BB1 | |
| BB1 | |
| /* logical preds: BB0, / linear preds: BB0, / kind: uniform, */ | |
| p_logical_start | |
| p_logical_end | |
| s2: (kill)%51 = p_branch BB3 | |
| BB2 | |
| /* logical preds: / linear preds: BB0, / kind: uniform, */ | |
| s2: (kill)%52 = p_branch BB3 | |
| BB3 | |
| /* logical preds: / linear preds: BB1, BB2, / kind: invert, */ | |
| s2: %0:exec, s1: (kill)%64:scc = s_andn2_b64 (kill)%62, %0:exec | |
| s2: (kill)%65 = p_cbranch_z %0:exec BB5, BB4 | |
| BB4 | |
| /* logical preds: BB0, / linear preds: BB3, / kind: uniform, */ | |
| p_logical_start | |
| s1: %54 = v_readfirstlane_b32 %48 | |
| s1: %16 = p_wqm (kill)%54 | |
| p_logical_end | |
| s2: (kill)%55 = p_branch BB6 | |
| BB5 | |
| /* logical preds: / linear preds: BB3, / kind: uniform, */ | |
| s2: (kill)%56 = p_branch BB6 | |
| BB6 | |
| /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, export_end, */ | |
| s1: %17 = p_linear_phi (kill)%16, s1: undef | |
| s2: %0:exec = p_parallelcopy (kill)%58 | |
| p_logical_start | |
| v1: %18 = v_add_f32 (kill)%17, (kill)%48 | |
| v1: %20 = v_cvt_pkrtz_f16_f32 (kill)%18, 0 | |
| exp (kill)%20, v1: undef, v1: undef, v1: undef en:rg** compr mrt0 | |
| p_logical_end | |
| s_endpgm | |
| Representation: Assembly (Final Assembly) | |
| BB0: | |
| s_mov_b64 s[0:1], exec ; be80047e | |
| s_wqm_b64 exec, s[0:1] ; befe0a00 | |
| s_mov_b32 s4, s3 ; be840303 | |
| s_movk_i32 s3, 0x8000 ; b0038000 | |
| s_clause 0x1 ; bfa10001 | |
| s_load_dwordx8 s[8:15], s[2:3], null ; f40c0201 fa000000 | |
| s_load_dwordx4 s[16:19], s[2:3], 0x40 ; f4080401 fa000040 | |
| s_mov_b32 m0, s4 ; befc0304 | |
| v_interp_p1_f32_e32 v2, v0, attr0.y ; c8080100 | |
| v_interp_p2_f32_e32 v2, v1, attr0.y ; c8090101 | |
| v_interp_p1_f32_e32 v0, v0, attr0.x ; c8000000 | |
| v_interp_p2_f32_e32 v0, v1, attr0.x ; c8010001 | |
| s_waitcnt lgkmcnt(0) ; bf8cc07f | |
| image_sample v0, [v0, v2], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f080010a 00820000 00000002 | |
| s_andn2_b64 s[2:3], exec, s[0:1] ; 8a82007e | |
| s_and_saveexec_b64 s[2:3], s[2:3] ; be822402 | |
| BB3: | |
| s_andn2_b64 exec, s[2:3], exec ; 8afe7e02 | |
| BB4: | |
| s_waitcnt vmcnt(0) ; bf8c3f70 | |
| v_readfirstlane_b32 s2, v0 ; 7e040500 | |
| BB6: | |
| s_mov_b64 exec, s[0:1] ; befe0400 | |
| s_waitcnt vmcnt(0) ; bf8c3f70 | |
| v_add_f32_e32 v0, s2, v0 ; 06000002 | |
| v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; d52f0000 00010100 | |
| exp mrt0 v0, v0, off, off done compr vm ; f8001c03 80808000 | |
| s_endpgm ; bf810000 | |
| Driver pipeline hash (Driver pipeline hash used by RGP): 16198948385471629174 | |
| SGPRs (Number of SGPR registers allocated per subgroup): 128 | |
| VGPRs (Number of VGPR registers allocated per subgroup): 8 | |
| Spilled SGPRs (Number of SGPR registers spilled per subgroup): 0 | |
| Spilled VGPRs (Number of VGPR registers spilled per subgroup): 0 | |
| Code size (Code size in bytes): 124 | |
| LDS size (LDS size in bytes per workgroup): 0 | |
| Scratch size (Private memory in bytes per subgroup): 0 | |
| Subgroups per SIMD (The maximum number of subgroups in flight on a SIMD unit): 32 | |
| Hash (CRC32 hash of code and constant data): 144664052 | |
| Instructions (Instruction count): 25 | |
| Copies (Copy instructions created for pseudo-instructions): 5 | |
| Branches (Branch instructions): 0 | |
| Latency (Issue cycles plus stall cycles): 632 | |
| Inverse Throughput (Estimated busy cycles to execute one wave): 39 | |
| VMEM Clause (Number of VMEM clauses (includes 1-sized clauses)): 1 | |
| SMEM Clause (Number of SMEM clauses (includes 1-sized clauses)): 1 | |
| Pre-Sched SGPRs (SGPR usage before scheduling): 15 | |
| Pre-Sched VGPRs (VGPR usage before scheduling): 3 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment