Skip to content

Instantly share code, notes, and snippets.

@sohaibiftikhar
Created February 17, 2026 13:49
Show Gist options
  • Select an option

  • Save sohaibiftikhar/8922927e42f0e4ba3c126c42f2a96f85 to your computer and use it in GitHub Desktop.

Select an option

Save sohaibiftikhar/8922927e42f0e4ba3c126c42f2a96f85 to your computer and use it in GitHub Desktop.
Reproducer for LLVM#149706
; *** IR Dump After LoopVectorizePass on broadcast_add_fusion ***
;
; Post-vectorizer listing of @broadcast_add_fusion. The metadata nodes it
; references (!3, !4, !5, !6, !9, !11, !13) are not defined inside this dump.
;
; What the visible code does:
;   * reads a pointer stored 24 bytes into the struct at %0, then three buffer
;     pointers stored 0, 16 and 32 bytes into that second struct (%4, %6, %8);
;   * loads one scalar i64 from %6 and splats it across a <4 x i64>;
;   * executes vector.body exactly once (it branches straight to middle.block,
;     there is no back-edge): one <4 x i64> load from %4, one vector add, one
;     <4 x i64> store to %8, i.e. 4 x i64 = 32 bytes read and written;
;   * returns null.
;
; The wide load and store in vector.body carry no !invariant.load,
; !alias.scope or !noalias metadata.
;
; NOTE(review): this gist also contains a second dump with the same header
; whose vector body covers 16 x i64, and a scalar source loop nest
; (@broadcast_add_fusion_wrapped) that visits 4 * 4 = 16 elements. If all
; three describe the same kernel, this listing only updates the first 4
; elements -- presumably the miscompile being reported. Confirm which
; compiler build produced which dump.
; Function Attrs: nofree norecurse nosync nounwind memory(readwrite, target_mem0: none, target_mem1: none) uwtable
define noalias noundef ptr @broadcast_add_fusion(ptr readonly captures(none) %0) local_unnamed_addr #0 {
; Implicit entry block (%1): chase the pointers to the three buffers.
%2 = getelementptr inbounds nuw i8, ptr %0, i64 24
%3 = load ptr, ptr %2, align 8, !invariant.load !3
%4 = load ptr, ptr %3, align 8, !invariant.load !3, !dereferenceable !4
%5 = getelementptr inbounds nuw i8, ptr %3, i64 16
%6 = load ptr, ptr %5, align 8, !invariant.load !3, !dereferenceable !5
%7 = getelementptr inbounds nuw i8, ptr %3, i64 32
%8 = load ptr, ptr %7, align 8, !invariant.load !3, !dereferenceable !4
tail call void @llvm.experimental.noalias.scope.decl(metadata !6)
tail call void @llvm.experimental.noalias.scope.decl(metadata !9)
tail call void @llvm.experimental.noalias.scope.decl(metadata !11)
; Scalar addend, loaded once from the buffer at %6.
%9 = load i64, ptr %6, align 4, !invariant.load !3, !alias.scope !9, !noalias !13
br label %vector.ph
vector.ph: ; preds = %1
; Splat the scalar addend into all four lanes.
%broadcast.splatinsert = insertelement <4 x i64> poison, i64 %9, i64 0
%broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.ph
; Single straight-line pass: elements 0..3 of %4 plus the splat go to %8.
%wide.load = load <4 x i64>, ptr %4, align 4
%10 = add <4 x i64> %wide.load, %broadcast.splat
store <4 x i64> %10, ptr %8, align 4
br label %middle.block
middle.block: ; preds = %vector.body
br label %broadcast_add_fusion_wrapped.exit
broadcast_add_fusion_wrapped.exit: ; preds = %middle.block
ret ptr null
}
; *** IR Dump After LoopVectorizePass on broadcast_add_fusion ***
;
; Post-vectorizer listing of @broadcast_add_fusion. The metadata nodes it
; references (!3, !4, !5, !6, !9, !11, !13, !14, !15) are not defined inside
; this dump.
;
; What the visible code does:
;   * reads a pointer stored 24 bytes into the struct at %0, then three buffer
;     pointers stored 0, 16 and 32 bytes into that second struct (%4, %6, %8);
;   * loads one scalar i64 from %6 and splats it across a <4 x i64>;
;   * executes vector.body exactly once (no back-edge): one <16 x i64> load
;     from %4, de-interleaved into four stride-4 <4 x i64> vectors, each added
;     to the splat, concatenated, re-interleaved back into the original
;     element order and stored as one <16 x i64> to %8;
;   * returns null.
;
; Net effect of vector.body: out[k] = in[k] + addend for k = 0..15.
;
; NOTE(review): another dump in this gist carries the identical header but
; only covers 4 x i64. Presumably the two were produced by different compiler
; builds; confirm which is which.
; Function Attrs: nofree norecurse nosync nounwind memory(readwrite, target_mem0: none, target_mem1: none) uwtable
define noalias noundef ptr @broadcast_add_fusion(ptr readonly captures(none) %0) local_unnamed_addr #0 {
; Implicit entry block (%1): chase the pointers to the three buffers.
%2 = getelementptr inbounds nuw i8, ptr %0, i64 24
%3 = load ptr, ptr %2, align 8, !invariant.load !3
%4 = load ptr, ptr %3, align 8, !invariant.load !3, !dereferenceable !4
%5 = getelementptr inbounds nuw i8, ptr %3, i64 16
%6 = load ptr, ptr %5, align 8, !invariant.load !3, !dereferenceable !5
%7 = getelementptr inbounds nuw i8, ptr %3, i64 32
%8 = load ptr, ptr %7, align 8, !invariant.load !3, !dereferenceable !4
tail call void @llvm.experimental.noalias.scope.decl(metadata !6)
tail call void @llvm.experimental.noalias.scope.decl(metadata !9)
tail call void @llvm.experimental.noalias.scope.decl(metadata !11)
; Scalar addend, loaded once from the buffer at %6.
%9 = load i64, ptr %6, align 4, !invariant.load !3, !alias.scope !9, !noalias !13
br label %vector.ph
vector.ph: ; preds = %1
; Splat the scalar addend into all four lanes.
%broadcast.splatinsert = insertelement <4 x i64> poison, i64 %9, i64 0
%broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.ph
; One wide load of all 16 input elements.
%wide.vec = load <16 x i64>, ptr %4, align 4, !invariant.load !3, !alias.scope !6, !noalias !14
; De-interleave: strided.vecN holds elements {N, N+4, N+8, N+12} (N = 0..3).
%strided.vec = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
%strided.vec2 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
%strided.vec3 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
%strided.vec4 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
; Add the broadcast addend to every strided group.
%10 = add <4 x i64> %strided.vec, %broadcast.splat
%11 = add <4 x i64> %strided.vec2, %broadcast.splat
%12 = add <4 x i64> %strided.vec3, %broadcast.splat
%13 = add <4 x i64> %strided.vec4, %broadcast.splat
; Concatenate the four results into one <16 x i64> (group-major order).
%14 = shufflevector <4 x i64> %10, <4 x i64> %11, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%15 = shufflevector <4 x i64> %12, <4 x i64> %13, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%16 = shufflevector <8 x i64> %14, <8 x i64> %15, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Re-interleave so lane k again corresponds to input element k.
%interleaved.vec = shufflevector <16 x i64> %16, <16 x i64> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
; One wide store of all 16 output elements.
store <16 x i64> %interleaved.vec, ptr %8, align 4, !alias.scope !11, !noalias !15
br label %middle.block
middle.block: ; preds = %vector.body
br label %broadcast_add_fusion_wrapped.exit
broadcast_add_fusion_wrapped.exit: ; preds = %middle.block
ret ptr null
}
; ModuleID = '__compute_module_broadcast_add_fusion_kernel_module'
; Un-optimized input module: target description plus the struct layouts that
; the kernel entry point indexes into.
source_filename = "__compute_module_broadcast_add_fusion_kernel_module"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"
; Call frame handed to the kernel. In this module only field 1 (read as a
; pointer to %kernel_dim3) and field 3 (read as a pointer to consecutive
; %XLA_CPU_KernelArg entries) are accessed; fields 0 and 2 are never touched.
%XLA_CPU_KernelCallFrame = type { ptr, ptr, i64, ptr }
; One kernel argument. Only field 0 (the ptr) is read in this module.
; NOTE(review): field 1 is presumably the buffer size -- confirm against XLA.
%XLA_CPU_KernelArg = type { ptr, i64 }
; Three i64 values; all three are loaded by @broadcast_add_fusion.
%kernel_dim3 = type { i64, i64, i64 }
; Kernel entry point: unpacks the call frame at %0 and forwards the pieces to
; @broadcast_add_fusion_wrapped.
;   %0      pointer to an %XLA_CPU_KernelCallFrame
;   returns always null
; Function Attrs: uwtable
define ptr @broadcast_add_fusion(ptr %0) #0 {
; Call-frame field 3 -> base of the %XLA_CPU_KernelArg array.
%2 = getelementptr inbounds %XLA_CPU_KernelCallFrame, ptr %0, i32 0, i32 3
%3 = load ptr, ptr %2, align 8, !invariant.load !3
; Field 0 (the buffer pointer) of args[0], args[1] and args[2].
%4 = getelementptr inbounds %XLA_CPU_KernelArg, ptr %3, i32 0, i32 0
%5 = load ptr, ptr %4, align 8, !invariant.load !3, !dereferenceable !4
%6 = getelementptr inbounds %XLA_CPU_KernelArg, ptr %3, i32 1, i32 0
%7 = load ptr, ptr %6, align 8, !invariant.load !3, !dereferenceable !5
%8 = getelementptr inbounds %XLA_CPU_KernelArg, ptr %3, i32 2, i32 0
%9 = load ptr, ptr %8, align 8, !invariant.load !3, !dereferenceable !4
; Call-frame field 1 -> pointer to a %kernel_dim3; load its three i64 fields.
%10 = getelementptr inbounds %XLA_CPU_KernelCallFrame, ptr %0, i32 0, i32 1
%11 = load ptr, ptr %10, align 8
%12 = getelementptr inbounds %kernel_dim3, ptr %11, i32 0, i32 0
%13 = load i64, ptr %12, align 4, !invariant.load !3
%14 = getelementptr inbounds %kernel_dim3, ptr %11, i32 0, i32 1
%15 = load i64, ptr %14, align 4, !invariant.load !3
%16 = getelementptr inbounds %kernel_dim3, ptr %11, i32 0, i32 2
%17 = load i64, ptr %16, align 4, !invariant.load !3
; Argument order: args[0] buffer, args[1] buffer, args[2] buffer, then the
; three %kernel_dim3 values.
call void @broadcast_add_fusion_wrapped(ptr %5, ptr %7, ptr %9, i64 %13, i64 %15, i64 %17)
ret ptr null
}
; Scalar kernel body: for every index k in 0..15, stores in[k] + addend to
; out[k], using a 4 x 4 loop nest with k = outer * 4 + inner.
;   %0          input,  noalias, align 32, dereferenceable(128), read as [16 x i64]
;   %1          addend, noalias, align 32, dereferenceable(8),   read as one i64
;   %2          output, noalias, align 32, dereferenceable(128), written as [16 x i64]
;   %3, %4, %5  not referenced anywhere in the body
; The outer loop's back-edge carries !llvm.loop !6.
; Function Attrs: alwaysinline
define internal void @broadcast_add_fusion_wrapped(ptr noalias align 32 dereferenceable(128) %0, ptr noalias align 32 dereferenceable(8) %1, ptr noalias align 32 dereferenceable(128) %2, i64 %3, i64 %4, i64 %5) #1 {
; Entry block (%6): load the loop-invariant addend once.
%7 = getelementptr inbounds [1 x i64], ptr %1, i32 0, i32 0
%8 = load i64, ptr %7, align 4, !invariant.load !3
br label %9
9: ; preds = %24, %6
; Outer loop header: %10 counts 0, 1, 2, 3 (signed compare against 4).
%10 = phi i64 [ %25, %24 ], [ 0, %6 ]
%11 = icmp slt i64 %10, 4
br i1 %11, label %12, label %26
12: ; preds = %9
; Row base index for this outer iteration: outer * 4.
%13 = mul nsw i64 %10, 4
br label %14
14: ; preds = %17, %12
; Inner loop header: %15 counts 0, 1, 2, 3 (signed compare against 4).
%15 = phi i64 [ %23, %17 ], [ 0, %12 ]
%16 = icmp slt i64 %15, 4
br i1 %16, label %17, label %24
17: ; preds = %14
; Inner loop body: out[outer*4 + inner] = in[outer*4 + inner] + addend.
%18 = add nsw i64 %13, %15
%19 = getelementptr inbounds [16 x i64], ptr %0, i32 0, i64 %18
%20 = load i64, ptr %19, align 4, !invariant.load !3
%21 = add i64 %20, %8
%22 = getelementptr inbounds [16 x i64], ptr %2, i32 0, i64 %18
store i64 %21, ptr %22, align 4
%23 = add i64 %15, 1
br label %14
24: ; preds = %14
; Outer loop latch: advance the outer counter and branch back to the header.
%25 = add i64 %10, 1
br label %9, !llvm.loop !6
26: ; preds = %9
ret void
}
; Attribute group #0 is attached to @broadcast_add_fusion, #1 to
; @broadcast_add_fusion_wrapped.
attributes #0 = { uwtable "frame-pointer"="all" "prefer-vector-width"="256" }
attributes #1 = { alwaysinline }
; Module-level named metadata.
!llvm.module.flags = !{!0, !1}
!xla_cpu_memory_region_name = !{!2}
!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = !{i32 1, !"xla_dylib_index", i64 0}
!2 = !{!"xla_cpu_emitter__loop_fusion_kernel_emitter__hlo_opcode__fusion"}
; Empty node used as the operand of every !invariant.load in this module.
!3 = !{}
; !dereferenceable byte counts: !4 (128) on the args[0] and args[2] buffer
; pointers, !5 (8) on the args[1] buffer pointer.
!4 = !{i64 128}
!5 = !{i64 8}
; Loop metadata attached to the outer-loop back-edge of
; @broadcast_add_fusion_wrapped: a self-referential distinct node whose only
; property is llvm.loop.unroll.disable.
!6 = distinct !{!6, !7}
!7 = !{!"llvm.loop.unroll.disable"}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment