git clone https://github.com/triton-lang/triton.git && cd triton
pip install -e .
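
To confirm the editable install succeeded, a quick smoke test (the printed version string is just illustrative):

```python
# Smoke test: Triton should import cleanly and report a version.
import triton
import triton.language as tl

print(triton.__version__)
```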
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
target triple = "amdgcn-amd-amdhsa"

@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16

; Function Attrs: mustprogress nofree norecurse nounwind willreturn
define amdgpu_kernel void @flip_kernel(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 !dbg !4 {
  %3 = tail call i32 @llvm.amdgcn.workitem.id.x(), !dbg !7
  %4 = shl i32 %3, 2, !dbg !8
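
The original source for flip_kernel is not shown; for orientation, a hypothetical Triton kernel along these lines lowers to AMDGPU IR of this shape (the signature, N, and BLOCK are assumptions, not the original code):

```python
import triton
import triton.language as tl

@triton.jit
def flip_kernel(in_ptr, out_ptr, N, BLOCK: tl.constexpr):
    # Each program reverses one BLOCK-wide tile: element i maps to N - 1 - i.
    pid = tl.program_id(0)
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offs < N
    x = tl.load(in_ptr + offs, mask=mask)
    tl.store(out_ptr + (N - 1 - offs), x, mask=mask)
```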
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [16, 8]}>
#shared = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1], hasLeadingOffset = false}>
#shared1 = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], hasLeadingOffset = false}>
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} {
  tt.func public @hoist_convert_above_extf_and_remat(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.pt

import torch
import time
import sys

def run(nelems, iters):
    # Check if CUDA is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tensor_a = torch.randn(nelems, dtype=torch.float32, device=device)

| """ | |
| Matrix Multiplication | |
| ===================== | |
| In this tutorial, you will write a very short high-performance FP16 matrix multiplication kernel that achieves | |
| performance on par with cuBLAS or rocBLAS. | |
| You will specifically learn about: | |
| * Block-level matrix multiplications. |
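
To make the idea concrete before the full tutorial, here is a minimal block-level matmul kernel; it is a sketch that assumes M, N, and K are divisible by the block sizes and omits the tutorial's masking, L2-friendly program re-ordering, and autotuning:

```python
import triton
import triton.language as tl

@triton.jit
def matmul_kernel(a_ptr, b_ptr, c_ptr, M, N, K,
                  stride_am, stride_ak, stride_bk, stride_bn,
                  stride_cm, stride_cn,
                  BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr):
    # Each program computes one BLOCK_M x BLOCK_N tile of C = A @ B.
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)
    a_ptrs = a_ptr + offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak
    b_ptrs = b_ptr + offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for _ in range(0, tl.cdiv(K, BLOCK_K)):
        # Accumulate one BLOCK_K-deep slab of the K dimension in fp32.
        acc += tl.dot(tl.load(a_ptrs), tl.load(b_ptrs))
        a_ptrs += BLOCK_K * stride_ak
        b_ptrs += BLOCK_K * stride_bk
    c_ptrs = c_ptr + offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn
    tl.store(c_ptrs, acc.to(tl.float16))
```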
import torch
import sys

device = torch.device('cpu')
left = torch.zeros(100, device=device, requires_grad=True)
right = torch.zeros(100, device=device, requires_grad=True)
grad = torch.zeros(100, device=device)

for _ in range(10):
    output = torch.add(left, right)

import triton
import pytest
import torch
import triton.language as tl
import numpy as np
from numpy.random import RandomState

@pytest.mark.parametrize("M, N, K, num_warps, epilogue, allow_tf32, in_dtype, out_dtype, axis",
                         [(*shape_nw, 'softmax', allow_tf32, in_dtype, out_dtype, axis)

//
// Generated by LLVM NVPTX Back-End
//
.version 8.0
.target sm_80
.address_size 64

// .globl triton__0d1d2d3d4d56d7d89d1011d1213d1415d1617d1819d2021d2223d2425d2627d2829d3031d3233d3435d3637d3839d4041d42d
.extern .shared .align 1 .b8 global_smem[];

Install
git clone https://github.com/openai/triton.git;
cd triton/python;
pip install cmake; # build time dependency
pip install -e .
pip uninstall pytorch-triton -y  # avoid clashing with the Triton wheel that ships with PyTorch

Expected result (-0.1250)
//
// Generated by LLVM NVPTX Back-End
//
.version 8.0
.target sm_80
.address_size 64

// .globl triton__0d1d2d3d
.visible .entry triton__0d1d2d3d(
    .param .u64 triton__0d1d2d3d_param_0,
    .param .u64 triton__0d1d2d3d_param_1,
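
PTX listings like this can be read off a compiled kernel handle; a sketch of how (the .asm dictionary and its keys have varied somewhat across Triton versions):

```python
import torch
import triton
import triton.language as tl

@triton.jit
def copy_kernel(src, dst, N, BLOCK: tl.constexpr):
    # Trivial kernel whose generated PTX we want to inspect.
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < N
    tl.store(dst + offs, tl.load(src + offs, mask=mask), mask=mask)

x = torch.arange(1024, device="cuda", dtype=torch.float32)
y = torch.empty_like(x)
handle = copy_kernel[(4,)](x, y, 1024, BLOCK=256)
print(handle.asm["ptx"])  # the NVPTX backend output for this compilation
```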