git clone https://github.com/triton-lang/triton.git && cd triton
pip install -e .
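
To confirm the editable install succeeded, a quick smoke test (the printed version string is just illustrative):

```python
# Smoke test: Triton should import cleanly and report a version.
import triton
import triton.language as tl

print(triton.__version__)
```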
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
target triple = "amdgcn-amd-amdhsa"

@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16

; Function Attrs: mustprogress nofree norecurse nounwind willreturn
define amdgpu_kernel void @flip_kernel(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 !dbg !4 {
  %3 = tail call i32 @llvm.amdgcn.workitem.id.x(), !dbg !7
  %4 = shl i32 %3, 2, !dbg !8
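
The original source for flip_kernel is not shown; for orientation, a hypothetical Triton kernel along these lines lowers to AMDGPU IR of this shape (the signature, N, and BLOCK are assumptions, not the original code):

```python
import triton
import triton.language as tl

@triton.jit
def flip_kernel(in_ptr, out_ptr, N, BLOCK: tl.constexpr):
    # Each program reverses one BLOCK-wide tile: element i maps to N - 1 - i.
    pid = tl.program_id(0)
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offs < N
    x = tl.load(in_ptr + offs, mask=mask)
    tl.store(out_ptr + (N - 1 - offs), x, mask=mask)
```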
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [16, 8]}>
#shared = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1], hasLeadingOffset = false}>
#shared1 = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], hasLeadingOffset = false}>
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} {
  tt.func public @hoist_convert_above_extf_and_remat(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.pt

import torch
import time
import sys

def run(nelems, iters):
    # Check if CUDA is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tensor_a = torch.randn(nelems, dtype=torch.float32, device=device)

| """ | |
| Matrix Multiplication | |
| ===================== | |
| In this tutorial, you will write a very short high-performance FP16 matrix multiplication kernel that achieves | |
| performance on par with cuBLAS or rocBLAS. | |
| You will specifically learn about: | |
| * Block-level matrix multiplications. |
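
To make the idea concrete before the full tutorial, here is a minimal block-level matmul kernel; it is a sketch that assumes M, N, and K are divisible by the block sizes and omits the tutorial's masking, L2-friendly program re-ordering, and autotuning:

```python
import triton
import triton.language as tl

@triton.jit
def matmul_kernel(a_ptr, b_ptr, c_ptr, M, N, K,
                  stride_am, stride_ak, stride_bk, stride_bn,
                  stride_cm, stride_cn,
                  BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr):
    # Each program computes one BLOCK_M x BLOCK_N tile of C = A @ B.
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)
    a_ptrs = a_ptr + offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak
    b_ptrs = b_ptr + offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for _ in range(0, tl.cdiv(K, BLOCK_K)):
        # Accumulate one BLOCK_K-deep slab of the K dimension in fp32.
        acc += tl.dot(tl.load(a_ptrs), tl.load(b_ptrs))
        a_ptrs += BLOCK_K * stride_ak
        b_ptrs += BLOCK_K * stride_bk
    c_ptrs = c_ptr + offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn
    tl.store(c_ptrs, acc.to(tl.float16))
```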
import torch
import sys

device = torch.device('cpu')
left = torch.zeros(100, device=device, requires_grad=True)
right = torch.zeros(100, device=device, requires_grad=True)
grad = torch.zeros(100, device=device)

for _ in range(10):
    output = torch.add(left, right)

import triton
import pytest
import torch
import triton.language as tl
import numpy as np
from numpy.random import RandomState

@pytest.mark.parametrize("M, N, K, num_warps, epilogue, allow_tf32, in_dtype, out_dtype, axis",
                         [(*shape_nw, 'softmax', allow_tf32, in_dtype, out_dtype, axis)

//
// Generated by LLVM NVPTX Back-End
//
.version 8.0
.target sm_80
.address_size 64

// .globl triton__0d1d2d3d4d56d7d89d1011d1213d1415d1617d1819d2021d2223d2425d2627d2829d3031d3233d3435d3637d3839d4041d42d
.extern .shared .align 1 .b8 global_smem[];

Install
git clone https://github.com/openai/triton.git;
cd triton/python;
pip install cmake; # build time dependency
pip install -e .
pip uninstall pytorch-triton -y  # avoid clashing with the Triton wheel that ships with PyTorch

Expected result (-0.1250)
//
// Generated by LLVM NVPTX Back-End
//
.version 8.0
.target sm_80
.address_size 64

// .globl triton__0d1d2d3d
.visible .entry triton__0d1d2d3d(
    .param .u64 triton__0d1d2d3d_param_0,
    .param .u64 triton__0d1d2d3d_param_1,
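
PTX listings like this can be read off a compiled kernel handle; a sketch of how (the .asm dictionary and its keys have varied somewhat across Triton versions):

```python
import torch
import triton
import triton.language as tl

@triton.jit
def copy_kernel(src, dst, N, BLOCK: tl.constexpr):
    # Trivial kernel whose generated PTX we want to inspect.
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < N
    tl.store(dst + offs, tl.load(src + offs, mask=mask), mask=mask)

x = torch.arange(1024, device="cuda", dtype=torch.float32)
y = torch.empty_like(x)
handle = copy_kernel[(4,)](x, y, 1024, BLOCK=256)
print(handle.asm["ptx"])  # the NVPTX backend output for this compilation
```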