functionstackx's GitHub Gists
import time
import torch
import tabulate
from triton.testing import do_bench
import torch.nn.functional as F

torch.manual_seed(0)
repeats = 200            # timed iterations per do_bench measurement
warmup = 30              # warmup iterations before timing starts
dtype = torch.bfloat16   # benchmark in bf16
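The preview cuts off before the measurement loop. A minimal sketch of how a do_bench-based TFLOPS measurement typically continues, assuming square bf16 matmuls and a tabulate summary (the shapes and table layout are assumptions, not the gist's actual contents):

# Hypothetical continuation: benchmark square bf16 matmuls and report TFLOPS.
results = []
for n in [1024, 2048, 4096, 8192]:
    a = torch.randn(n, n, device="cuda", dtype=dtype)
    b = torch.randn(n, n, device="cuda", dtype=dtype)
    ms = do_bench(lambda: a @ b, warmup=warmup, rep=repeats)
    tflops = 2 * n**3 / (ms * 1e-3) / 1e12   # 2*N^3 FLOPs per N x N matmul
    results.append([n, f"{ms:.3f}", f"{tflops:.1f}"])
print(tabulate.tabulate(results, headers=["N", "ms", "TFLOPS"]))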
functionstackx / tflops_iter.py
Created August 15, 2024 22:12
import time
import torch
import torch.utils.benchmark as benchmark
import matplotlib.pyplot as plt
import numpy as np
# patch of https://github.com/triton-lang/triton/pull/4493
def do_bench(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, fast_flush=True, return_mode="mean"):
"""
Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
the 20-th and 80-th performance percentile.
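The body of the patched do_bench is cut off by the preview; it follows the linked PR. A hedged usage sketch once the full definition is in place (the workload here is illustrative, not from the gist):

# Illustrative call, assuming the full patched do_bench body from the PR:
x = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)
ms = do_bench(lambda: x @ x, warmup=25, rep=100, return_mode="mean")
print(f"mean runtime: {ms:.3f} ms")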
functionstackx / upgrade-cuda.bash
Last active August 15, 2024 01:28
upgrade-cuda
# Purge every NVIDIA/CUDA/NCCL package (including held ones), then clear holds
sudo apt-get purge -y --allow-change-held-packages "*nvidia*" "*cuda*" "*nccl*" && sudo apt-get -y --allow-change-held-packages autoremove && sudo apt-mark showhold | xargs -r sudo apt-mark unhold

# Build the distro string NVIDIA's repo layout expects, e.g. "ubuntu2204"
distribution=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.//g')

# Install NVIDIA's CUDA repository keyring and refresh the package index
wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt update

# Install the 550 driver + Fabric Manager, pinned NCCL 2.22.3, and CUDA 12.4
sudo apt-get install -y cuda-drivers-fabricmanager-550 libnccl2=2.22.3-1+cuda12.4 libnccl-dev=2.22.3-1+cuda12.4 cuda-toolkit-12.4
sudo reboot
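After the reboot, a quick way to confirm the driver, toolkit, and NCCL line up — a minimal sketch, assuming a PyTorch build targeting CUDA 12.4 is installed:

# Post-upgrade sanity check (assumes PyTorch built against CUDA 12.4)
import torch
print(torch.version.cuda)             # CUDA version PyTorch was compiled with
print(torch.cuda.is_available())      # True once the 550 driver is loaded
print(torch.cuda.get_device_name(0))  # GPU model
print(torch.cuda.nccl.version())      # e.g. (2, 22, 3)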
functionstackx / amd_sweep.py
Last active August 8, 2024 19:05
matmul sweep
# Env flags must be set before torch is imported:
#   DISABLE_ADDMM_HIP_LT=0      keep the hipBLASLt addmm path enabled on ROCm
#   PYTORCH_TUNABLEOP_ENABLED=1 enable TunableOp GEMM autotuning
import os
os.environ["DISABLE_ADDMM_HIP_LT"] = "0"
os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"

import time
import torch
import torch.utils.benchmark as benchmark
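The preview ends before the sweep itself. A minimal sketch of a matmul sweep under TunableOp — the sizes and timing harness are assumptions, not the gist's code:

# Hypothetical sweep: with TunableOp on, the first matmul per shape triggers
# autotuning, so the timings below reflect the tuned kernel.
for n in [1024, 2048, 4096, 8192]:
    a = torch.randn(n, n, device="cuda", dtype=torch.bfloat16)
    b = torch.randn(n, n, device="cuda", dtype=torch.bfloat16)
    m = benchmark.Timer(stmt="a @ b", globals={"a": a, "b": b}).timeit(100)
    print(n, f"{2 * n**3 / m.mean / 1e12:.1f} TFLOPS")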
functionstackx / mem_bw.py
Created August 7, 2024 21:31
H100 Mem BW
import torch
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Range of message sizes in bytes: 1 B up to 16 GiB (2**0 .. 2**34)
sizes = [2**i for i in range(0, 35)]
# Number of timed iterations per size
iterations = 1000
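The preview stops before the measurement loop. One way such a bandwidth sweep typically proceeds is sketched below — the device-to-device copy workload and CUDA-event timing are assumptions about how the gist continues:

# Hypothetical measurement loop: time device-to-device copies with CUDA
# events and convert to GB/s (counting read + write traffic).
bandwidths = []
for size in sizes:
    n = max(size // 4, 1)                 # number of float32 elements
    nbytes = n * 4
    src = torch.empty(n, dtype=torch.float32, device="cuda")
    dst = torch.empty_like(src)
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iterations):
        dst.copy_(src)
    end.record()
    torch.cuda.synchronize()
    seconds = start.elapsed_time(end) / 1e3   # elapsed_time returns ms
    bandwidths.append(2 * nbytes * iterations / seconds / 1e9)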