functionstackx's GitHub Gists
import time
import torch
import tabulate
from triton.testing import do_bench
import torch.nn.functional as F

torch.manual_seed(0)
repeats = 200            # timed iterations per do_bench measurement
warmup = 30              # warmup iterations before timing starts
dtype = torch.bfloat16   # benchmark in bf16
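The preview cuts off before the measurement loop. A minimal sketch of how a do_bench-based TFLOPS measurement typically continues, assuming square bf16 matmuls and a tabulate summary (the shapes and table layout are assumptions, not the gist's actual contents):

# Hypothetical continuation: benchmark square bf16 matmuls and report TFLOPS.
results = []
for n in [1024, 2048, 4096, 8192]:
    a = torch.randn(n, n, device="cuda", dtype=dtype)
    b = torch.randn(n, n, device="cuda", dtype=dtype)
    ms = do_bench(lambda: a @ b, warmup=warmup, rep=repeats)
    tflops = 2 * n**3 / (ms * 1e-3) / 1e12   # 2*N^3 FLOPs per N x N matmul
    results.append([n, f"{ms:.3f}", f"{tflops:.1f}"])
print(tabulate.tabulate(results, headers=["N", "ms", "TFLOPS"]))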
functionstackx / tflops_iter.py
Created August 15, 2024 22:12
import time
import torch
import torch.utils.benchmark as benchmark
import matplotlib.pyplot as plt
import numpy as np
# patch of https://github.com/triton-lang/triton/pull/4493
def do_bench(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, fast_flush=True, return_mode="mean"):
"""
Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
the 20-th and 80-th performance percentile.
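The body of the patched do_bench is cut off by the preview; it follows the linked PR. A hedged usage sketch once the full definition is in place (the workload here is illustrative, not from the gist):

# Illustrative call, assuming the full patched do_bench body from the PR:
x = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)
ms = do_bench(lambda: x @ x, warmup=25, rep=100, return_mode="mean")
print(f"mean runtime: {ms:.3f} ms")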
functionstackx / upgrade-cuda.bash
Last active August 15, 2024 01:28
upgrade-cuda
# Purge every NVIDIA/CUDA/NCCL package (including held ones), then clear holds
sudo apt-get purge -y --allow-change-held-packages "*nvidia*" "*cuda*" "*nccl*" && sudo apt-get -y --allow-change-held-packages autoremove && sudo apt-mark showhold | xargs -r sudo apt-mark unhold

# Build the distro string NVIDIA's repo layout expects, e.g. "ubuntu2204"
distribution=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.//g')

# Install NVIDIA's CUDA repository keyring and refresh the package index
wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt update

# Install the 550 driver + Fabric Manager, pinned NCCL 2.22.3, and CUDA 12.4
sudo apt-get install -y cuda-drivers-fabricmanager-550 libnccl2=2.22.3-1+cuda12.4 libnccl-dev=2.22.3-1+cuda12.4 cuda-toolkit-12.4
sudo reboot
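After the reboot, a quick way to confirm the driver, toolkit, and NCCL line up — a minimal sketch, assuming a PyTorch build targeting CUDA 12.4 is installed:

# Post-upgrade sanity check (assumes PyTorch built against CUDA 12.4)
import torch
print(torch.version.cuda)             # CUDA version PyTorch was compiled with
print(torch.cuda.is_available())      # True once the 550 driver is loaded
print(torch.cuda.get_device_name(0))  # GPU model
print(torch.cuda.nccl.version())      # e.g. (2, 22, 3)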
functionstackx / amd_sweep.py
Last active August 8, 2024 19:05
matmul sweep
# Env flags must be set before torch is imported:
#   DISABLE_ADDMM_HIP_LT=0      keep the hipBLASLt addmm path enabled on ROCm
#   PYTORCH_TUNABLEOP_ENABLED=1 enable TunableOp GEMM autotuning
import os
os.environ["DISABLE_ADDMM_HIP_LT"] = "0"
os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"

import time
import torch
import torch.utils.benchmark as benchmark
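The preview ends before the sweep itself. A minimal sketch of a matmul sweep under TunableOp — the sizes and timing harness are assumptions, not the gist's code:

# Hypothetical sweep: with TunableOp on, the first matmul per shape triggers
# autotuning, so the timings below reflect the tuned kernel.
for n in [1024, 2048, 4096, 8192]:
    a = torch.randn(n, n, device="cuda", dtype=torch.bfloat16)
    b = torch.randn(n, n, device="cuda", dtype=torch.bfloat16)
    m = benchmark.Timer(stmt="a @ b", globals={"a": a, "b": b}).timeit(100)
    print(n, f"{2 * n**3 / m.mean / 1e12:.1f} TFLOPS")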
functionstackx / mem_bw.py
Created August 7, 2024 21:31
H100 Mem BW
import torch
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Range of message sizes in bytes: 1 B up to 16 GiB (2**0 .. 2**34)
sizes = [2**i for i in range(0, 35)]
# Number of timed iterations per size
iterations = 1000
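The preview stops before the measurement loop. One way such a bandwidth sweep typically proceeds is sketched below — the device-to-device copy workload and CUDA-event timing are assumptions about how the gist continues:

# Hypothetical measurement loop: time device-to-device copies with CUDA
# events and convert to GB/s (counting read + write traffic).
bandwidths = []
for size in sizes:
    n = max(size // 4, 1)                 # number of float32 elements
    nbytes = n * 4
    src = torch.empty(n, dtype=torch.float32, device="cuda")
    dst = torch.empty_like(src)
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iterations):
        dst.copy_(src)
    end.record()
    torch.cuda.synchronize()
    seconds = start.elapsed_time(end) / 1e3   # elapsed_time returns ms
    bandwidths.append(2 * nbytes * iterations / seconds / 1e9)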