Skip to content

Instantly share code, notes, and snippets.

(pytorch) [shunting@devgpu011.ldc3 ~/ws/helion (all-reduce)]$ python test/test_distributed.py -k TestDistributed.test_matmul_reduce_scatter_autotuner_LFBOTreeSearch
INFO: Started process 0 with pid 2143318
INFO: Started process 1 with pid 2143323
INFO: Started process 2 with pid 2143330
INFO: Started process 3 with pid 2143331
NCCL version 2.28.9+cuda12.9
[rank2]:[W310 23:33:39.020723478 CUDASymmetricMemory.cu:804] Warning: Pointer not within any SymmetricMemory allocation, is the tensor allocated from SymmetricMemory? (function rendezvous)
[rank2]:[W310 23:33:39.020748826 CUDASymmetricMemory.cu:804] Warning: Pointer not within any SymmetricMemory allocation, is the tensor allocated from SymmetricMemory? (function rendezvous)
[rank1]:[W310 23:33:39.030396315 CUDASymmetricMemory.cu:804] Warning: Pointer not within any SymmetricMemory allocation, is the tensor allocated from SymmetricMemory? (function rendezvous)
[rank1]:[W310 23:33:39.030415704 CUDASymmetricMemory.cu:804] Warning: Pointer not within any Symmet
from __future__ import annotations
import torch
import helion.language as hl
import triton
import triton.language as tl
from torch._inductor.runtime.triton_compat import libdevice
from helion.runtime import default_launcher as _default_launcher
import __main__ as _source_module
import triton
import triton.language as tl
from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
triton_helpers.set_driver_to_gpu()
import triton
import triton.language as tl
from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
triton_helpers.set_driver_to_gpu()
import torch
from torch import nn
from torch import distributed
import contextlib
import os
from vllm import LLM, SamplingParams
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
diff --git a/vllm/compilation/cuda_graph.py b/vllm/compilation/cuda_graph.py
index 0748643a5..bbab180ae 100644
--- a/vllm/compilation/cuda_graph.py
+++ b/vllm/compilation/cuda_graph.py
@@ -7,6 +7,7 @@ from collections.abc import Callable
from contextlib import ExitStack
from typing import Any
from unittest.mock import patch
+from vllm.forward_context import ForwardContext, get_forward_context
This file has been truncated, but you can view the full file.
--- /home/shunting/runnable.py 2026-03-02 15:48:29.094844374 -0800
+++ /home/shunting/old_runnable.py 2026-03-02 15:51:57.647109947 -0800
@@ -5,14 +5,12 @@
os.environ['TORCHELASTIC_ENABLE_FILE_TIMER'] = '1'
os.environ['TORCH_NCCL_DESYNC_DEBUG'] = '1'
os.environ['TORCH_NCCL_RETHROW_CUDA_ERRORS'] = '0'
-os.environ['TORCHX_INTERNAL_SESSION_ID'] = '05e3f00a-a074-49de-a031-c987eb224489'
+os.environ['TORCHX_INTERNAL_SESSION_ID'] = '9f52e4a7-0c61-472a-8fb2-b3a3e913eb2e'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
-os.environ['TORCHX_JOB_ID'] = 'quickflow://aps/aps-f1042578221-1042601092'
# AOT ID: ['0_backward']
from ctypes import c_void_p, c_long, c_int
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from cmath import nanj
from torch._inductor.hooks import run_intermediate_hooks
graph():
%x : [num_users=1] = call_function[target=helion.language._tracing_ops._host_tensor](args = (x,), kwargs = {})
%block_size_0 : [num_users=3] = call_function[target=helion.language._tracing_ops._get_symnode](args = (block_size_0,), kwargs = {})
%block_size_1 : [num_users=3] = call_function[target=helion.language._tracing_ops._get_symnode](args = (block_size_1,), kwargs = {})
%load : [num_users=1] = call_function[target=helion.language.memory_ops.load](args = (%x, [%block_size_0, %block_size_1], None, None), kwargs = {}
)
%y : [num_users=1] = call_function[target=helion.language._tracing_ops._host_tensor](args = (y,), kwargs = {})
%load_1 : [num_users=1] = call_function[target=helion.language.memory_ops.load](args = (%y, [%block_size_0, %block_size_1], None, None), kwargs =
{})
%add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%load, %load_1), kwargs = {})
import torch
import json
N = 2 ** 28
glist = []
mempool = torch.cuda.graph_pool_handle()
for _ in range(3):
g = torch.cuda.CUDAGraph()
glist.append(g)