cuda_gdb debugging
TorchInductor output code for the backward graph (AOT ID '13_backward'), as captured in the rank 0 debug log (pid 27347, torch/_inductor/codecache.py:1188, [__output_code], V0911 10:18:41):

# AOT ID: ['13_backward']
from ctypes import c_void_p, c_long, c_int
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from cmath import nanj
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
import triton
import triton.language as tl
from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
from torch._C import _cuda_getCurrentRawStream as get_raw_stream

aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
assert_alignment = torch._C._dynamo.guards.assert_alignment
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
alloc_from_pool = torch.ops.inductor._alloc_from_pool
async_compile = AsyncCompile()
empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p

# kernel path: /tmp/torchinductor_root/ms/cmscq6hemtiewqcc52xcqrgfz4bkqd2iitx6c6t7eznluv7fi2ip.py
# Topologically Sorted Source Nodes: [], Original ATen: [aten.mm]
# Source node to ATen node mapping:
#  => constant_pad_nd_default_1
# Graph fragment:
#   %constant_pad_nd_default_1 : [num_users=1] = call_function[target=torch.ops.aten.constant_pad_nd.default](args = (%permute_3, [0, 0, 0, 7]), kwargs = {})
triton_poi_fused_mm_0 = async_compile.triton('triton_poi_fused_mm_0', '''
import triton
import triton.language as tl

from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
triton_helpers.set_driver_to_gpu()

@triton_heuristics.pointwise(
    size_hints={'x': 67108864},
    filename=__file__,
    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=148, cc=100, major=10, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'E74AF08FEBEF1F0F99888D35475CC9BF2391352C6B703B6DC1A406265B028997', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
    min_elem_per_thread=0
)
@triton.jit
def triton_poi_fused_mm_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 38602752
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x1 = xindex // 768
    x2 = xindex
    tmp0 = x1
    tmp1 = tl.full([1], 50257, tl.int64)
    tmp2 = tmp0 < tmp1
    tmp3 = tl.load(in_ptr0 + (x2), xmask & tmp2, other=0.0).to(tl.float32)
    tl.store(out_ptr0 + (x2), tmp3, xmask)
''', device_str='cuda')
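
# What triton_poi_fused_mm_0 implements, as a minimal eager-mode sketch
# (`w` is an illustrative name for the bf16 (50257, 768) %permute_3 input,
# not an identifier from the graph):
#
#     w_padded = torch.nn.functional.pad(w, (0, 0, 0, 7))  # -> (50264, 768)
#
# i.e. constant_pad_nd with pad [0, 0, 0, 7]: seven zero rows are appended so
# the row count becomes 50264, a multiple of 8, for the padded mm below;
# xnumel = 50264 * 768 = 38602752 matches the hard-coded element count.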

# kernel path: /tmp/torchinductor_root/jp/cjpwvnz2xrsi3ownjfjlmvlp46tajmsz36xltr4jnrxemjle2osn.py
# Topologically Sorted Source Nodes: [cross_entropy, scatter, convert_element_type_default, view_5], Original ATen: [aten.nll_loss_backward, aten.nll_loss_forward, aten._log_softmax, aten._to_copy, aten._log_softmax_backward_data]
# Source node to ATen node mapping:
#   convert_element_type_default => mul
#   cross_entropy => convert_element_type_4, convert_element_type_5, convert_element_type_6, full_default, full_default_1, sub, sub_1
#   scatter => scatter_upon_const_tensor
#   view_5 => convert_element_type_11
# Graph fragment:
#   %div_1 : [num_users=1] = call_function[target=torch.ops.aten.div.Tensor](args = (%tangents_1, %convert_element_type_7), kwargs = {})
#   %ne_3 : [num_users=2] = call_function[target=torch.ops.aten.ne.Scalar](args = (%unsqueeze_1, -1), kwargs = {})
#   %full_default : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([], 0), kwargs = {dtype: torch.int64, layout: torch.strided, device: cuda:0, pin_memory: False})
#   %where_2 : [num_users=1] = call_function[target=torch.ops.aten.where.self](args = (%ne_3, %unsqueeze_1, %full_default), kwargs = {})
#   %scatter_upon_const_tensor : [num_users=1] = call_function[target=torch._inductor.fx_passes.post_grad.scatter_upon_const_tensor](args = (), kwargs = {shape: [61440, 50257], background_val: 0, dtype: torch.float32, dim: 1, selector: %where_2, val: -1.0})
#   %full_default_1 : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([], 0.0), kwargs = {dtype: torch.float32, layout: torch.strided, device: cuda:0, pin_memory: False})
#   %where_3 : [num_users=1] = call_function[target=torch.ops.aten.where.self](args = (%ne_3, %div_1, %full_default_1), kwargs = {})
#   %mul : [num_users=2] = call_function[target=torch.ops.aten.mul.Tensor](args = (%scatter_upon_const_tensor, %where_3), kwargs = {})
#   %convert_element_type_4 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_2, torch.float32), kwargs = {})
#   %sub : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%convert_element_type_4, %amax), kwargs = {})
#   %sub_1 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%sub, %log), kwargs = {})
#   %convert_element_type_5 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%sub_1, torch.bfloat16), kwargs = {})
#   %convert_element_type_6 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%convert_element_type_5, torch.float32), kwargs = {})
#   %exp_1 : [num_users=1] = call_function[target=torch.ops.aten.exp.default](args = (%convert_element_type_6,), kwargs = {})
#   %sum_5 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul, [1], True), kwargs = {})
#   %mul_1 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%exp_1, %sum_5), kwargs = {})
#   %sub_2 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul, %mul_1), kwargs = {})
#   %convert_element_type_11 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%sub_2, torch.bfloat16), kwargs = {})
triton_red_fused__log_softmax__log_softmax_backward_data__to_copy_nll_loss_backward_nll_loss_forward_1 = async_compile.triton('triton_red_fused__log_softmax__log_softmax_backward_data__to_copy_nll_loss_backward_nll_loss_forward_1', '''
import triton
import triton.language as tl

from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
triton_helpers.set_driver_to_gpu()

@triton_heuristics.reduction(
    size_hints={'x': 65536, 'r0_': 65536},
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*i64', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i64', 'r0_numel': 'i64', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=148, cc=100, major=10, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__log_softmax__log_softmax_backward_data__to_copy_nll_loss_backward_nll_loss_forward_1', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 8, 'num_reduction': 1, 'backend_hash': 'E74AF08FEBEF1F0F99888D35475CC9BF2391352C6B703B6DC1A406265B028997', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
)
@triton.jit
def triton_red_fused__log_softmax__log_softmax_backward_data__to_copy_nll_loss_backward_nll_loss_forward_1(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
    xnumel = 61440
    r0_numel = 50257
    rnumel = r0_numel
    RBLOCK: tl.constexpr = R0_BLOCK
    xoffset = tl.program_id(0).to(tl.int64) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None].to(tl.int64)
    xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
    r0_base = tl.arange(0, R0_BLOCK)[None, :].to(tl.int64)
    rbase = r0_base
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), None, eviction_policy='evict_last')
    tmp10 = tl.load(in_ptr1 + (0))
    tmp11 = tl.broadcast_to(tmp10, [XBLOCK, R0_BLOCK])
    tmp12 = tl.load(in_ptr2 + (0))
    tmp13 = tl.broadcast_to(tmp12, [XBLOCK, R0_BLOCK])
    _tmp18 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
    for r0_offset in range(0, r0_numel, R0_BLOCK):
        r0_index = r0_offset + r0_base
        r0_mask = r0_index < r0_numel
        roffset = r0_offset
        rindex = r0_index
        r0_1 = r0_index
        tmp1 = tl.full([1, 1], -1, tl.int64)
        tmp2 = tmp0 != tmp1
        tmp3 = tl.full([1, 1], 0, tl.int64)
        tmp4 = tl.where(tmp2, tmp0, tmp3)
        tmp5 = r0_1
        tmp6 = tmp4 == tmp5
        tmp7 = -1.0
        tmp8 = 0.0
        tmp9 = tl.where(tmp6, tmp7, tmp8)
        tmp14 = (tmp11 / tmp13)
        tmp15 = tl.where(tmp2, tmp14, tmp8)
        tmp16 = tmp9 * tmp15
        tmp17 = tl.broadcast_to(tmp16, [XBLOCK, R0_BLOCK])
        tmp19 = _tmp18 + tmp17
        _tmp18 = tl.where(r0_mask, tmp19, _tmp18)
    tmp18 = tl.sum(_tmp18, 1)[:, None]
    tmp29 = tl.load(in_ptr1 + (0))
    tmp30 = tl.broadcast_to(tmp29, [XBLOCK, R0_BLOCK])
    tmp31 = tl.load(in_ptr2 + (0))
    tmp32 = tl.broadcast_to(tmp31, [XBLOCK, R0_BLOCK])
    tmp38 = tl.load(in_ptr3 + (x0), None, eviction_policy='evict_last')
    tmp40 = tl.load(in_ptr4 + (x0), None, eviction_policy='evict_last')
    for r0_offset in range(0, r0_numel, R0_BLOCK):
        r0_index = r0_offset + r0_base
        r0_mask = r0_index < r0_numel
        roffset = r0_offset
        rindex = r0_index
        r0_1 = r0_index
        tmp36 = tl.load(in_out_ptr0 + (r0_1 + 50304*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
        tmp20 = tl.full([1, 1], -1, tl.int64)
        tmp21 = tmp0 != tmp20
        tmp22 = tl.full([1, 1], 0, tl.int64)
        tmp23 = tl.where(tmp21, tmp0, tmp22)
        tmp24 = r0_1
        tmp25 = tmp23 == tmp24
        tmp26 = -1.0
        tmp27 = 0.0
        tmp28 = tl.where(tmp25, tmp26, tmp27)
        tmp33 = (tmp30 / tmp32)
        tmp34 = tl.where(tmp21, tmp33, tmp27)
        tmp35 = tmp28 * tmp34
        tmp37 = tmp36.to(tl.float32)
        tmp39 = tmp37 - tmp38
        tmp41 = tmp39 - tmp40
        tmp42 = tmp41.to(tl.float32)
        tmp43 = tmp42.to(tl.float32)
        tmp44 = tl_math.exp(tmp43)
        tmp45 = tmp44 * tmp18
        tmp46 = tmp35 - tmp45
        tmp47 = tmp46.to(tl.float32)
        tl.store(in_out_ptr0 + (r0_1 + 50304*x0), tmp47, r0_mask)
''', device_str='cuda')
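
# What the fused reduction above computes per row (61440 rows = 60 * 1024
# tokens, 50257-wide vocab stored with row stride 50304), as a minimal
# eager-mode sketch; `x`, `tgt`, `g`, `n` are illustrative names for
# in_out_ptr0, in_ptr0 (primals_3), in_ptr1 (tangents_1), and
# in_ptr2 (convert_element_type_7), not identifiers from the graph:
#
#     dy   = -torch.nn.functional.one_hot(tgt.clamp(min=0), 50257) \
#            * torch.where(tgt != -1, g / n, 0.0)            # nll_loss backward
#     logp = x.float() - amax - log                          # log-softmax recomputed from saved stats
#     dx   = dy - torch.exp(logp) * dy.sum(-1, keepdim=True) # _log_softmax_backward_data
#
# The first loop accumulates dy.sum() into _tmp18; the second recomputes logp
# from the saved amax/log and overwrites the bf16 logits in place with dx.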

# kernel path: /tmp/torchinductor_root/t6/ct6xhq5gq2cgp5hkswu325itqdthyt5houpbogw2z6mocmfnhzi5.py
# Topologically Sorted Source Nodes: [], Original ATen: [aten.mm]
# Source node to ATen node mapping:
#  => constant_pad_nd_default_2
# Graph fragment:
#   %constant_pad_nd_default_2 : [num_users=1] = call_function[target=torch.ops.aten.constant_pad_nd.default](args = (%permute_1, [0, 0, 0, 7]), kwargs = {})
triton_poi_fused_mm_2 = async_compile.triton('triton_poi_fused_mm_2', '''
import triton
import triton.language as tl

from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
triton_helpers.set_driver_to_gpu()

@triton_heuristics.pointwise(
    size_hints={'x': 4294967296},
    filename=__file__,
    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i64', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=148, cc=100, major=10, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mm_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'E74AF08FEBEF1F0F99888D35475CC9BF2391352C6B703B6DC1A406265B028997', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
    min_elem_per_thread=0
)
@triton.jit
def triton_poi_fused_mm_2(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 3088220160
    xoffset = tl.program_id(0).to(tl.int64) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:].to(tl.int64)
    xmask = tl.full([XBLOCK], True, tl.int1)
    x0 = (xindex % 50264)
    x1 = xindex // 50264
    tmp0 = x0
    tmp1 = tl.full([1], 50257, tl.int64)
    tmp2 = tmp0 < tmp1
    tmp3 = tl.load(in_ptr0 + (x0 + 50304*x1), tmp2, other=0.0).to(tl.float32)
    tl.store(out_ptr0 + (x0 + 50304*x1), tmp3, None)
''', device_str='cuda')
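
# Same padding pattern as triton_poi_fused_mm_0, now applied to %permute_1:
# the vocab axis of the transposed gradient grows from 50257 to 50264 entries
# within the existing 50304-element stride, with zeros in the pad slots.
# xnumel = 61440 * 50264 = 3088220160 exceeds 2**31, which is why this
# kernel's signature types 'xnumel' as i64 and casts its indices .to(tl.int64).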

# kernel path: /tmp/torchinductor_root/yw/cywftjyeclmllme7plpmhde2rtumxi3sawrretbg3ek77lvkrmsv.py
# Topologically Sorted Source Nodes: [], Original ATen: [aten.mm]
# Source node to ATen node mapping:
#  => constant_pad_nd_default
# Graph fragment:
#   %constant_pad_nd_default : [num_users=1] = call_function[target=torch.ops.aten.constant_pad_nd.default](args = (%convert_element_type_11, [0, 7, 0, 0]), kwargs = {})
triton_poi_fused_mm_3 = async_compile.triton('triton_poi_fused_mm_3', '''
import triton
import triton.language as tl

from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
triton_helpers.set_driver_to_gpu()

@triton_heuristics.pointwise(
    size_hints={'x': 524288},
    filename=__file__,
    triton_meta={'signature': {'out_ptr0': '*bf16', 'xnumel': 'i64', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=148, cc=100, major=10, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]]}]},
    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mm_3', 'mutated_arg_names': ['out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 0, 'num_reduction': 0, 'backend_hash': 'E74AF08FEBEF1F0F99888D35475CC9BF2391352C6B703B6DC1A406265B028997', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
    min_elem_per_thread=0
)
@triton.jit
def triton_poi_fused_mm_3(out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 430080
    xoffset = tl.program_id(0).to(tl.int64) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:].to(tl.int64)
    xmask = tl.full([XBLOCK], True, tl.int1)
    x0 = (xindex % 7)
    x1 = xindex // 7
    tmp0 = 0.0
    tl.store(out_ptr0 + (50257 + x0 + 50304*x1), tmp0, None)
''', device_str='cuda')
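
# Companion to the pad kernels above: since %convert_element_type_11 (the dx
# written by the fused reduction) already lives in a 50304-stride buffer,
# realizing constant_pad_nd(..., [0, 7, 0, 0]) only requires zeroing the
# seven pad columns (offsets 50257..50263 of each row) in place:
# 61440 * 7 = 430080 stores and no loads (num_load: 0,
# mutated_arg_names: ['out_ptr0']).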

# kernel path: /tmp/torchinductor_root/4b/c4bwl7j636of4bx37f7h2qgrvfloe732nwq2y7uel2d522jevgrr.py
# Topologically Sorted Source Nodes: [], Original ATen: [aten._to_copy]
# Source node to ATen node mapping:
# Graph fragment:
#   %convert_element_type_17 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%slice_tensor, torch.float32), kwargs = {})
triton_poi_fused__to_copy_4 = async_compile.triton('triton_poi_fused__to_copy_4', '''
import triton
import triton.language as tl

from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
triton_helpers.set_driver_to_gpu()

@triton_heuristics.pointwise(
    size_hints={'x': 67108864},
    filename=__file__,
    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=148, cc=100, major=10, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_4', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'E74AF08FEBEF1F0F99888D35475CC9BF2391352C6B703B6DC1A406265B028997', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
    min_elem_per_thread=0
)
@triton.jit
def triton_poi_fused__to_copy_4(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 38597376
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0), tmp1, xmask)
''', device_str='cuda')
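
# Plain upcast kernel for %slice_tensor: load bf16, store fp32, over
# 50257 * 768 = 38597376 elements. A one-line eager sketch (`t` is an
# illustrative name for the bf16 input):
#
#     t32 = t.float()   # prims.convert_element_type(t, torch.float32)
#
# triton_poi_fused__to_copy_5 below repeats the pattern for %view_6
# (61440 * 768 = 47185920 elements).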

# kernel path: /tmp/torchinductor_root/ld/cldyijt5qjsmvs6wuvjuul7ss34fij7xa5x62v7wnb3nurhmjpn6.py
# Topologically Sorted Source Nodes: [], Original ATen: [aten._to_copy]
# Source node to ATen node mapping:
# Graph fragment:
#   %convert_element_type_16 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_6, torch.float32), kwargs = {})
triton_poi_fused__to_copy_5 = async_compile.triton('triton_poi_fused__to_copy_5', '''
import triton
import triton.language as tl

from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
triton_helpers.set_driver_to_gpu()

@triton_heuristics.pointwise(
    size_hints={'x': 67108864},
    filename=__file__,
    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=148, cc=100, major=10, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_5', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'E74AF08FEBEF1F0F99888D35475CC9BF2391352C6B703B6DC1A406265B028997', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
    min_elem_per_thread=0
)
@triton.jit
def triton_poi_fused__to_copy_5(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 47185920
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = tl.full([XBLOCK], True, tl.int1)
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
    tmp1 = tmp0.to(tl.float32)
    tl.store(out_ptr0 + (x0), tmp1, None)
''', device_str='cuda')

async_compile.wait(globals())
del async_compile

def call(args):
    primals_3, view, mm_default_2, amax, log, convert_element_type_7, permute_3, tangents_1 = args
    args.clear()
    assert_size_stride(primals_3, (60, 1024), (1024, 1))
    assert_size_stride(view, (61440, 768), (768, 1))
    assert_size_stride(mm_default_2, (61440, 50264), (50304, 1))
    assert_size_stride(amax, (61440, 1), (1, 1))
    assert_size_stride(log, (61440, 1), (1, 1))
    assert_size_stride(convert_element_type_7, (), ())
    assert_size_stride(permute_3, (50257, 768), (768, 1))
    assert_size_stride(tangents_1, (), ())
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        buf5 = empty_strided_cuda((50264, 768), (768, 1), torch.bfloat16)
        # Topologically Sorted Source Nodes: [], Original ATen: [aten.mm]
        stream0 = get_raw_stream(0)
        triton_poi_fused_mm_0.run(permute_3, buf5, 38602752, stream=stream0)
        del permute_3
        buf1 = reinterpret_tensor(mm_default_2, (61440, 50257), (50304, 1), 0); del mm_default_2  # reuse
        # Topologically Sorted Source Nodes: [cross_entropy, scatter, convert_element_type_default, view_5], Original ATen: [aten.nll_loss_backward, aten.nll_loss_forward, aten._log_softmax, aten._to_copy, aten._log_softmax_backward_data]
        stream0 = get_raw_stream(0)
        triton_red_fused__log_softmax__log_softmax_backward_data__to_copy_nll_loss_backward_nll_loss_forward_1.run(buf1, primals_3, tangents_1, convert_element_type_7, amax, log, 61440, 50257, stream=stream0)
        del amax
        del convert_element_type_7
        del log
        del primals_3
        del tangents_1
        buf2 = empty_strided_cuda((50264, 61440), (1, 50304), torch.bfloat16)
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Topologically Sorted Source Nodes: [], Original ATen: [aten.mm] | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] stream0 = get_raw_stream(0) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] triton_poi_fused_mm_2.run(buf1, buf2, 3088220160, stream=stream0) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Topologically Sorted Source Nodes: [], Original ATen: [aten.mm] | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] stream0 = get_raw_stream(0) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] triton_poi_fused_mm_3.run(buf1, 430080, stream=stream0) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] buf3 = empty_strided_cuda((50264, 768), (768, 1), torch.bfloat16) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Topologically Sorted Source Nodes: [], Original ATen: [aten.mm] | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] extern_kernels.mm(buf2, view, out=buf3) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] del buf2 | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] del view | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] buf6 = empty_strided_cuda((61440, 768), (768, 1), torch.bfloat16) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Topologically Sorted Source Nodes: [, mm], Original ATen: [aten.mm] | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] extern_kernels.mm(reinterpret_tensor(buf1, (61440, 50264), (50304, 1), 0), buf5, out=buf6) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] del buf1 | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] del buf5 | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] buf8 = empty_strided_cuda((50257, 768), (768, 1), torch.float32) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Topologically Sorted Source Nodes: [], Original ATen: [aten._to_copy] | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] stream0 = get_raw_stream(0) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] triton_poi_fused__to_copy_4.run(buf3, buf8, 38597376, stream=stream0) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] del buf3 | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] buf7 = empty_strided_cuda((60, 1024, 768), (786432, 768, 1), torch.float32) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Topologically Sorted Source Nodes: [], Original ATen: [aten._to_copy] | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] stream0 = get_raw_stream(0) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] triton_poi_fused__to_copy_5.run(buf6, buf7, 47185920, stream=stream0) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] del buf6 | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] return (buf7, buf8, None, ) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] | |
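One detail worth flagging in `call()` above: `reinterpret_tensor(mm_default_2, (61440, 50257), (50304, 1), 0)` recycles the forward-pass logits buffer for the gradient with zero allocation, and the row stride stays 50304 even though only 50257 (or 50264 for the padded matmul) columns are logically live; the vocabulary has been padded up to an alignment-friendly width. A toy illustration of the same zero-copy strided view (the sizes 4/16/13 are made up for brevity):

```python
import torch

storage = torch.empty(4 * 16, dtype=torch.bfloat16)   # rows padded to width 16
padded = storage.view(4, 16)                           # (rows, padded_vocab)
logical = torch.as_strided(padded, size=(4, 13), stride=(16, 1))  # vocab = 13
assert logical.data_ptr() == padded.data_ptr()         # same memory, no copy
```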
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] def benchmark_compiled_module(times=10, repeat=10): | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._dynamo.testing import rand_strided | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.utils import print_performance | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] primals_3 = rand_strided((60, 1024), (1024, 1), device='cuda:0', dtype=torch.int64) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] view = rand_strided((61440, 768), (768, 1), device='cuda:0', dtype=torch.bfloat16) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] mm_default_2 = rand_strided((61440, 50264), (50304, 1), device='cuda:0', dtype=torch.bfloat16) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] amax = rand_strided((61440, 1), (1, 1), device='cuda:0', dtype=torch.float32) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] log = rand_strided((61440, 1), (1, 1), device='cuda:0', dtype=torch.float32) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] convert_element_type_7 = rand_strided((), (), device='cuda:0', dtype=torch.float32) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] permute_3 = rand_strided((50257, 768), (768, 1), device='cuda:0', dtype=torch.bfloat16) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] tangents_1 = rand_strided((), (), device='cuda:0', dtype=torch.float32) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] fn = lambda: call([primals_3, view, mm_default_2, amax, log, convert_element_type_7, permute_3, tangents_1]) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] return print_performance(fn, times=times, repeat=repeat) | |
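The benchmark harness reconstructs every input purely from the recorded (size, stride, dtype, device) tuples via `rand_strided`, so the backward graph can be replayed without the model or data. That also makes it easy to see what a single input costs: the padded logits buffer above holds 61440 × 50304 bf16 values, roughly 5.8 GiB. A small sketch (the prints are just for illustration):

```python
import torch
from torch._dynamo.testing import rand_strided

mm_default_2 = rand_strided((61440, 50264), (50304, 1),
                            device='cuda:0', dtype=torch.bfloat16)
print(mm_default_2.shape, mm_default_2.stride())
print(f"{61440 * 50304 * 2 / 2**30:.1f} GiB of backing storage")
```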
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] if __name__ == "__main__": | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.wrapper_benchmark import compiled_module_main | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] compiled_module_main('None', benchmark_compiled_module) | |
| [rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] | |
| [rank0]:V0911 10:18:42.005000 27347 torch/_inductor/codecache.py:1189] [0/0] [__output_code] Output code written to: /tmp/torchinductor_root/fl/cflma6p5e72qss2pb4zms4ed2cfcttdlkmklhjhb7yj5gouhmcrl.py | |
| [rank0]:W0911 10:18:42.013000 27347 torch/_inductor/debug.py:449] [0/0] model__13_backward_42 debug trace: /data/shitals/devbox/GitHubSrc/nanugpt/torch_compile_debug/run_2025_09_11_10_18_04_782379-pid_27347/torchinductor/model__13_backward_42.14 | |
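Both artifacts named above survive on disk: the wrapper file under /tmp/torchinductor_root and the per-graph debug trace. Because the wrapper carries its own `__main__` benchmark, the failing backward graph can be replayed in isolation when hunting the allocation failure below. A hedged replay sketch, assuming the file is still present (this re-executes the compiled kernels, so run it inside the debugger):

```python
import runpy

# Path taken verbatim from the log line above.
runpy.run_path(
    "/tmp/torchinductor_root/fl/cflma6p5e72qss2pb4zms4ed2cfcttdlkmklhjhb7yj5gouhmcrl.py",
    run_name="__main__",
)
```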
| warning: Cuda Driver error detected: Failed to allocate physical memory | |
| warning: Cuda Driver error detected: Returning 1 (CUDA_ERROR_INVALID_VALUE) from cuMemHostAlloc | |
| [Switching to Thread 0x7fff323ff6c0 (LWP 27593)] | |
| Cuda Runtime API error detected: cudaHostAlloc returned cudaErrorInvalidValue(CUresult=1): invalid argument | |
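The failure chain in the cuda-gdb output reads bottom-up: the driver cannot lock physical pages for a pinned host buffer, `cuMemHostAlloc` fails, and the runtime surfaces it from `cudaHostAlloc` as `cudaErrorInvalidValue`. Note the misleading error code: it suggests a bad argument when the real cause is exhausted lockable host memory (cuda-gdb stops at the failing call when API-failure trapping is enabled, e.g. via `set cuda api_failures stop`). A minimal probe of the same path, assuming the crash originates in a pinned allocation such as a `pin_memory=True` tensor or a pinned staging buffer (`size_gib` is hypothetical; pick something near the host's free RAM):

```python
import torch

size_gib = 8  # hypothetical size; pinned allocations go through cudaHostAlloc
try:
    pinned = torch.empty(size_gib << 30, dtype=torch.uint8, pin_memory=True)
    print(f"pinned {size_gib} GiB OK")
    del pinned
except RuntimeError as e:
    # On the failing machine this surfaces as "CUDA error: invalid argument".
    print(f"pinned allocation of {size_gib} GiB failed: {e}")
```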