sytelus/stdout.log

## stdout.log
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] Output code:
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # AOT ID: ['13_backward']
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from ctypes import c_void_p, c_long, c_int
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] import torch
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] import math
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] import random
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] import os
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] import tempfile
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from math import inf, nan
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from cmath import nanj
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.utils import maybe_profile
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch import device, empty_strided
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.async_compile import AsyncCompile
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.select_algorithm import extern_kernels
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] import triton
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] import triton.language as tl
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] aten = torch.ops.aten
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] inductor_ops = torch.ops.inductor
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] _quantized = torch.ops._quantized
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] assert_alignment = torch._C._dynamo.guards.assert_alignment
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] async_compile = AsyncCompile()
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # kernel path: /tmp/torchinductor_root/ms/cmscq6hemtiewqcc52xcqrgfz4bkqd2iitx6c6t7eznluv7fi2ip.py
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Topologically Sorted Source Nodes: [], Original ATen: [aten.mm]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Source node to ATen node mapping:
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #    => constant_pad_nd_default_1
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Graph fragment:
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %constant_pad_nd_default_1 : [num_users=1] = call_function[target=torch.ops.aten.constant_pad_nd.default](args = (%permute_3, [0, 0, 0, 7]), kwargs = {})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] triton_poi_fused_mm_0 = async_compile.triton('triton_poi_fused_mm_0', '''
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] import triton
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] import triton.language as tl
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] triton_helpers.set_driver_to_gpu()
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] @triton_heuristics.pointwise(
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     size_hints={'x': 67108864},
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     filename=__file__,
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=148, cc=100, major=10, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'E74AF08FEBEF1F0F99888D35475CC9BF2391352C6B703B6DC1A406265B028997', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     min_elem_per_thread=0
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] )
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] @triton.jit
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] def triton_poi_fused_mm_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xnumel = 38602752
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xmask = xindex < xnumel
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     x1 = xindex // 768
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     x2 = xindex
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp0 = x1
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp1 = tl.full([1], 50257, tl.int64)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp2 = tmp0 < tmp1
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp3 = tl.load(in_ptr0 + (x2), xmask & tmp2, other=0.0).to(tl.float32)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tl.store(out_ptr0 + (x2), tmp3, xmask)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] ''', device_str='cuda')
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # kernel path: /tmp/torchinductor_root/jp/cjpwvnz2xrsi3ownjfjlmvlp46tajmsz36xltr4jnrxemjle2osn.py
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Topologically Sorted Source Nodes: [cross_entropy, scatter, convert_element_type_default, view_5], Original ATen: [aten.nll_loss_backward, aten.nll_loss_forward, aten._log_softmax, aten._to_copy, aten._log_softmax_backward_data]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Source node to ATen node mapping:
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   convert_element_type_default => mul
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   cross_entropy => convert_element_type_4, convert_element_type_5, convert_element_type_6, full_default, full_default_1, sub, sub_1
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   scatter => scatter_upon_const_tensor
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   view_5 => convert_element_type_11
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Graph fragment:
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %div_1 : [num_users=1] = call_function[target=torch.ops.aten.div.Tensor](args = (%tangents_1, %convert_element_type_7), kwargs = {})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %ne_3 : [num_users=2] = call_function[target=torch.ops.aten.ne.Scalar](args = (%unsqueeze_1, -1), kwargs = {})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %full_default : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([], 0), kwargs = {dtype: torch.int64, layout: torch.strided, device: cuda:0, pin_memory: False})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %where_2 : [num_users=1] = call_function[target=torch.ops.aten.where.self](args = (%ne_3, %unsqueeze_1, %full_default), kwargs = {})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %scatter_upon_const_tensor : [num_users=1] = call_function[target=torch._inductor.fx_passes.post_grad.scatter_upon_const_tensor](args = (), kwargs = {shape: [61440, 50257], background_val: 0, dtype: torch.float32, dim: 1, selector: %where_2, val: -1.0})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %full_default_1 : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([], 0.0), kwargs = {dtype: torch.float32, layout: torch.strided, device: cuda:0, pin_memory: False})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %where_3 : [num_users=1] = call_function[target=torch.ops.aten.where.self](args = (%ne_3, %div_1, %full_default_1), kwargs = {})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %mul : [num_users=2] = call_function[target=torch.ops.aten.mul.Tensor](args = (%scatter_upon_const_tensor, %where_3), kwargs = {})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %convert_element_type_4 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_2, torch.float32), kwargs = {})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %sub : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%convert_element_type_4, %amax), kwargs = {})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %sub_1 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%sub, %log), kwargs = {})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %convert_element_type_5 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%sub_1, torch.bfloat16), kwargs = {})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %convert_element_type_6 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%convert_element_type_5, torch.float32), kwargs = {})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %exp_1 : [num_users=1] = call_function[target=torch.ops.aten.exp.default](args = (%convert_element_type_6,), kwargs = {})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %sum_5 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul, [1], True), kwargs = {})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %mul_1 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%exp_1, %sum_5), kwargs = {})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %sub_2 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%mul, %mul_1), kwargs = {})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %convert_element_type_11 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%sub_2, torch.bfloat16), kwargs = {})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] triton_red_fused__log_softmax__log_softmax_backward_data__to_copy_nll_loss_backward_nll_loss_forward_1 = async_compile.triton('triton_red_fused__log_softmax__log_softmax_backward_data__to_copy_nll_loss_backward_nll_loss_forward_1', '''
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] import triton
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] import triton.language as tl
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] triton_helpers.set_driver_to_gpu()
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] @triton_heuristics.reduction(
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     size_hints={'x': 65536, 'r0_': 65536},
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     reduction_hint=ReductionHint.INNER,
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     filename=__file__,
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*i64', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i64', 'r0_numel': 'i64', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=148, cc=100, major=10, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__log_softmax__log_softmax_backward_data__to_copy_nll_loss_backward_nll_loss_forward_1', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 8, 'num_reduction': 1, 'backend_hash': 'E74AF08FEBEF1F0F99888D35475CC9BF2391352C6B703B6DC1A406265B028997', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] )
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] @triton.jit
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] def triton_red_fused__log_softmax__log_softmax_backward_data__to_copy_nll_loss_backward_nll_loss_forward_1(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xnumel = 61440
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     r0_numel = 50257
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     rnumel = r0_numel
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xoffset = tl.program_id(0).to(tl.int64) * XBLOCK
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None].to(tl.int64)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :].to(tl.int64)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     rbase = r0_base
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     x0 = xindex
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp0 = tl.load(in_ptr0 + (x0), None, eviction_policy='evict_last')
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp10 = tl.load(in_ptr1 + (0))
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp11 = tl.broadcast_to(tmp10, [XBLOCK, R0_BLOCK])
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp12 = tl.load(in_ptr2 + (0))
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp13 = tl.broadcast_to(tmp12, [XBLOCK, R0_BLOCK])
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     _tmp18 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         r0_index = r0_offset + r0_base
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         r0_mask = r0_index < r0_numel
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         roffset = r0_offset
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         rindex = r0_index
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         r0_1 = r0_index
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp1 = tl.full([1, 1], -1, tl.int64)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp2 = tmp0 != tmp1
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp3 = tl.full([1, 1], 0, tl.int64)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp4 = tl.where(tmp2, tmp0, tmp3)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp5 = r0_1
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp6 = tmp4 == tmp5
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp7 = -1.0
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp8 = 0.0
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp9 = tl.where(tmp6, tmp7, tmp8)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp14 = (tmp11 / tmp13)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp15 = tl.where(tmp2, tmp14, tmp8)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp16 = tmp9 * tmp15
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp17 = tl.broadcast_to(tmp16, [XBLOCK, R0_BLOCK])
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp19 = _tmp18 + tmp17
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         _tmp18 = tl.where(r0_mask, tmp19, _tmp18)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp18 = tl.sum(_tmp18, 1)[:, None]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp29 = tl.load(in_ptr1 + (0))
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp30 = tl.broadcast_to(tmp29, [XBLOCK, R0_BLOCK])
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp31 = tl.load(in_ptr2 + (0))
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp32 = tl.broadcast_to(tmp31, [XBLOCK, R0_BLOCK])
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp38 = tl.load(in_ptr3 + (x0), None, eviction_policy='evict_last')
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp40 = tl.load(in_ptr4 + (x0), None, eviction_policy='evict_last')
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         r0_index = r0_offset + r0_base
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         r0_mask = r0_index < r0_numel
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         roffset = r0_offset
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         rindex = r0_index
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         r0_1 = r0_index
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp36 = tl.load(in_out_ptr0 + (r0_1 + 50304*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp20 = tl.full([1, 1], -1, tl.int64)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp21 = tmp0 != tmp20
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp22 = tl.full([1, 1], 0, tl.int64)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp23 = tl.where(tmp21, tmp0, tmp22)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp24 = r0_1
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp25 = tmp23 == tmp24
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp26 = -1.0
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp27 = 0.0
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp28 = tl.where(tmp25, tmp26, tmp27)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp33 = (tmp30 / tmp32)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp34 = tl.where(tmp21, tmp33, tmp27)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp35 = tmp28 * tmp34
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp37 = tmp36.to(tl.float32)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp39 = tmp37 - tmp38
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp41 = tmp39 - tmp40
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp42 = tmp41.to(tl.float32)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp43 = tmp42.to(tl.float32)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp44 = tl_math.exp(tmp43)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp45 = tmp44 * tmp18
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp46 = tmp35 - tmp45
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tmp47 = tmp46.to(tl.float32)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         tl.store(in_out_ptr0 + (r0_1 + 50304*x0), tmp47, r0_mask)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] ''', device_str='cuda')
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # kernel path: /tmp/torchinductor_root/t6/ct6xhq5gq2cgp5hkswu325itqdthyt5houpbogw2z6mocmfnhzi5.py
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Topologically Sorted Source Nodes: [], Original ATen: [aten.mm]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Source node to ATen node mapping:
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #    => constant_pad_nd_default_2
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Graph fragment:
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %constant_pad_nd_default_2 : [num_users=1] = call_function[target=torch.ops.aten.constant_pad_nd.default](args = (%permute_1, [0, 0, 0, 7]), kwargs = {})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] triton_poi_fused_mm_2 = async_compile.triton('triton_poi_fused_mm_2', '''
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] import triton
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] import triton.language as tl
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] triton_helpers.set_driver_to_gpu()
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] @triton_heuristics.pointwise(
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     size_hints={'x': 4294967296},
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     filename=__file__,
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i64', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=148, cc=100, major=10, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mm_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'E74AF08FEBEF1F0F99888D35475CC9BF2391352C6B703B6DC1A406265B028997', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     min_elem_per_thread=0
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] )
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] @triton.jit
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] def triton_poi_fused_mm_2(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xnumel = 3088220160
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xoffset = tl.program_id(0).to(tl.int64) * XBLOCK
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:].to(tl.int64)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     x0 = (xindex % 50264)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     x1 = xindex // 50264
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp0 = x0
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp1 = tl.full([1], 50257, tl.int64)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp2 = tmp0 < tmp1
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp3 = tl.load(in_ptr0 + (x0 + 50304*x1), tmp2, other=0.0).to(tl.float32)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tl.store(out_ptr0 + (x0 + 50304*x1), tmp3, None)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] ''', device_str='cuda')
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # kernel path: /tmp/torchinductor_root/yw/cywftjyeclmllme7plpmhde2rtumxi3sawrretbg3ek77lvkrmsv.py
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Topologically Sorted Source Nodes: [], Original ATen: [aten.mm]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Source node to ATen node mapping:
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #    => constant_pad_nd_default
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Graph fragment:
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %constant_pad_nd_default : [num_users=1] = call_function[target=torch.ops.aten.constant_pad_nd.default](args = (%convert_element_type_11, [0, 7, 0, 0]), kwargs = {})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] triton_poi_fused_mm_3 = async_compile.triton('triton_poi_fused_mm_3', '''
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] import triton
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] import triton.language as tl
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] triton_helpers.set_driver_to_gpu()
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] @triton_heuristics.pointwise(
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     size_hints={'x': 524288},
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     filename=__file__,
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     triton_meta={'signature': {'out_ptr0': '*bf16', 'xnumel': 'i64', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=148, cc=100, major=10, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]]}]},
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mm_3', 'mutated_arg_names': ['out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 0, 'num_reduction': 0, 'backend_hash': 'E74AF08FEBEF1F0F99888D35475CC9BF2391352C6B703B6DC1A406265B028997', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     min_elem_per_thread=0
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] )
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] @triton.jit
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] def triton_poi_fused_mm_3(out_ptr0, xnumel, XBLOCK : tl.constexpr):
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xnumel = 430080
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xoffset = tl.program_id(0).to(tl.int64) * XBLOCK
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:].to(tl.int64)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     x0 = (xindex % 7)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     x1 = xindex // 7
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp0 = 0.0
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tl.store(out_ptr0 + (50257 + x0 + 50304*x1), tmp0, None)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] ''', device_str='cuda')
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # kernel path: /tmp/torchinductor_root/4b/c4bwl7j636of4bx37f7h2qgrvfloe732nwq2y7uel2d522jevgrr.py
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Topologically Sorted Source Nodes: [], Original ATen: [aten._to_copy]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Source node to ATen node mapping:
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Graph fragment:
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %convert_element_type_17 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%slice_tensor, torch.float32), kwargs = {})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] triton_poi_fused__to_copy_4 = async_compile.triton('triton_poi_fused__to_copy_4', '''
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] import triton
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] import triton.language as tl
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] triton_helpers.set_driver_to_gpu()
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] @triton_heuristics.pointwise(
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     size_hints={'x': 67108864},
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     filename=__file__,
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=148, cc=100, major=10, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_4', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'E74AF08FEBEF1F0F99888D35475CC9BF2391352C6B703B6DC1A406265B028997', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     min_elem_per_thread=0
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] )
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] @triton.jit
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] def triton_poi_fused__to_copy_4(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xnumel = 38597376
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xmask = xindex < xnumel
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     x0 = xindex
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp1 = tmp0.to(tl.float32)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tl.store(out_ptr0 + (x0), tmp1, xmask)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] ''', device_str='cuda')
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # kernel path: /tmp/torchinductor_root/ld/cldyijt5qjsmvs6wuvjuul7ss34fij7xa5x62v7wnb3nurhmjpn6.py
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Topologically Sorted Source Nodes: [], Original ATen: [aten._to_copy]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Source node to ATen node mapping:
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] # Graph fragment:
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] #   %convert_element_type_16 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_6, torch.float32), kwargs = {})
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] triton_poi_fused__to_copy_5 = async_compile.triton('triton_poi_fused__to_copy_5', '''
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] import triton
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] import triton.language as tl
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] triton_helpers.set_driver_to_gpu()
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] @triton_heuristics.pointwise(
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     size_hints={'x': 67108864},
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     filename=__file__,
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=148, cc=100, major=10, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_5', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'E74AF08FEBEF1F0F99888D35475CC9BF2391352C6B703B6DC1A406265B028997', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     min_elem_per_thread=0
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] )
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] @triton.jit
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] def triton_poi_fused__to_copy_5(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xnumel = 47185920
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     x0 = xindex
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tmp1 = tmp0.to(tl.float32)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tl.store(out_ptr0 + (x0), tmp1, None)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] ''', device_str='cuda')
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] async_compile.wait(globals())
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] del async_compile
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] def call(args):
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     primals_3, view, mm_default_2, amax, log, convert_element_type_7, permute_3, tangents_1 = args
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     args.clear()
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     assert_size_stride(primals_3, (60, 1024), (1024, 1))
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     assert_size_stride(view, (61440, 768), (768, 1))
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     assert_size_stride(mm_default_2, (61440, 50264), (50304, 1))
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     assert_size_stride(amax, (61440, 1), (1, 1))
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     assert_size_stride(log, (61440, 1), (1, 1))
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     assert_size_stride(convert_element_type_7, (), ())
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     assert_size_stride(permute_3, (50257, 768), (768, 1))
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     assert_size_stride(tangents_1, (), ())
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     with torch.cuda._DeviceGuard(0):
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         torch.cuda.set_device(0)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         buf5 = empty_strided_cuda((50264, 768), (768, 1), torch.bfloat16)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         # Topologically Sorted Source Nodes: [], Original ATen: [aten.mm]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         stream0 = get_raw_stream(0)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         triton_poi_fused_mm_0.run(permute_3, buf5, 38602752, stream=stream0)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         del permute_3
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         buf1 = reinterpret_tensor(mm_default_2, (61440, 50257), (50304, 1), 0); del mm_default_2  # reuse
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         # Topologically Sorted Source Nodes: [cross_entropy, scatter, convert_element_type_default, view_5], Original ATen: [aten.nll_loss_backward, aten.nll_loss_forward, aten._log_softmax, aten._to_copy, aten._log_softmax_backward_data]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         stream0 = get_raw_stream(0)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         triton_red_fused__log_softmax__log_softmax_backward_data__to_copy_nll_loss_backward_nll_loss_forward_1.run(buf1, primals_3, tangents_1, convert_element_type_7, amax, log, 61440, 50257, stream=stream0)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         del amax
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         del convert_element_type_7
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         del log
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         del primals_3
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         del tangents_1
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         buf2 = empty_strided_cuda((50264, 61440), (1, 50304), torch.bfloat16)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         # Topologically Sorted Source Nodes: [], Original ATen: [aten.mm]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         stream0 = get_raw_stream(0)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         triton_poi_fused_mm_2.run(buf1, buf2, 3088220160, stream=stream0)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         # Topologically Sorted Source Nodes: [], Original ATen: [aten.mm]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         stream0 = get_raw_stream(0)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         triton_poi_fused_mm_3.run(buf1, 430080, stream=stream0)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         buf3 = empty_strided_cuda((50264, 768), (768, 1), torch.bfloat16)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         # Topologically Sorted Source Nodes: [], Original ATen: [aten.mm]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         extern_kernels.mm(buf2, view, out=buf3)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         del buf2
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         del view
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         buf6 = empty_strided_cuda((61440, 768), (768, 1), torch.bfloat16)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         # Topologically Sorted Source Nodes: [, mm], Original ATen: [aten.mm]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         extern_kernels.mm(reinterpret_tensor(buf1, (61440, 50264), (50304, 1), 0), buf5, out=buf6)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         del buf1
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         del buf5
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         buf8 = empty_strided_cuda((50257, 768), (768, 1), torch.float32)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         # Topologically Sorted Source Nodes: [], Original ATen: [aten._to_copy]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         stream0 = get_raw_stream(0)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         triton_poi_fused__to_copy_4.run(buf3, buf8, 38597376, stream=stream0)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         del buf3
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         buf7 = empty_strided_cuda((60, 1024, 768), (786432, 768, 1), torch.float32)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         # Topologically Sorted Source Nodes: [], Original ATen: [aten._to_copy]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         stream0 = get_raw_stream(0)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         triton_poi_fused__to_copy_5.run(buf6, buf7, 47185920, stream=stream0)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]         del buf6
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     return (buf7, buf8, None, )
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] def benchmark_compiled_module(times=10, repeat=10):
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     from torch._dynamo.testing import rand_strided
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     from torch._inductor.utils import print_performance
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     primals_3 = rand_strided((60, 1024), (1024, 1), device='cuda:0', dtype=torch.int64)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     view = rand_strided((61440, 768), (768, 1), device='cuda:0', dtype=torch.bfloat16)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     mm_default_2 = rand_strided((61440, 50264), (50304, 1), device='cuda:0', dtype=torch.bfloat16)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     amax = rand_strided((61440, 1), (1, 1), device='cuda:0', dtype=torch.float32)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     log = rand_strided((61440, 1), (1, 1), device='cuda:0', dtype=torch.float32)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     convert_element_type_7 = rand_strided((), (), device='cuda:0', dtype=torch.float32)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     permute_3 = rand_strided((50257, 768), (768, 1), device='cuda:0', dtype=torch.bfloat16)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     tangents_1 = rand_strided((), (), device='cuda:0', dtype=torch.float32)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     fn = lambda: call([primals_3, view, mm_default_2, amax, log, convert_element_type_7, permute_3, tangents_1])
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     return print_performance(fn, times=times, repeat=repeat)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code] if __name__ == "__main__":
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     from torch._inductor.wrapper_benchmark import compiled_module_main
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]     compiled_module_main('None', benchmark_compiled_module)
[rank0]:V0911 10:18:41.998000 27347 torch/_inductor/codecache.py:1188] [0/0] [__output_code]
[rank0]:V0911 10:18:42.005000 27347 torch/_inductor/codecache.py:1189] [0/0] [__output_code] Output code written to: /tmp/torchinductor_root/fl/cflma6p5e72qss2pb4zms4ed2cfcttdlkmklhjhb7yj5gouhmcrl.py
[rank0]:W0911 10:18:42.013000 27347 torch/_inductor/debug.py:449] [0/0] model__13_backward_42 debug trace: /data/shitals/devbox/GitHubSrc/nanugpt/torch_compile_debug/run_2025_09_11_10_18_04_782379-pid_27347/torchinductor/model__13_backward_42.14
warning: Cuda Driver error detected: Failed to allocate physical memory
warning: Cuda Driver error detected: Returning 1 (CUDA_ERROR_INVALID_VALUE) from cuMemHostAlloc
[Switching to Thread 0x7fff323ff6c0 (LWP 27593)]
Cuda Runtime API error detected: cudaHostAlloc returned cudaErrorInvalidValue(CUresult=1): invalid argument
No results found