kiya00/Llama-3.1-8B

## Llama-3.1-8B

root@4316cdb72fd3:/app/tensorrt_llm/TensorRT-LLM# trtllm-bench --model $MODEL_ID throughput --dataset /tmp/synthetic_128_128.txt --backend _autodeploy
2025-06-13 11:47:57,126 - INFO - flashinfer.jit: Prebuilt kernels not found, using JIT backend
[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc1
[06/13/2025-11:47:57] [TRT-LLM] [I] Preparing to run throughput benchmark...
Parse safetensors files:   0%|                                                                                     | 0/4 [00:00<?, ?it/Parse safetensors files:  25%|███████████████████▎                                                         | 1/4 [00:00<00:00,  7.04it/Parse safetensors files: 100%|█████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 28.14it/s]
[06/13/2025-11:47:58] [TRT-LLM] [I]
===========================================================
= DATASET DETAILS
===========================================================
Dataset Path:         /tmp/synthetic_128_128.txt
Number of Sequences:  3000

-- Percentiles statistics ---------------------------------

        Input              Output           Seq. Length
-----------------------------------------------------------
MIN:   128.0000           128.0000           256.0000
MAX:   128.0000           128.0000           256.0000
AVG:   128.0000           128.0000           256.0000
P50:   128.0000           128.0000           256.0000
P90:   128.0000           128.0000           256.0000
P95:   128.0000           128.0000           256.0000
P99:   128.0000           128.0000           256.0000
===========================================================

Parse safetensors files:   0%|                                                                                     | 0/4 [00:00<?, ?it/Parse safetensors files:  25%|███████████████████▎                                                         | 1/4 [00:00<00:00,  7.78it/Parse safetensors files:  50%|██████████████████████████████████████▌                                      | 2/4 [00:00<00:00,  5.14it/Parse safetensors files: 100%|█████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 10.83it/s]
[06/13/2025-11:47:59] [TRT-LLM] [I] Validating KV Cache config against kv_cache_dtype="auto"
[06/13/2025-11:47:59] [TRT-LLM] [I] KV cache quantization set to "auto". Using checkpoint KV quantization.
[06/13/2025-11:47:59] [TRT-LLM] [I] Estimated engine size: 14.96 GB
[06/13/2025-11:47:59] [TRT-LLM] [I] Estimated total available memory for KV cache: 64.69 GB
[06/13/2025-11:47:59] [TRT-LLM] [I] Estimated total KV cache memory: 61.46 GB
[06/13/2025-11:47:59] [TRT-LLM] [I] Estimated max number of requests in KV cache memory: 1966.57
[06/13/2025-11:47:59] [TRT-LLM] [I] Estimated max batch size (after fine-tune): 2048
[06/13/2025-11:47:59] [TRT-LLM] [I] Estimated max num tokens (after fine-tune): 4096
[06/13/2025-11:47:59] [TRT-LLM] [I] Max batch size and max num tokens not provided. Using heuristics or pre-defined settings: max_batch_size=2048, max_num_tokens=4096.
[06/13/2025-11:47:59] [TRT-LLM] [I] Setting PyTorch max sequence length to 256
[06/13/2025-11:47:59] [TRT-LLM] [I] Setting up throughput benchmark.
[06/13/2025-11:47:59] [TRT-LLM] [W] Using default gpus_per_node: 8
[06/13/2025-11:47:59] [TRT-LLM] [I] Set nccl_plugin to None.
[06/13/2025-11:48:00] [TRT-LLM] [I] model='meta-llama/Llama-3.1-8B' tokenizer=None tokenizer_mode='auto' skip_tokenizer_init=True trust_remote_code=True tensor_parallel_size=1 dtype='auto' revision=None tokenizer_revision=None pipeline_parallel_size=1 context_parallel_size=1 gpus_per_node=8 moe_cluster_parallel_size=-1 moe_tensor_parallel_size=-1 moe_expert_parallel_size=-1 enable_attention_dp=False cp_config={} load_format=<LoadFormat.AUTO: 0> enable_lora=False max_lora_rank=None max_loras=4 max_cpu_loras=4 lora_config=None enable_prompt_adapter=False max_prompt_adapter_token=0 quant_config=QuantConfig(quant_algo=None, kv_cache_quant_algo=None, group_size=128, smoothquant_val=0.5, clamp_val=None, use_meta_recipe=False, has_zero_point=False, pre_quant_scale=False, exclude_modules=None) kv_cache_config=KvCacheConfig(enable_block_reuse=False, max_tokens=None, max_attention_window=None, sink_token_length=None, free_gpu_memory_fraction=0.9, host_cache_size=None, onboard_blocks=True, cross_kv_cache_fraction=None, secondary_offload_min_priority=None, event_buffer_max_size=0, enable_partial_reuse=True, copy_on_partial_reuse=True) enable_chunked_prefill=False guided_decoding_backend=None batched_logits_processor=None iter_stats_max_iterations=None request_stats_max_iterations=None peft_cache_config=None scheduler_config=SchedulerConfig(capacity_scheduler_policy=<CapacitySchedulerPolicy.GUARANTEED_NO_EVICT: 'GUARANTEED_NO_EVICT'>, context_chunking_policy=None, dynamic_batch_config=DynamicBatchConfig(enable_batch_size_tuning=True, enable_max_num_tokens_tuning=False, dynamic_batch_moving_average_window=128)) cache_transceiver_config=None speculative_config=None batching_type=<BatchingType.INFLIGHT: 'INFLIGHT'> normalize_log_probs=False max_batch_size=2048 max_input_len=1024 max_seq_len=256 max_beam_width=1 max_num_tokens=4096 backend='_autodeploy' gather_generation_logits=False num_postprocess_workers=0 postprocess_tokenizer_dir=None reasoning_parser=None decoding_config=None mpi_session=None build_config=BuildConfig(max_input_len=1024, max_seq_len=None, opt_batch_size=8, max_batch_size=2048, max_beam_width=1, max_num_tokens=4096, opt_num_tokens=None, max_prompt_embedding_table_size=0, kv_cache_type=None, gather_context_logits=False, gather_generation_logits=False, strongly_typed=True, force_num_profiles=None, profiling_verbosity='layer_names_only', enable_debug_output=False, max_draft_len=0, speculative_decoding_mode=<SpeculativeDecodingMode.NONE: 1>, use_refit=False, input_timing_cache=None, output_timing_cache='model.cache', lora_config=LoraConfig(lora_dir=[], lora_ckpt_source='hf', max_lora_rank=64, lora_target_modules=[], trtllm_modules_to_hf_modules={}, max_loras=4, max_cpu_loras=4), auto_parallel_config=AutoParallelConfig(world_size=1, gpus_per_node=8, cluster_key=None, cluster_info=None, sharding_cost_model=<CostModel.ALPHA_BETA: 'alpha_beta'>, comm_cost_model=<CostModel.ALPHA_BETA: 'alpha_beta'>, enable_pipeline_parallelism=False, enable_shard_unbalanced_shape=False, enable_shard_dynamic_shape=False, enable_reduce_scatter=True, builder_flags=None, debug_mode=False, infer_shape=True, validation_mode=False, same_buffer_io={}, same_spec_io={}, sharded_io_allowlist=[], fill_weights=False, parallel_config_cache=None, profile_cache=None, dump_path=None, debug_outputs=[]), weight_sparsity=False, weight_streaming=False, plugin_config=PluginConfig(_dtype='float16', _bert_attention_plugin='auto', _gpt_attention_plugin='auto', _gemm_plugin=None, _explicitly_disable_gemm_plugin=False, _gemm_swiglu_plugin=None, _fp8_rowwise_gemm_plugin=None, _qserve_gemm_plugin=None, _identity_plugin=None, _nccl_plugin=None, _lora_plugin=None, _dora_plugin=False, _weight_only_groupwise_quant_matmul_plugin=None, _weight_only_quant_matmul_plugin=None, _smooth_quant_plugins=True, _smooth_quant_gemm_plugin=None, _layernorm_quantization_plugin=None, _rmsnorm_quantization_plugin=None, _quantize_per_token_plugin=False, _quantize_tensor_plugin=False, _moe_plugin='auto', _mamba_conv1d_plugin='auto', _low_latency_gemm_plugin=None, _low_latency_gemm_swiglu_plugin=None, _gemm_allreduce_plugin=None, _context_fmha=True, _bert_context_fmha_fp32_acc=False, _paged_kv_cache=None, _remove_input_padding=True, _norm_quant_fusion=False, _reduce_fusion=False, _user_buffer=False, _tokens_per_block=32, _use_paged_context_fmha=True, _use_fp8_context_fmha=True, _fuse_fp4_quant=False, _multiple_profiles=False, _paged_state=True, _streamingllm=False, _manage_weights=False, _use_fused_mlp=True, _pp_reduce_scatter=False), use_strip_plan=False, max_encoder_input_len=1024, dry_run=False, visualize_network=None, monitor_memory=False, use_mrope=False) use_cuda_graph=True cuda_graph_batch_sizes=[1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 256, 512, 1024, 2048] cuda_graph_max_batch_size=2048 cuda_graph_padding_enabled=True disable_overlap_scheduler=False moe_max_num_tokens=None moe_load_balancer=None attn_backend='FlashInfer' moe_backend='CUTLASS' mixed_sampler=False enable_trtllm_sampler=False kv_cache_dtype='auto' use_kv_cache=True enable_iter_perf_stats=False enable_iter_req_stats=False print_iter_log=False torch_compile_enabled=True torch_compile_fullgraph=True torch_compile_inductor_enabled=False torch_compile_piecewise_cuda_graph=False torch_compile_enable_userbuffers=True autotuner_enabled=True enable_layerwise_nvtx_marker=False auto_deploy_config=None enable_min_latency=False model_factory='AutoModelForCausalLM' model_kwargs={'max_position_embeddings': 256} mla_backend='MultiHeadLatentAttention' skip_loading_weights=False free_mem_ratio=0.8 simple_shard_only=False attn_page_size=64 checkpoint_device=None extended_runtime_perf_knob_config=ExtendedRuntimePerfKnobConfig(multi_block_mode=True, enable_context_fmha_fp32_acc=False, cuda_graph_mode=True, cuda_graph_cache_size=1000) parallel_config=_ParallelConfig(tp_size=1, pp_size=1, cp_size=1, gpus_per_node=8, moe_cluster_size=-1, moe_tp_size=-1, moe_ep_size=-1, cp_config={}, enable_attention_dp=False, auto_parallel=False, _world_size=1, _devices=None) model_format=<_ModelFormatKind.HF: 0> speculative_model=None
rank 0 using MpiPoolSession to spawn MPI processes
[06/13/2025-11:48:00] [TRT-LLM] [I] Generating a new HMAC key for server proxy_request_queue
[06/13/2025-11:48:00] [TRT-LLM] [I] Generating a new HMAC key for server worker_init_status_queue
[06/13/2025-11:48:00] [TRT-LLM] [I] Generating a new HMAC key for server proxy_result_queue
[06/13/2025-11:48:00] [TRT-LLM] [I] Generating a new HMAC key for server proxy_stats_queue
[06/13/2025-11:48:00] [TRT-LLM] [I] Generating a new HMAC key for server proxy_kv_cache_events_queue
2025-06-13 11:48:08,993 - INFO - flashinfer.jit: Prebuilt kernels not found, using JIT backend
[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc1
[TensorRT-LLM][INFO] Refreshed the MPI local session
[06/13/2025-11:48:09] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Initializing for: lib='OMPI', local_rank=0, world_size=1, port=57435
[06/13/2025-11:48:09] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] max_seq_len=256, max_batch_size=2048, attn_page_size=64, max_num_tokens=4096
/root/.cache/huggingface/hub/models--meta-llama--Llama-3.1-8B/snapshots/d04e592bb4f6aa9cfee91e2e20afa771667e1d4b
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
[06/13/2025-11:48:16] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] No quantization to do.
[06/13/2025-11:48:16] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 0 MoE Patterns
[06/13/2025-11:48:16] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 64 repeat_kv patterns
[06/13/2025-11:48:16] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 0 eager attention patterns
[06/13/2025-11:48:16] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 32 grouped attention patterns
[06/13/2025-11:48:16] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 32 causal mask attention patterns
[06/13/2025-11:48:16] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found and matched 32 attention layouts
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
[06/13/2025-11:48:17] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found and matched 32 RoPE patterns
[06/13/2025-11:48:17] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Match RoPE layout to bsnd
[06/13/2025-11:48:17] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 32 RoPE layout matches
[06/13/2025-11:48:17] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found and eliminated 192 redundant transpose pairs
[06/13/2025-11:48:17] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 32 RoPE optimizations
[06/13/2025-11:48:17] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Skipping sharding for single device
[06/13/2025-11:48:17] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Skipping sharding for single device
[06/13/2025-11:48:17] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Skipping sharding for single device
[06/13/2025-11:48:19] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Loading and initializing weights.
[06/13/2025-11:48:24] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 0 allreduce+residual+rmsnorm fusions
[06/13/2025-11:48:24] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 0 GEMM+Collective fusions
[06/13/2025-11:48:24] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Starting RMSNorm pattern matching with backend: triton
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
[06/13/2025-11:48:25] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] RMSNorm pattern count: 65
[06/13/2025-11:48:25] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 2 input nodes and 1 output nodes
[06/13/2025-11:48:25] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Added 4 new input nodes for cached attention metadata
[06/13/2025-11:48:25] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Replaced 32 attention.bsnd_grouped_sdpa ops with attention.flashinfer_mha_with_cache
[06/13/2025-11:48:25] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Initialized 65 caches for cached attention
[06/13/2025-11:48:25] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Free memory ratio: 0.8
[06/13/2025-11:48:25] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Free memory (MB): 59648 , Total memory (MB): 80994
[06/13/2025-11:48:25] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Current cache size (MB): 512, Current num pages: 64
[06/13/2025-11:48:25] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Free memory before forward pass (MB): 59648
2025-06-13 11:48:26,720 - INFO - flashinfer.jit: Loading JIT ops: rope
2025-06-13 11:48:26,732 - INFO - flashinfer.jit: Finished loading JIT ops: rope
2025-06-13 11:48:26,733 - INFO - flashinfer.jit: Loading JIT ops: page
2025-06-13 11:48:26,742 - INFO - flashinfer.jit: Finished loading JIT ops: page
2025-06-13 11:48:26,748 - INFO - flashinfer.jit: Loading JIT ops: batch_prefill_with_kv_cache_dtype_q_bf16_dtype_kv_bf16_dtype_o_bf16_dtype_idx_i32_head_dim_qk_128_head_dim_vo_128_posenc_0_use_swa_False_use_logits_cap_False_f16qk_False
2025-06-13 11:48:26,758 - INFO - flashinfer.jit: Finished loading JIT ops: batch_prefill_with_kv_cache_dtype_q_bf16_dtype_kv_bf16_dtype_o_bf16_dtype_idx_i32_head_dim_qk_128_head_dim_vo_128_posenc_0_use_swa_False_use_logits_cap_False_f16qk_False
[06/13/2025-11:48:26] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Free memory after forward pass (MB): 59578
[06/13/2025-11:48:26] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Memory for forward pass (MB): 70
[06/13/2025-11:48:28] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] After all_gather - new_num_pages: 6021
[06/13/2025-11:48:28] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Fusion before compiling...
[06/13/2025-11:48:28] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Compiling for torch-opt backend...
[06/13/2025-11:48:32] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 1
[06/13/2025-11:48:35] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 2
[06/13/2025-11:48:39] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 4
[06/13/2025-11:48:43] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 8
[06/13/2025-11:48:47] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 16
[06/13/2025-11:48:50] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 24
[06/13/2025-11:48:54] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 32
[06/13/2025-11:48:58] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 40
[rank0]:W0613 11:48:58.348000 29519 torch/_dynamo/convert_frame.py:961] [0/8] torch._dynamo hit config.recompile_limit (8)
[rank0]:W0613 11:48:58.348000 29519 torch/_dynamo/convert_frame.py:961] [0/8]    function: 'forward' (<eval_with_key>.1480:4)
[rank0]:W0613 11:48:58.348000 29519 torch/_dynamo/convert_frame.py:961] [0/8]    last reason: 0/7: tensor 'L['input_ids']' size mismatch at index 0. expected 32, actual 40
[rank0]:W0613 11:48:58.348000 29519 torch/_dynamo/convert_frame.py:961] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
[rank0]:W0613 11:48:58.348000 29519 torch/_dynamo/convert_frame.py:961] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.
[06/13/2025-11:48:58] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 48
[06/13/2025-11:48:59] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 56
[06/13/2025-11:48:59] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 64
[06/13/2025-11:48:59] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 72
[06/13/2025-11:49:00] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 80
[06/13/2025-11:49:00] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 88
[06/13/2025-11:49:00] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 96
[06/13/2025-11:49:01] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 104
[06/13/2025-11:49:01] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 112
[06/13/2025-11:49:02] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 120
[06/13/2025-11:49:02] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 128
[06/13/2025-11:49:02] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 256
[06/13/2025-11:49:03] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 512
[06/13/2025-11:49:03] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 1024
[06/13/2025-11:49:03] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 2048
[06/13/2025-11:49:04] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Compile time with backend torch-opt: 36.402899 seconds
[06/13/2025-11:49:04] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Using fake cache manager with head_dim=0 and num pages: 6021
[TensorRT-LLM][INFO] Max KV cache pages per sequence: 4 [window size=256]
[TensorRT-LLM][INFO] Number of tokens per block: 64.
[TensorRT-LLM][INFO] [MemUsageChange] Allocated 0.00 GiB for max tokens in paged KV cache (385344).
[06/13/2025-11:49:05] [TRT-LLM] [I] Setting up for warmup...
[06/13/2025-11:49:05] [TRT-LLM] [I] Running warmup.
[06/13/2025-11:49:05] [TRT-LLM] [I] Starting benchmarking async task.
[06/13/2025-11:49:05] [TRT-LLM] [I] Starting benchmark...
[06/13/2025-11:49:05] [TRT-LLM] [I] Request submission complete. [count=2, time=0.0000s, rate=156727.53 req/s]
[06/13/2025-11:49:07] [TRT-LLM] [I] Benchmark complete.
[06/13/2025-11:49:07] [TRT-LLM] [I] Stopping LLM backend.
[06/13/2025-11:49:07] [TRT-LLM] [I] Cancelling all 0 tasks to complete.
[06/13/2025-11:49:07] [TRT-LLM] [I] All tasks cancelled.
[06/13/2025-11:49:07] [TRT-LLM] [I] LLM Backend stopped.
[06/13/2025-11:49:07] [TRT-LLM] [I] Worker task cancelled.
[06/13/2025-11:49:07] [TRT-LLM] [I] Warmup done.
[06/13/2025-11:49:07] [TRT-LLM] [I] No log path provided, skipping logging.
[06/13/2025-11:49:07] [TRT-LLM] [I] Starting benchmarking async task.
[06/13/2025-11:49:07] [TRT-LLM] [I] Starting benchmark...
[06/13/2025-11:49:07] [TRT-LLM] [I] Request submission complete. [count=3000, time=0.0014s, rate=2153096.66 req/s]
Traceback (most recent call last):
  File "/app/tensorrt_llm/TensorRT-LLM/tensorrt_llm/_torch/pyexecutor/py_executor.py", line 1612, in _forward_step
    outputs = forward(scheduled_requests, self.resource_manager,
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/nvtx/nvtx.py", line 122, in inner
    result = func(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^
  File "/app/tensorrt_llm/TensorRT-LLM/tensorrt_llm/_torch/pyexecutor/py_executor.py", line 1602, in forward
    return self.model_engine.forward(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/app/tensorrt_llm/TensorRT-LLM/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py", line 251, in forward
    last_logit_only = self._prepare_inputs(scheduled_requests, resource_manager, new_tokens)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/nvtx/nvtx.py", line 122, in inner
    result = func(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^
  File "/app/tensorrt_llm/TensorRT-LLM/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py", line 205, in _prepare_inputs
    new_tokens_list = new_tokens.cpu().tolist() if new_tokens is not None else None
                      ^^^^^^^^^^^^^^^^
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

[06/13/2025-11:49:17] [TRT-LLM] [E] Encountered an error in forward function: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Traceback (most recent call last):
  File "/app/tensorrt_llm/TensorRT-LLM/tensorrt_llm/_torch/pyexecutor/py_executor.py", line 1686, in _update_requests
    self.sampler.update_requests(sample_state)
  File "/app/tensorrt_llm/TensorRT-LLM/tensorrt_llm/_torch/pyexecutor/sampler.py", line 241, in update_requests
    state.sampler_event.synchronize()
  File "/usr/local/lib/python3.12/dist-packages/torch/cuda/streams.py", line 227, in synchronize
    super().synchronize()
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

[06/13/2025-11:49:17] [TRT-LLM] [E] Encountered an error in sampling: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

terminate called after throwing an instance of 'c10::Error'
  what():  CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Exception raised from c10_cuda_check_implementation at /opt/pytorch/pytorch/c10/cuda/CUDAException.cpp:43 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x98 (0x7f8903d8b5e8 in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0xe0 (0x7f8903d204a2 in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so)
frame #2: c10::cuda::c10_cuda_check_implementation(int, char const*, char const*, int, bool) + 0x3c2 (0x7f890ea1b2a2 in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10_cuda.so)
frame #3: <unknown function> + 0xb7d311 (0x7f889f923311 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0xb794eb (0x7f889f91f4eb in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so)
frame #5: <unknown function> + 0xb80c04 (0x7f889f926c04 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so)
frame #6: <unknown function> + 0x44c162 (0x7f8902c57162 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so)
frame #7: c10::TensorImpl::~TensorImpl() + 0x9 (0x7f8903d65f39 in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so)
frame #8: <unknown function> + 0x703468 (0x7f8902f0e468 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so)
frame #9: <unknown function> + 0x703890 (0x7f8902f0e890 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so)
frame #10: /usr/bin/python() [0x579cf2]
frame #11: /usr/bin/python() [0x59f0b9]
frame #12: /usr/bin/python() [0x579d52]
frame #13: /usr/bin/python() [0x59f0b9]
frame #14: /usr/bin/python() [0x579cf2]
frame #15: /usr/bin/python() [0x59f0b9]
frame #16: _PyEval_EvalFrameDefault + 0x681d (0x5dd15d in /usr/bin/python)
frame #17: /usr/bin/python() [0x54cd32]
frame #18: _PyEval_EvalFrameDefault + 0x4c1b (0x5db55b in /usr/bin/python)
frame #19: /usr/bin/python() [0x54cd32]
frame #20: /usr/bin/python() [0x6f826c]
frame #21: /usr/bin/python() [0x6b917c]
frame #22: <unknown function> + 0x9caa4 (0x7f8b86174aa4 in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #23: __clone + 0x44 (0x7f8b86201a34 in /usr/lib/x86_64-linux-gnu/libc.so.6)

[4316cdb72fd3:29519] *** Process received signal ***
[4316cdb72fd3:29519] Signal: Aborted (6)
[4316cdb72fd3:29519] Signal code:  (-6)
[4316cdb72fd3:29519] [ 0] /usr/lib/x86_64-linux-gnu/libc.so.6(+0x45330)[0x7f8b8611d330]
[4316cdb72fd3:29519] [ 1] /usr/lib/x86_64-linux-gnu/libc.so.6(pthread_kill+0x11c)[0x7f8b86176b2c]
[4316cdb72fd3:29519] [ 2] /usr/lib/x86_64-linux-gnu/libc.so.6(gsignal+0x1e)[0x7f8b8611d27e]
[4316cdb72fd3:29519] [ 3] /usr/lib/x86_64-linux-gnu/libc.so.6(abort+0xdf)[0x7f8b861008ff]
[4316cdb72fd3:29519] [ 4] /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xa5ff5)[0x7f89156a8ff5]
[4316cdb72fd3:29519] [ 5] /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xbb0da)[0x7f89156be0da]
[4316cdb72fd3:29519] [ 6] /usr/lib/x86_64-linux-gnu/libstdc++.so.6(__cxa_call_terminate+0x33)[0x7f89156a88e6]
[4316cdb72fd3:29519] [ 7] /usr/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x31a)[0x7f89156bd8ba]
[4316cdb72fd3:29519] [ 8] /usr/lib/x86_64-linux-gnu/libgcc_s.so.1(+0x22b06)[0x7f89483fcb06]
[4316cdb72fd3:29519] [ 9] /usr/lib/x86_64-linux-gnu/libgcc_s.so.1(_Unwind_Resume+0x12d)[0x7f89483fd5cd]
[4316cdb72fd3:29519] [10] /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so(+0xb810b8)[0x7f889f9270b8]
[4316cdb72fd3:29519] [11] /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so(+0x44c162)[0x7f8902c57162]
[4316cdb72fd3:29519] [12] /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so(_ZN3c1010TensorImplD0Ev+0x9)[0x7f8903d65f39]
[4316cdb72fd3:29519] [13] /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so(+0x703468)[0x7f8902f0e468]
[4316cdb72fd3:29519] [14] /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so(+0x703890)[0x7f8902f0e890]
[4316cdb72fd3:29519] [15] /usr/bin/python[0x579cf2]
[4316cdb72fd3:29519] [16] /usr/bin/python[0x59f0b9]
[4316cdb72fd3:29519] [17] /usr/bin/python[0x579d52]
[4316cdb72fd3:29519] [18] /usr/bin/python[0x59f0b9]
[4316cdb72fd3:29519] [19] /usr/bin/python[0x579cf2]
[4316cdb72fd3:29519] [20] /usr/bin/python[0x59f0b9]
[4316cdb72fd3:29519] [21] /usr/bin/python(_PyEval_EvalFrameDefault+0x681d)[0x5dd15d]
[4316cdb72fd3:29519] [22] /usr/bin/python[0x54cd32]
[4316cdb72fd3:29519] [23] /usr/bin/python(_PyEval_EvalFrameDefault+0x4c1b)[0x5db55b]
[4316cdb72fd3:29519] [24] /usr/bin/python[0x54cd32]
[4316cdb72fd3:29519] [25] /usr/bin/python[0x6f826c]
[4316cdb72fd3:29519] [26] /usr/bin/python[0x6b917c]
[4316cdb72fd3:29519] [27] /usr/lib/x86_64-linux-gnu/libc.so.6(+0x9caa4)[0x7f8b86174aa4]
[4316cdb72fd3:29519] [28] /usr/lib/x86_64-linux-gnu/libc.so.6(__clone+0x44)[0x7f8b86201a34]
[4316cdb72fd3:29519] *** End of error message ***
--------------------------------------------------------------------------
Child job 2 terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
^C[06/13/2025-11:57:08] [TRT-LLM] [I] Stopping LLM backend.
[06/13/2025-11:57:08] [TRT-LLM] [I] Cancelling all 3000 tasks to complete.
[06/13/2025-11:57:08] [TRT-LLM] [I] All tasks cancelled.
[06/13/2025-11:57:08] [TRT-LLM] [I] LLM Backend stopped.
[06/13/2025-11:57:09] [TRT-LLM] [I] Worker task cancelled.
[06/13/2025-11:57:09] [TRT-LLM] [I] Benchmark done. Reporting results...
^C
Aborted!
^C^CException ignored in: <function LLM.__del__ at 0x7f79898c8ea0>
Traceback (most recent call last):
  File "/app/tensorrt_llm/TensorRT-LLM/tensorrt_llm/llmapi/llm.py", line 792, in __del__
    self.shutdown()
  File "/app/tensorrt_llm/TensorRT-LLM/tensorrt_llm/llmapi/llm.py", line 766, in shutdown
    self._executor.shutdown()
  File "/app/tensorrt_llm/TensorRT-LLM/tensorrt_llm/executor/proxy.py", line 364, in shutdown
    self.dispatch_result_thread.join()
  File "/usr/lib/python3.12/threading.py", line 1147, in join
    self._wait_for_tstate_lock()
  File "/usr/lib/python3.12/threading.py", line 1167, in _wait_for_tstate_lock
    if lock.acquire(block, timeout):
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt:
^CException ignored in: <module 'threading' from '/usr/lib/python3.12/threading.py'>
Traceback (most recent call last):
  File "/usr/lib/python3.12/threading.py", line 1592, in _shutdown
    atexit_call()
  File "/usr/local/lib/python3.12/dist-packages/mpi4py/futures/_lib.py", line 121, in join_threads
    thread.join()
  File "/usr/lib/python3.12/threading.py", line 1147, in join
    self._wait_for_tstate_lock()
  File "/usr/lib/python3.12/threading.py", line 1167, in _wait_for_tstate_lock
    if lock.acquire(block, timeout):
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt:
^C^CException ignored in atexit callback: <bound method GenerationExecutorProxy.shutdown of <tensorrt_llm.executor.proxy.GenerationExecutorProxy object at 0x7f7980cd1a00>>
Traceback (most recent call last):
  File "/app/tensorrt_llm/TensorRT-LLM/tensorrt_llm/executor/proxy.py", line 368, in shutdown
    self.dispatch_stats_thread.join()
  File "/usr/lib/python3.12/threading.py", line 1147, in join
    self._wait_for_tstate_lock()
  File "/usr/lib/python3.12/threading.py", line 1167, in _wait_for_tstate_lock
    if lock.acquire(block, timeout):
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt:
--------------------------------------------------------------------------
(null) noticed that process rank 0 with PID 0 on node 4316cdb72fd3 exited on signal 6 (Aborted).
No results found