root@4316cdb72fd3:/app/tensorrt_llm/TensorRT-LLM# trtllm-bench --model $MODEL_ID throughput --dataset /tmp/synthetic_128_128.txt --backend _autodeploy
2025-06-13 11:47:57,126 - INFO - flashinfer.jit: Prebuilt kernels not found, using JIT backend
[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc1
[06/13/2025-11:47:57] [TRT-LLM] [I] Preparing to run throughput benchmark...
Parse safetensors files: 100%|██████████| 4/4 [00:00<00:00, 28.14it/s]
[06/13/2025-11:47:58] [TRT-LLM] [I]
===========================================================
= DATASET DETAILS
===========================================================
Dataset Path:         /tmp/synthetic_128_128.txt
Number of Sequences:  3000
-- Percentiles statistics ---------------------------------
        Input       Output      Seq. Length
-----------------------------------------------------------
MIN:   128.0000    128.0000     256.0000
MAX:   128.0000    128.0000     256.0000
AVG:   128.0000    128.0000     256.0000
P50:   128.0000    128.0000     256.0000
P90:   128.0000    128.0000     256.0000
P95:   128.0000    128.0000     256.0000
P99:   128.0000    128.0000     256.0000
===========================================================
Parse safetensors files: 100%|██████████| 4/4 [00:00<00:00, 10.83it/s]
[06/13/2025-11:47:59] [TRT-LLM] [I] Validating KV Cache config against kv_cache_dtype="auto"
[06/13/2025-11:47:59] [TRT-LLM] [I] KV cache quantization set to "auto". Using checkpoint KV quantization.
[06/13/2025-11:47:59] [TRT-LLM] [I] Estimated engine size: 14.96 GB
[06/13/2025-11:47:59] [TRT-LLM] [I] Estimated total available memory for KV cache: 64.69 GB
[06/13/2025-11:47:59] [TRT-LLM] [I] Estimated total KV cache memory: 61.46 GB
[06/13/2025-11:47:59] [TRT-LLM] [I] Estimated max number of requests in KV cache memory: 1966.57
[06/13/2025-11:47:59] [TRT-LLM] [I] Estimated max batch size (after fine-tune): 2048
[06/13/2025-11:47:59] [TRT-LLM] [I] Estimated max num tokens (after fine-tune): 4096
[06/13/2025-11:47:59] [TRT-LLM] [I] Max batch size and max num tokens not provided. Using heuristics or pre-defined settings: max_batch_size=2048, max_num_tokens=4096.
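The 1966.57-request estimate above can be reproduced from Llama-3.1-8B's attention geometry. A back-of-the-envelope sketch, assuming bf16 KV entries and the model's published config (32 layers, 8 KV heads, head dim 128); the estimator's exact formula is an assumption, only the logged numbers are taken from this run:

```python
# Sketch: reproduce "max number of requests in KV cache memory" above.
# Assumes bf16 KV entries (2 bytes) and Llama-3.1-8B's published config.
num_layers, num_kv_heads, head_dim, dtype_bytes = 32, 8, 128, 2

# K and V for one token across all layers: 128 KiB/token.
bytes_per_token = 2 * num_layers * num_kv_heads * head_dim * dtype_bytes

max_seq_len = 256  # 128 input + 128 output tokens per request
bytes_per_request = bytes_per_token * max_seq_len  # 32 MiB/request

kv_cache_gib = 61.46  # "Estimated total KV cache memory" above
print(kv_cache_gib * 1024**3 / bytes_per_request)  # ~1966.7 requests
```

The logged 1966.57 differs only in the last digits because 61.46 GB is itself a rounded figure.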
[06/13/2025-11:47:59] [TRT-LLM] [I] Setting PyTorch max sequence length to 256
[06/13/2025-11:47:59] [TRT-LLM] [I] Setting up throughput benchmark.
[06/13/2025-11:47:59] [TRT-LLM] [W] Using default gpus_per_node: 8
[06/13/2025-11:47:59] [TRT-LLM] [I] Set nccl_plugin to None.
[06/13/2025-11:48:00] [TRT-LLM] [I] model='meta-llama/Llama-3.1-8B' tokenizer=None tokenizer_mode='auto' skip_tokenizer_init=True trust_remote_code=True tensor_parallel_size=1 dtype='auto' revision=None tokenizer_revision=None pipeline_parallel_size=1 context_parallel_size=1 gpus_per_node=8 moe_cluster_parallel_size=-1 moe_tensor_parallel_size=-1 moe_expert_parallel_size=-1 enable_attention_dp=False cp_config={} load_format=<LoadFormat.AUTO: 0> enable_lora=False max_lora_rank=None max_loras=4 max_cpu_loras=4 lora_config=None enable_prompt_adapter=False max_prompt_adapter_token=0 quant_config=QuantConfig(quant_algo=None, kv_cache_quant_algo=None, group_size=128, smoothquant_val=0.5, clamp_val=None, use_meta_recipe=False, has_zero_point=False, pre_quant_scale=False, exclude_modules=None) kv_cache_config=KvCacheConfig(enable_block_reuse=False, max_tokens=None, max_attention_window=None, sink_token_length=None, free_gpu_memory_fraction=0.9, host_cache_size=None, onboard_blocks=True, cross_kv_cache_fraction=None, secondary_offload_min_priority=None, event_buffer_max_size=0, enable_partial_reuse=True, copy_on_partial_reuse=True) enable_chunked_prefill=False guided_decoding_backend=None batched_logits_processor=None iter_stats_max_iterations=None request_stats_max_iterations=None peft_cache_config=None scheduler_config=SchedulerConfig(capacity_scheduler_policy=<CapacitySchedulerPolicy.GUARANTEED_NO_EVICT: 'GUARANTEED_NO_EVICT'>, context_chunking_policy=None, dynamic_batch_config=DynamicBatchConfig(enable_batch_size_tuning=True, enable_max_num_tokens_tuning=False, dynamic_batch_moving_average_window=128)) cache_transceiver_config=None speculative_config=None batching_type=<BatchingType.INFLIGHT: 'INFLIGHT'> normalize_log_probs=False max_batch_size=2048 max_input_len=1024 max_seq_len=256 max_beam_width=1 max_num_tokens=4096 backend='_autodeploy' gather_generation_logits=False num_postprocess_workers=0 postprocess_tokenizer_dir=None reasoning_parser=None decoding_config=None mpi_session=None build_config=BuildConfig(max_input_len=1024, max_seq_len=None, opt_batch_size=8, max_batch_size=2048, max_beam_width=1, max_num_tokens=4096, opt_num_tokens=None, max_prompt_embedding_table_size=0, kv_cache_type=None, gather_context_logits=False, gather_generation_logits=False, strongly_typed=True, force_num_profiles=None, profiling_verbosity='layer_names_only', enable_debug_output=False, max_draft_len=0, speculative_decoding_mode=<SpeculativeDecodingMode.NONE: 1>, use_refit=False, input_timing_cache=None, output_timing_cache='model.cache', lora_config=LoraConfig(lora_dir=[], lora_ckpt_source='hf', max_lora_rank=64, lora_target_modules=[], trtllm_modules_to_hf_modules={}, max_loras=4, max_cpu_loras=4), auto_parallel_config=AutoParallelConfig(world_size=1, gpus_per_node=8, cluster_key=None, cluster_info=None, sharding_cost_model=<CostModel.ALPHA_BETA: 'alpha_beta'>, comm_cost_model=<CostModel.ALPHA_BETA: 'alpha_beta'>, enable_pipeline_parallelism=False, enable_shard_unbalanced_shape=False, enable_shard_dynamic_shape=False, enable_reduce_scatter=True, builder_flags=None, debug_mode=False, infer_shape=True, validation_mode=False, same_buffer_io={}, same_spec_io={}, sharded_io_allowlist=[], fill_weights=False, parallel_config_cache=None, profile_cache=None, dump_path=None, debug_outputs=[]), weight_sparsity=False, weight_streaming=False, plugin_config=PluginConfig(_dtype='float16', _bert_attention_plugin='auto', _gpt_attention_plugin='auto', _gemm_plugin=None, _explicitly_disable_gemm_plugin=False, _gemm_swiglu_plugin=None, _fp8_rowwise_gemm_plugin=None, _qserve_gemm_plugin=None, _identity_plugin=None, _nccl_plugin=None, _lora_plugin=None, _dora_plugin=False, _weight_only_groupwise_quant_matmul_plugin=None, _weight_only_quant_matmul_plugin=None, _smooth_quant_plugins=True, _smooth_quant_gemm_plugin=None, _layernorm_quantization_plugin=None, _rmsnorm_quantization_plugin=None, _quantize_per_token_plugin=False, _quantize_tensor_plugin=False, _moe_plugin='auto', _mamba_conv1d_plugin='auto', _low_latency_gemm_plugin=None, _low_latency_gemm_swiglu_plugin=None, _gemm_allreduce_plugin=None, _context_fmha=True, _bert_context_fmha_fp32_acc=False, _paged_kv_cache=None, _remove_input_padding=True, _norm_quant_fusion=False, _reduce_fusion=False, _user_buffer=False, _tokens_per_block=32, _use_paged_context_fmha=True, _use_fp8_context_fmha=True, _fuse_fp4_quant=False, _multiple_profiles=False, _paged_state=True, _streamingllm=False, _manage_weights=False, _use_fused_mlp=True, _pp_reduce_scatter=False), use_strip_plan=False, max_encoder_input_len=1024, dry_run=False, visualize_network=None, monitor_memory=False, use_mrope=False) use_cuda_graph=True cuda_graph_batch_sizes=[1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 256, 512, 1024, 2048] cuda_graph_max_batch_size=2048 cuda_graph_padding_enabled=True disable_overlap_scheduler=False moe_max_num_tokens=None moe_load_balancer=None attn_backend='FlashInfer' moe_backend='CUTLASS' mixed_sampler=False enable_trtllm_sampler=False kv_cache_dtype='auto' use_kv_cache=True enable_iter_perf_stats=False enable_iter_req_stats=False print_iter_log=False torch_compile_enabled=True torch_compile_fullgraph=True torch_compile_inductor_enabled=False torch_compile_piecewise_cuda_graph=False torch_compile_enable_userbuffers=True autotuner_enabled=True enable_layerwise_nvtx_marker=False auto_deploy_config=None enable_min_latency=False model_factory='AutoModelForCausalLM' model_kwargs={'max_position_embeddings': 256} mla_backend='MultiHeadLatentAttention' skip_loading_weights=False free_mem_ratio=0.8 simple_shard_only=False attn_page_size=64 checkpoint_device=None extended_runtime_perf_knob_config=ExtendedRuntimePerfKnobConfig(multi_block_mode=True, enable_context_fmha_fp32_acc=False, cuda_graph_mode=True, cuda_graph_cache_size=1000) parallel_config=_ParallelConfig(tp_size=1, pp_size=1, cp_size=1, gpus_per_node=8, moe_cluster_size=-1, moe_tp_size=-1, moe_ep_size=-1, cp_config={}, enable_attention_dp=False, auto_parallel=False, _world_size=1, _devices=None) model_format=<_ModelFormatKind.HF: 0> speculative_model=None
rank 0 using MpiPoolSession to spawn MPI processes
[06/13/2025-11:48:00] [TRT-LLM] [I] Generating a new HMAC key for server proxy_request_queue
[06/13/2025-11:48:00] [TRT-LLM] [I] Generating a new HMAC key for server worker_init_status_queue
[06/13/2025-11:48:00] [TRT-LLM] [I] Generating a new HMAC key for server proxy_result_queue
[06/13/2025-11:48:00] [TRT-LLM] [I] Generating a new HMAC key for server proxy_stats_queue
[06/13/2025-11:48:00] [TRT-LLM] [I] Generating a new HMAC key for server proxy_kv_cache_events_queue
2025-06-13 11:48:08,993 - INFO - flashinfer.jit: Prebuilt kernels not found, using JIT backend
[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc1
[TensorRT-LLM][INFO] Refreshed the MPI local session
[06/13/2025-11:48:09] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Initializing for: lib='OMPI', local_rank=0, world_size=1, port=57435
[06/13/2025-11:48:09] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] max_seq_len=256, max_batch_size=2048, attn_page_size=64, max_num_tokens=4096
/root/.cache/huggingface/hub/models--meta-llama--Llama-3.1-8B/snapshots/d04e592bb4f6aa9cfee91e2e20afa771667e1d4b
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
[06/13/2025-11:48:16] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] No quantization to do.
[06/13/2025-11:48:16] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 0 MoE Patterns
[06/13/2025-11:48:16] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 64 repeat_kv patterns
[06/13/2025-11:48:16] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 0 eager attention patterns
[06/13/2025-11:48:16] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 32 grouped attention patterns
[06/13/2025-11:48:16] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 32 causal mask attention patterns
[06/13/2025-11:48:16] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found and matched 32 attention layouts
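Two repeat_kv sites per layer (one for K, one for V) across 32 layers account for the 64 matches above. What the matcher targets presumably resembles the GQA head-expansion idiom from Hugging Face's Llama modeling code, reproduced here as a reference sketch (this is the HF pattern, not AutoDeploy's internal matcher):

```python
import torch

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """GQA head expansion: broadcast each KV head n_rep times so K/V match
    the query head count before scaled-dot-product attention."""
    batch, num_kv_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_kv_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_kv_heads * n_rep, slen, head_dim)
```

Recognizing this expansion is what lets the 32 grouped attention patterns above be mapped to a fused GQA kernel instead of materializing n_rep copies of K and V.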
[... the oneDNN TF32 UserWarning above repeated 35 more times ...]
[06/13/2025-11:48:17] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found and matched 32 RoPE patterns
[06/13/2025-11:48:17] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Match RoPE layout to bsnd
[06/13/2025-11:48:17] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 32 RoPE layout matches
[06/13/2025-11:48:17] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found and eliminated 192 redundant transpose pairs
[06/13/2025-11:48:17] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 32 RoPE optimizations
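The 32 RoPE matches correspond to the rotary embedding applied in each of the 32 decoder layers. The shape being matched presumably follows the standard half-rotation form, as in Hugging Face's Llama code (a reference sketch of the pattern, not AutoDeploy's matcher):

```python
import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # Rotate the last dimension: (x1, x2) -> (-x2, x1).
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin):
    # Half-rotation RoPE applied to query and key states.
    return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin
```

Normalizing these sites to a single `bsnd` layout is plausibly what makes the surrounding transpose pairs redundant, per the elimination count above.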
[06/13/2025-11:48:17] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Skipping sharding for single device
[06/13/2025-11:48:17] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Skipping sharding for single device
[06/13/2025-11:48:17] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Skipping sharding for single device
[06/13/2025-11:48:19] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Loading and initializing weights.
[06/13/2025-11:48:24] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 0 allreduce+residual+rmsnorm fusions
[06/13/2025-11:48:24] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 0 GEMM+Collective fusions
[06/13/2025-11:48:24] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Starting RMSNorm pattern matching with backend: triton
[... the oneDNN TF32 UserWarning above repeated 68 more times ...]
[06/13/2025-11:48:25] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] RMSNorm pattern count: 65
[06/13/2025-11:48:25] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 2 input nodes and 1 output nodes
[06/13/2025-11:48:25] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Added 4 new input nodes for cached attention metadata
[06/13/2025-11:48:25] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Replaced 32 attention.bsnd_grouped_sdpa ops with attention.flashinfer_mha_with_cache
[06/13/2025-11:48:25] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Initialized 65 caches for cached attention
[06/13/2025-11:48:25] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Free memory ratio: 0.8
[06/13/2025-11:48:25] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Free memory (MB): 59648, Total memory (MB): 80994
[06/13/2025-11:48:25] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Current cache size (MB): 512, Current num pages: 64
[06/13/2025-11:48:25] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Free memory before forward pass (MB): 59648
2025-06-13 11:48:26,720 - INFO - flashinfer.jit: Loading JIT ops: rope
2025-06-13 11:48:26,732 - INFO - flashinfer.jit: Finished loading JIT ops: rope
2025-06-13 11:48:26,733 - INFO - flashinfer.jit: Loading JIT ops: page
2025-06-13 11:48:26,742 - INFO - flashinfer.jit: Finished loading JIT ops: page
2025-06-13 11:48:26,748 - INFO - flashinfer.jit: Loading JIT ops: batch_prefill_with_kv_cache_dtype_q_bf16_dtype_kv_bf16_dtype_o_bf16_dtype_idx_i32_head_dim_qk_128_head_dim_vo_128_posenc_0_use_swa_False_use_logits_cap_False_f16qk_False
2025-06-13 11:48:26,758 - INFO - flashinfer.jit: Finished loading JIT ops: batch_prefill_with_kv_cache_dtype_q_bf16_dtype_kv_bf16_dtype_o_bf16_dtype_idx_i32_head_dim_qk_128_head_dim_vo_128_posenc_0_use_swa_False_use_logits_cap_False_f16qk_False
[06/13/2025-11:48:26] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Free memory after forward pass (MB): 59578
[06/13/2025-11:48:26] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Memory for forward pass (MB): 70
[06/13/2025-11:48:28] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] After all_gather - new_num_pages: 6021
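The resize from 64 probe pages to 6021 is consistent with free_mem_ratio=0.8 applied to the post-probe free memory plus the 512 MB already held by the probe cache, at 8 MiB per page (attn_page_size=64 tokens x 128 KiB of bf16 KV per token). A reconstruction sketch; the exact formula inside the resizer is an assumption, but the arithmetic matches the logged figures:

```python
# Reconstruct the KV-cache page resize from the numbers logged above.
free_mem_mb = 59_578        # "Free memory after forward pass (MB)"
free_mem_ratio = 0.8        # from the config dump
current_cache_mb = 64 * 8   # 64 probe pages x 8 MiB/page = 512 MB, as logged
page_mb = 8                 # 64 tokens/page x 128 KiB/token (bf16, 32 layers)

new_num_pages = int((free_mem_mb * free_mem_ratio + current_cache_mb) / page_mb)
print(new_num_pages)        # 6021, matching "new_num_pages" above
print(new_num_pages * 64)   # 385344 max tokens, matching the paged-KV line below
print(256 // 64)            # 4 pages per 256-token sequence, also logged below
```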
[06/13/2025-11:48:28] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Fusion before compiling...
[06/13/2025-11:48:28] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Compiling for torch-opt backend...
[06/13/2025-11:48:32] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 1
[06/13/2025-11:48:35] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 2
[06/13/2025-11:48:39] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 4
[06/13/2025-11:48:43] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 8
[06/13/2025-11:48:47] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 16
[06/13/2025-11:48:50] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 24
[06/13/2025-11:48:54] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 32
[06/13/2025-11:48:58] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 40
[rank0]:W0613 11:48:58.348000 29519 torch/_dynamo/convert_frame.py:961] [0/8] torch._dynamo hit config.recompile_limit (8)
[rank0]:W0613 11:48:58.348000 29519 torch/_dynamo/convert_frame.py:961] [0/8] function: 'forward' (<eval_with_key>.1480:4)
[rank0]:W0613 11:48:58.348000 29519 torch/_dynamo/convert_frame.py:961] [0/8] last reason: 0/7: tensor 'L['input_ids']' size mismatch at index 0. expected 32, actual 40
[rank0]:W0613 11:48:58.348000 29519 torch/_dynamo/convert_frame.py:961] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
[rank0]:W0613 11:48:58.348000 29519 torch/_dynamo/convert_frame.py:961] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.
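Dynamo stops specializing `forward` after eight distinct `input_ids` batch sizes and falls back for the remaining sizes, which is consistent with the captures from batch 48 onward completing much faster below. If full per-size specialization were wanted, the budget the warning names can be raised; a sketch using public torch knobs (whether AutoDeploy exposes these is not shown in this log):

```python
import torch

# Raise Dynamo's recompile budget (default 8, per the warning above) so each
# captured batch size can receive its own specialized graph.
torch._dynamo.config.recompile_limit = 32

# Alternative: mark the batch dimension dynamic to avoid recompiles entirely.
# `input_ids` is a hypothetical handle to the model's input tensor here.
# torch._dynamo.mark_dynamic(input_ids, 0)
```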
[06/13/2025-11:48:58] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 48
[06/13/2025-11:48:59] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 56
[06/13/2025-11:48:59] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 64
[06/13/2025-11:48:59] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 72
[06/13/2025-11:49:00] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 80
[06/13/2025-11:49:00] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 88
[06/13/2025-11:49:00] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 96
[06/13/2025-11:49:01] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 104
[06/13/2025-11:49:01] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 112
[06/13/2025-11:49:02] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 120
[06/13/2025-11:49:02] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 128
[06/13/2025-11:49:02] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 256
[06/13/2025-11:49:03] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 512
[06/13/2025-11:49:03] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 1024
[06/13/2025-11:49:03] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 2048
[06/13/2025-11:49:04] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Compile time with backend torch-opt: 36.402899 seconds
[06/13/2025-11:49:04] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Using fake cache manager with head_dim=0 and num pages: 6021
[TensorRT-LLM][INFO] Max KV cache pages per sequence: 4 [window size=256]
[TensorRT-LLM][INFO] Number of tokens per block: 64.
[TensorRT-LLM][INFO] [MemUsageChange] Allocated 0.00 GiB for max tokens in paged KV cache (385344).
[06/13/2025-11:49:05] [TRT-LLM] [I] Setting up for warmup...
[06/13/2025-11:49:05] [TRT-LLM] [I] Running warmup.
[06/13/2025-11:49:05] [TRT-LLM] [I] Starting benchmarking async task.
[06/13/2025-11:49:05] [TRT-LLM] [I] Starting benchmark...
[06/13/2025-11:49:05] [TRT-LLM] [I] Request submission complete. [count=2, time=0.0000s, rate=156727.53 req/s]
[06/13/2025-11:49:07] [TRT-LLM] [I] Benchmark complete.
[06/13/2025-11:49:07] [TRT-LLM] [I] Stopping LLM backend.
[06/13/2025-11:49:07] [TRT-LLM] [I] Cancelling all 0 tasks to complete.
[06/13/2025-11:49:07] [TRT-LLM] [I] All tasks cancelled.
[06/13/2025-11:49:07] [TRT-LLM] [I] LLM Backend stopped.
[06/13/2025-11:49:07] [TRT-LLM] [I] Worker task cancelled.
[06/13/2025-11:49:07] [TRT-LLM] [I] Warmup done.
[06/13/2025-11:49:07] [TRT-LLM] [I] No log path provided, skipping logging.
[06/13/2025-11:49:07] [TRT-LLM] [I] Starting benchmarking async task.
[06/13/2025-11:49:07] [TRT-LLM] [I] Starting benchmark...
[06/13/2025-11:49:07] [TRT-LLM] [I] Request submission complete. [count=3000, time=0.0014s, rate=2153096.66 req/s]
Traceback (most recent call last):
  File "/app/tensorrt_llm/TensorRT-LLM/tensorrt_llm/_torch/pyexecutor/py_executor.py", line 1612, in _forward_step
    outputs = forward(scheduled_requests, self.resource_manager,
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/nvtx/nvtx.py", line 122, in inner
    result = func(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^
  File "/app/tensorrt_llm/TensorRT-LLM/tensorrt_llm/_torch/pyexecutor/py_executor.py", line 1602, in forward
    return self.model_engine.forward(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/app/tensorrt_llm/TensorRT-LLM/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py", line 251, in forward
    last_logit_only = self._prepare_inputs(scheduled_requests, resource_manager, new_tokens)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/nvtx/nvtx.py", line 122, in inner
    result = func(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^
  File "/app/tensorrt_llm/TensorRT-LLM/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py", line 205, in _prepare_inputs
    new_tokens_list = new_tokens.cpu().tolist() if new_tokens is not None else None
                      ^^^^^^^^^^^^^^^^
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
| [06/13/2025-11:49:17] [TRT-LLM] [E] Encountered an error in forward function: CUDA error: an illegal memory access was encountered | |
| CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. | |
| For debugging consider passing CUDA_LAUNCH_BLOCKING=1 | |
| Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. | |
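As the error text itself notes, asynchronous reporting means the traceback above (which blames the otherwise-benign `new_tokens.cpu()` transfer) most likely points at the first synchronization after the fault, not at the faulting kernel. A minimal sketch of the suggested re-run, assuming the same trtllm-bench invocation; the variable must be set before the CUDA context is created:

    import os
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # set before any torch.cuda call, e.g.
                                              # exported in the shell running trtllm-bench
    import torch  # import only after the variable is in place; launches now synchronize,
                  # so the traceback lands on the kernel that actually faulted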
| Traceback (most recent call last): | |
| File "/app/tensorrt_llm/TensorRT-LLM/tensorrt_llm/_torch/pyexecutor/py_executor.py", line 1686, in _update_requests | |
| self.sampler.update_requests(sample_state) | |
| File "/app/tensorrt_llm/TensorRT-LLM/tensorrt_llm/_torch/pyexecutor/sampler.py", line 241, in update_requests | |
| state.sampler_event.synchronize() | |
| File "/usr/local/lib/python3.12/dist-packages/torch/cuda/streams.py", line 227, in synchronize | |
| super().synchronize() | |
| RuntimeError: CUDA error: an illegal memory access was encountered | |
| CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. | |
| For debugging consider passing CUDA_LAUNCH_BLOCKING=1 | |
| Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. | |
| [06/13/2025-11:49:17] [TRT-LLM] [E] Encountered an error in sampling: CUDA error: an illegal memory access was encountered | |
| CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. | |
| For debugging consider passing CUDA_LAUNCH_BLOCKING=1 | |
| Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. | |
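This second traceback is not an independent bug: CUDA errors are sticky, so once the illegal access has occurred, every later synchronization on the same context (here the sampler's `sampler_event.synchronize()`) re-reports it. A generic way to surface a pending asynchronous error at a known point when bisecting:

    import torch

    # Force completion after each suspect launch; a sticky CUDA error raises
    # here instead of at some unrelated later call such as .cpu() or an event sync.
    torch.cuda.synchronize()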
| terminate called after throwing an instance of 'c10::Error' | |
| what(): CUDA error: an illegal memory access was encountered | |
| CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. | |
| For debugging consider passing CUDA_LAUNCH_BLOCKING=1 | |
| Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. | |
| Exception raised from c10_cuda_check_implementation at /opt/pytorch/pytorch/c10/cuda/CUDAException.cpp:43 (most recent call first): | |
| frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x98 (0x7f8903d8b5e8 in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so) | |
| frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0xe0 (0x7f8903d204a2 in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so) | |
| frame #2: c10::cuda::c10_cuda_check_implementation(int, char const*, char const*, int, bool) + 0x3c2 (0x7f890ea1b2a2 in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10_cuda.so) | |
| frame #3: <unknown function> + 0xb7d311 (0x7f889f923311 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
| frame #4: <unknown function> + 0xb794eb (0x7f889f91f4eb in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
| frame #5: <unknown function> + 0xb80c04 (0x7f889f926c04 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) | |
| frame #6: <unknown function> + 0x44c162 (0x7f8902c57162 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
| frame #7: c10::TensorImpl::~TensorImpl() + 0x9 (0x7f8903d65f39 in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so) | |
| frame #8: <unknown function> + 0x703468 (0x7f8902f0e468 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
| frame #9: <unknown function> + 0x703890 (0x7f8902f0e890 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so) | |
| frame #10: /usr/bin/python() [0x579cf2] | |
| frame #11: /usr/bin/python() [0x59f0b9] | |
| frame #12: /usr/bin/python() [0x579d52] | |
| frame #13: /usr/bin/python() [0x59f0b9] | |
| frame #14: /usr/bin/python() [0x579cf2] | |
| frame #15: /usr/bin/python() [0x59f0b9] | |
| frame #16: _PyEval_EvalFrameDefault + 0x681d (0x5dd15d in /usr/bin/python) | |
| frame #17: /usr/bin/python() [0x54cd32] | |
| frame #18: _PyEval_EvalFrameDefault + 0x4c1b (0x5db55b in /usr/bin/python) | |
| frame #19: /usr/bin/python() [0x54cd32] | |
| frame #20: /usr/bin/python() [0x6f826c] | |
| frame #21: /usr/bin/python() [0x6b917c] | |
| frame #22: <unknown function> + 0x9caa4 (0x7f8b86174aa4 in /usr/lib/x86_64-linux-gnu/libc.so.6) | |
| frame #23: __clone + 0x44 (0x7f8b86201a34 in /usr/lib/x86_64-linux-gnu/libc.so.6) | |
| [4316cdb72fd3:29519] *** Process received signal *** | |
| [4316cdb72fd3:29519] Signal: Aborted (6) | |
| [4316cdb72fd3:29519] Signal code: (-6) | |
| [4316cdb72fd3:29519] [ 0] /usr/lib/x86_64-linux-gnu/libc.so.6(+0x45330)[0x7f8b8611d330] | |
| [4316cdb72fd3:29519] [ 1] /usr/lib/x86_64-linux-gnu/libc.so.6(pthread_kill+0x11c)[0x7f8b86176b2c] | |
| [4316cdb72fd3:29519] [ 2] /usr/lib/x86_64-linux-gnu/libc.so.6(gsignal+0x1e)[0x7f8b8611d27e] | |
| [4316cdb72fd3:29519] [ 3] /usr/lib/x86_64-linux-gnu/libc.so.6(abort+0xdf)[0x7f8b861008ff] | |
| [4316cdb72fd3:29519] [ 4] /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xa5ff5)[0x7f89156a8ff5] | |
| [4316cdb72fd3:29519] [ 5] /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xbb0da)[0x7f89156be0da] | |
| [4316cdb72fd3:29519] [ 6] /usr/lib/x86_64-linux-gnu/libstdc++.so.6(__cxa_call_terminate+0x33)[0x7f89156a88e6] | |
| [4316cdb72fd3:29519] [ 7] /usr/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x31a)[0x7f89156bd8ba] | |
| [4316cdb72fd3:29519] [ 8] /usr/lib/x86_64-linux-gnu/libgcc_s.so.1(+0x22b06)[0x7f89483fcb06] | |
| [4316cdb72fd3:29519] [ 9] /usr/lib/x86_64-linux-gnu/libgcc_s.so.1(_Unwind_Resume+0x12d)[0x7f89483fd5cd] | |
| [4316cdb72fd3:29519] [10] /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so(+0xb810b8)[0x7f889f9270b8] | |
| [4316cdb72fd3:29519] [11] /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so(+0x44c162)[0x7f8902c57162] | |
| [4316cdb72fd3:29519] [12] /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so(_ZN3c1010TensorImplD0Ev+0x9)[0x7f8903d65f39] | |
| [4316cdb72fd3:29519] [13] /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so(+0x703468)[0x7f8902f0e468] | |
| [4316cdb72fd3:29519] [14] /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so(+0x703890)[0x7f8902f0e890] | |
| [4316cdb72fd3:29519] [15] /usr/bin/python[0x579cf2] | |
| [4316cdb72fd3:29519] [16] /usr/bin/python[0x59f0b9] | |
| [4316cdb72fd3:29519] [17] /usr/bin/python[0x579d52] | |
| [4316cdb72fd3:29519] [18] /usr/bin/python[0x59f0b9] | |
| [4316cdb72fd3:29519] [19] /usr/bin/python[0x579cf2] | |
| [4316cdb72fd3:29519] [20] /usr/bin/python[0x59f0b9] | |
| [4316cdb72fd3:29519] [21] /usr/bin/python(_PyEval_EvalFrameDefault+0x681d)[0x5dd15d] | |
| [4316cdb72fd3:29519] [22] /usr/bin/python[0x54cd32] | |
| [4316cdb72fd3:29519] [23] /usr/bin/python(_PyEval_EvalFrameDefault+0x4c1b)[0x5db55b] | |
| [4316cdb72fd3:29519] [24] /usr/bin/python[0x54cd32] | |
| [4316cdb72fd3:29519] [25] /usr/bin/python[0x6f826c] | |
| [4316cdb72fd3:29519] [26] /usr/bin/python[0x6b917c] | |
| [4316cdb72fd3:29519] [27] /usr/lib/x86_64-linux-gnu/libc.so.6(+0x9caa4)[0x7f8b86174aa4] | |
| [4316cdb72fd3:29519] [28] /usr/lib/x86_64-linux-gnu/libc.so.6(__clone+0x44)[0x7f8b86201a34] | |
| [4316cdb72fd3:29519] *** End of error message *** | |
| -------------------------------------------------------------------------- | |
| Child job 2 terminated normally, but 1 process returned | |
| a non-zero exit code. Per user-direction, the job has been aborted. | |
| -------------------------------------------------------------------------- | |
| ^C[06/13/2025-11:57:08] [TRT-LLM] [I] Stopping LLM backend. | |
| [06/13/2025-11:57:08] [TRT-LLM] [I] Cancelling all 3000 tasks to complete. | |
| [06/13/2025-11:57:08] [TRT-LLM] [I] All tasks cancelled. | |
| [06/13/2025-11:57:08] [TRT-LLM] [I] LLM Backend stopped. | |
| [06/13/2025-11:57:09] [TRT-LLM] [I] Worker task cancelled. | |
| [06/13/2025-11:57:09] [TRT-LLM] [I] Benchmark done. Reporting results... | |
| ^C | |
| Aborted! | |
| ^C^CException ignored in: <function LLM.__del__ at 0x7f79898c8ea0> | |
| Traceback (most recent call last): | |
| File "/app/tensorrt_llm/TensorRT-LLM/tensorrt_llm/llmapi/llm.py", line 792, in __del__ | |
| self.shutdown() | |
| File "/app/tensorrt_llm/TensorRT-LLM/tensorrt_llm/llmapi/llm.py", line 766, in shutdown | |
| self._executor.shutdown() | |
| File "/app/tensorrt_llm/TensorRT-LLM/tensorrt_llm/executor/proxy.py", line 364, in shutdown | |
| self.dispatch_result_thread.join() | |
| File "/usr/lib/python3.12/threading.py", line 1147, in join | |
| self._wait_for_tstate_lock() | |
| File "/usr/lib/python3.12/threading.py", line 1167, in _wait_for_tstate_lock | |
| if lock.acquire(block, timeout): | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| KeyboardInterrupt: | |
| ^CException ignored in: <module 'threading' from '/usr/lib/python3.12/threading.py'> | |
| Traceback (most recent call last): | |
| File "/usr/lib/python3.12/threading.py", line 1592, in _shutdown | |
| atexit_call() | |
| File "/usr/local/lib/python3.12/dist-packages/mpi4py/futures/_lib.py", line 121, in join_threads | |
| thread.join() | |
| File "/usr/lib/python3.12/threading.py", line 1147, in join | |
| self._wait_for_tstate_lock() | |
| File "/usr/lib/python3.12/threading.py", line 1167, in _wait_for_tstate_lock | |
| if lock.acquire(block, timeout): | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| KeyboardInterrupt: | |
| ^C^CException ignored in atexit callback: <bound method GenerationExecutorProxy.shutdown of <tensorrt_llm.executor.proxy.GenerationExecutorProxy object at 0x7f7980cd1a00>> | |
| Traceback (most recent call last): | |
| File "/app/tensorrt_llm/TensorRT-LLM/tensorrt_llm/executor/proxy.py", line 368, in shutdown | |
| self.dispatch_stats_thread.join() | |
| File "/usr/lib/python3.12/threading.py", line 1147, in join | |
| self._wait_for_tstate_lock() | |
| File "/usr/lib/python3.12/threading.py", line 1167, in _wait_for_tstate_lock | |
| if lock.acquire(block, timeout): | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| KeyboardInterrupt: | |
| -------------------------------------------------------------------------- | |
| (null) noticed that process rank 0 with PID 0 on node 4316cdb72fd3 exited on signal 6 (Aborted). |