kiya00/num_requests_1000

## num_requests_1000
root@4316cdb72fd3:/app/tensorrt_llm/TensorRT-LLM# python benchmarks/cpp/prepare_dataset.py --stdout --tokenizer $MODEL_ID token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 1000 > /tmp/synthetic_128_128.txt
root@4316cdb72fd3:/app/tensorrt_llm/TensorRT-LLM# trtllm-bench --model $MODEL_ID throughput --dataset /tmp/synthetic_128_128.txt --backend _autodeploy
2025-06-13 15:20:14,791 - INFO - flashinfer.jit: Prebuilt kernels not found, using JIT backend
[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc1
[06/13/2025-15:20:15] [TRT-LLM] [I] Preparing to run throughput benchmark...
Parse safetensors files:   0%|                                                                                     | 0/4 [00:00<?, ?it/Parse safetensors files:  25%|███████████████████▎                                                         | 1/4 [00:00<00:00,  6.78it/Parse safetensors files: 100%|█████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 24.34it/s]
[06/13/2025-15:20:17] [TRT-LLM] [I]
===========================================================
= DATASET DETAILS
===========================================================
Dataset Path:         /tmp/synthetic_128_128.txt
Number of Sequences:  1000

-- Percentiles statistics ---------------------------------

        Input              Output           Seq. Length
-----------------------------------------------------------
MIN:   128.0000           128.0000           256.0000
MAX:   128.0000           128.0000           256.0000
AVG:   128.0000           128.0000           256.0000
P50:   128.0000           128.0000           256.0000
P90:   128.0000           128.0000           256.0000
P95:   128.0000           128.0000           256.0000
P99:   128.0000           128.0000           256.0000
===========================================================

Parse safetensors files:   0%|                                                                                     | 0/4 [00:00<?, ?it/Parse safetensors files:  25%|███████████████████▎                                                         | 1/4 [00:00<00:00,  9.22it/Parse safetensors files: 100%|█████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  8.69it/Parse safetensors files: 100%|█████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  8.73it/s]
[06/13/2025-15:20:18] [TRT-LLM] [I] Validating KV Cache config against kv_cache_dtype="auto"
[06/13/2025-15:20:18] [TRT-LLM] [I] KV cache quantization set to "auto". Using checkpoint KV quantization.
[06/13/2025-15:20:18] [TRT-LLM] [I] Estimated engine size: 14.96 GB
[06/13/2025-15:20:18] [TRT-LLM] [I] Estimated total available memory for KV cache: 64.69 GB
[06/13/2025-15:20:18] [TRT-LLM] [I] Estimated total KV cache memory: 61.46 GB
[06/13/2025-15:20:18] [TRT-LLM] [I] Estimated max number of requests in KV cache memory: 1966.57
[06/13/2025-15:20:18] [TRT-LLM] [I] Estimated max batch size (after fine-tune): 2048
[06/13/2025-15:20:18] [TRT-LLM] [I] Estimated max num tokens (after fine-tune): 4096
[06/13/2025-15:20:18] [TRT-LLM] [I] Max batch size and max num tokens not provided. Using heuristics or pre-defined settings: max_batch_size=2048, max_num_tokens=4096.
[06/13/2025-15:20:18] [TRT-LLM] [I] Setting PyTorch max sequence length to 256
[06/13/2025-15:20:18] [TRT-LLM] [I] Setting up throughput benchmark.
[06/13/2025-15:20:18] [TRT-LLM] [W] Using default gpus_per_node: 8
[06/13/2025-15:20:18] [TRT-LLM] [I] Set nccl_plugin to None.
[06/13/2025-15:20:18] [TRT-LLM] [I] model='meta-llama/Llama-3.1-8B' tokenizer=None tokenizer_mode='auto' skip_tokenizer_init=True trust_remote_code=True tensor_parallel_size=1 dtype='auto' revision=None tokenizer_revision=None pipeline_parallel_size=1 context_parallel_size=1 gpus_per_node=8 moe_cluster_parallel_size=-1 moe_tensor_parallel_size=-1 moe_expert_parallel_size=-1 enable_attention_dp=False cp_config={} load_format=<LoadFormat.AUTO: 0> enable_lora=False max_lora_rank=None max_loras=4 max_cpu_loras=4 lora_config=None enable_prompt_adapter=False max_prompt_adapter_token=0 quant_config=QuantConfig(quant_algo=None, kv_cache_quant_algo=None, group_size=128, smoothquant_val=0.5, clamp_val=None, use_meta_recipe=False, has_zero_point=False, pre_quant_scale=False, exclude_modules=None) kv_cache_config=KvCacheConfig(enable_block_reuse=False, max_tokens=None, max_attention_window=None, sink_token_length=None, free_gpu_memory_fraction=0.9, host_cache_size=None, onboard_blocks=True, cross_kv_cache_fraction=None, secondary_offload_min_priority=None, event_buffer_max_size=0, enable_partial_reuse=True, copy_on_partial_reuse=True) enable_chunked_prefill=False guided_decoding_backend=None batched_logits_processor=None iter_stats_max_iterations=None request_stats_max_iterations=None peft_cache_config=None scheduler_config=SchedulerConfig(capacity_scheduler_policy=<CapacitySchedulerPolicy.GUARANTEED_NO_EVICT: 'GUARANTEED_NO_EVICT'>, context_chunking_policy=None, dynamic_batch_config=DynamicBatchConfig(enable_batch_size_tuning=True, enable_max_num_tokens_tuning=False, dynamic_batch_moving_average_window=128)) cache_transceiver_config=None speculative_config=None batching_type=<BatchingType.INFLIGHT: 'INFLIGHT'> normalize_log_probs=False max_batch_size=2048 max_input_len=1024 max_seq_len=256 max_beam_width=1 max_num_tokens=4096 backend='_autodeploy' gather_generation_logits=False num_postprocess_workers=0 postprocess_tokenizer_dir=None reasoning_parser=None decoding_config=None mpi_session=None build_config=BuildConfig(max_input_len=1024, max_seq_len=None, opt_batch_size=8, max_batch_size=2048, max_beam_width=1, max_num_tokens=4096, opt_num_tokens=None, max_prompt_embedding_table_size=0, kv_cache_type=None, gather_context_logits=False, gather_generation_logits=False, strongly_typed=True, force_num_profiles=None, profiling_verbosity='layer_names_only', enable_debug_output=False, max_draft_len=0, speculative_decoding_mode=<SpeculativeDecodingMode.NONE: 1>, use_refit=False, input_timing_cache=None, output_timing_cache='model.cache', lora_config=LoraConfig(lora_dir=[], lora_ckpt_source='hf', max_lora_rank=64, lora_target_modules=[], trtllm_modules_to_hf_modules={}, max_loras=4, max_cpu_loras=4), auto_parallel_config=AutoParallelConfig(world_size=1, gpus_per_node=8, cluster_key=None, cluster_info=None, sharding_cost_model=<CostModel.ALPHA_BETA: 'alpha_beta'>, comm_cost_model=<CostModel.ALPHA_BETA: 'alpha_beta'>, enable_pipeline_parallelism=False, enable_shard_unbalanced_shape=False, enable_shard_dynamic_shape=False, enable_reduce_scatter=True, builder_flags=None, debug_mode=False, infer_shape=True, validation_mode=False, same_buffer_io={}, same_spec_io={}, sharded_io_allowlist=[], fill_weights=False, parallel_config_cache=None, profile_cache=None, dump_path=None, debug_outputs=[]), weight_sparsity=False, weight_streaming=False, plugin_config=PluginConfig(_dtype='float16', _bert_attention_plugin='auto', _gpt_attention_plugin='auto', _gemm_plugin=None, _explicitly_disable_gemm_plugin=False, _gemm_swiglu_plugin=None, _fp8_rowwise_gemm_plugin=None, _qserve_gemm_plugin=None, _identity_plugin=None, _nccl_plugin=None, _lora_plugin=None, _dora_plugin=False, _weight_only_groupwise_quant_matmul_plugin=None, _weight_only_quant_matmul_plugin=None, _smooth_quant_plugins=True, _smooth_quant_gemm_plugin=None, _layernorm_quantization_plugin=None, _rmsnorm_quantization_plugin=None, _quantize_per_token_plugin=False, _quantize_tensor_plugin=False, _moe_plugin='auto', _mamba_conv1d_plugin='auto', _low_latency_gemm_plugin=None, _low_latency_gemm_swiglu_plugin=None, _gemm_allreduce_plugin=None, _context_fmha=True, _bert_context_fmha_fp32_acc=False, _paged_kv_cache=None, _remove_input_padding=True, _norm_quant_fusion=False, _reduce_fusion=False, _user_buffer=False, _tokens_per_block=32, _use_paged_context_fmha=True, _use_fp8_context_fmha=True, _fuse_fp4_quant=False, _multiple_profiles=False, _paged_state=True, _streamingllm=False, _manage_weights=False, _use_fused_mlp=True, _pp_reduce_scatter=False), use_strip_plan=False, max_encoder_input_len=1024, dry_run=False, visualize_network=None, monitor_memory=False, use_mrope=False) use_cuda_graph=True cuda_graph_batch_sizes=[1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 256, 512, 1024, 2048] cuda_graph_max_batch_size=2048 cuda_graph_padding_enabled=True disable_overlap_scheduler=False moe_max_num_tokens=None moe_load_balancer=None attn_backend='FlashInfer' moe_backend='CUTLASS' mixed_sampler=False enable_trtllm_sampler=False kv_cache_dtype='auto' use_kv_cache=True enable_iter_perf_stats=False enable_iter_req_stats=False print_iter_log=False torch_compile_enabled=True torch_compile_fullgraph=True torch_compile_inductor_enabled=False torch_compile_piecewise_cuda_graph=False torch_compile_enable_userbuffers=True autotuner_enabled=True enable_layerwise_nvtx_marker=False auto_deploy_config=None enable_min_latency=False model_factory='AutoModelForCausalLM' model_kwargs={'max_position_embeddings': 256} mla_backend='MultiHeadLatentAttention' skip_loading_weights=False free_mem_ratio=0.8 simple_shard_only=False attn_page_size=64 checkpoint_device=None extended_runtime_perf_knob_config=ExtendedRuntimePerfKnobConfig(multi_block_mode=True, enable_context_fmha_fp32_acc=False, cuda_graph_mode=True, cuda_graph_cache_size=1000) parallel_config=_ParallelConfig(tp_size=1, pp_size=1, cp_size=1, gpus_per_node=8, moe_cluster_size=-1, moe_tp_size=-1, moe_ep_size=-1, cp_config={}, enable_attention_dp=False, auto_parallel=False, _world_size=1, _devices=None) model_format=<_ModelFormatKind.HF: 0> speculative_model=None
rank 0 using MpiPoolSession to spawn MPI processes
[06/13/2025-15:20:18] [TRT-LLM] [I] Generating a new HMAC key for server proxy_request_queue
[06/13/2025-15:20:18] [TRT-LLM] [I] Generating a new HMAC key for server worker_init_status_queue
[06/13/2025-15:20:18] [TRT-LLM] [I] Generating a new HMAC key for server proxy_result_queue
[06/13/2025-15:20:18] [TRT-LLM] [I] Generating a new HMAC key for server proxy_stats_queue
[06/13/2025-15:20:18] [TRT-LLM] [I] Generating a new HMAC key for server proxy_kv_cache_events_queue
2025-06-13 15:20:26,897 - INFO - flashinfer.jit: Prebuilt kernels not found, using JIT backend
[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc1
[TensorRT-LLM][INFO] Refreshed the MPI local session
[06/13/2025-15:20:27] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Initializing for: lib='OMPI', local_rank=0, world_size=1, port=38295
[06/13/2025-15:20:27] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] max_seq_len=256, max_batch_size=2048, attn_page_size=64, max_num_tokens=4096
/root/.cache/huggingface/hub/models--meta-llama--Llama-3.1-8B/snapshots/d04e592bb4f6aa9cfee91e2e20afa771667e1d4b
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
[06/13/2025-15:20:34] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] No quantization to do.
[06/13/2025-15:20:34] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 0 MoE Patterns
[06/13/2025-15:20:34] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 64 repeat_kv patterns
[06/13/2025-15:20:34] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 0 eager attention patterns
[06/13/2025-15:20:34] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 32 grouped attention patterns
[06/13/2025-15:20:34] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 32 causal mask attention patterns
[06/13/2025-15:20:35] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found and matched 32 attention layouts
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
[06/13/2025-15:20:35] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found and matched 32 RoPE patterns
[06/13/2025-15:20:35] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Match RoPE layout to bsnd
[06/13/2025-15:20:35] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 32 RoPE layout matches
[06/13/2025-15:20:35] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found and eliminated 192 redundant transpose pairs
[06/13/2025-15:20:35] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 32 RoPE optimizations
[06/13/2025-15:20:35] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Skipping sharding for single device
[06/13/2025-15:20:35] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Skipping sharding for single device
[06/13/2025-15:20:35] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Skipping sharding for single device
[06/13/2025-15:20:37] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Loading and initializing weights.
[06/13/2025-15:20:42] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 0 allreduce+residual+rmsnorm fusions
[06/13/2025-15:20:42] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 0 GEMM+Collective fusions
[06/13/2025-15:20:42] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Starting RMSNorm pattern matching with backend: triton
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
/usr/local/lib/python3.12/dist-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
[06/13/2025-15:20:43] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] RMSNorm pattern count: 65
[06/13/2025-15:20:43] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Found 2 input nodes and 1 output nodes
[06/13/2025-15:20:43] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Added 4 new input nodes for cached attention metadata
[06/13/2025-15:20:43] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Replaced 32 attention.bsnd_grouped_sdpa ops with attention.flashinfer_mha_with_cache
[06/13/2025-15:20:43] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Initialized 65 caches for cached attention
[06/13/2025-15:20:43] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Free memory ratio: 0.8
[06/13/2025-15:20:43] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Free memory (MB): 59648 , Total memory (MB): 80994
[06/13/2025-15:20:43] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Current cache size (MB): 512, Current num pages: 64
[06/13/2025-15:20:43] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Free memory before forward pass (MB): 59648
2025-06-13 15:20:44,551 - INFO - flashinfer.jit: Loading JIT ops: rope
2025-06-13 15:20:44,563 - INFO - flashinfer.jit: Finished loading JIT ops: rope
2025-06-13 15:20:44,565 - INFO - flashinfer.jit: Loading JIT ops: page
2025-06-13 15:20:44,574 - INFO - flashinfer.jit: Finished loading JIT ops: page
2025-06-13 15:20:44,579 - INFO - flashinfer.jit: Loading JIT ops: batch_prefill_with_kv_cache_dtype_q_bf16_dtype_kv_bf16_dtype_o_bf16_dtype_idx_i32_head_dim_qk_128_head_dim_vo_128_posenc_0_use_swa_False_use_logits_cap_False_f16qk_False
2025-06-13 15:20:44,589 - INFO - flashinfer.jit: Finished loading JIT ops: batch_prefill_with_kv_cache_dtype_q_bf16_dtype_kv_bf16_dtype_o_bf16_dtype_idx_i32_head_dim_qk_128_head_dim_vo_128_posenc_0_use_swa_False_use_logits_cap_False_f16qk_False
[06/13/2025-15:20:44] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Free memory after forward pass (MB): 59578
[06/13/2025-15:20:44] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Memory for forward pass (MB): 70
[06/13/2025-15:20:48] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] After all_gather - new_num_pages: 6021
[06/13/2025-15:20:48] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Fusion before compiling...
[06/13/2025-15:20:48] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Compiling for torch-opt backend...
[06/13/2025-15:20:52] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 1
[06/13/2025-15:20:56] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 2
[06/13/2025-15:21:00] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 4
[06/13/2025-15:21:04] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 8
[06/13/2025-15:21:08] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 16
[06/13/2025-15:21:11] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 24
[06/13/2025-15:21:15] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 32
[06/13/2025-15:21:19] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 40
[rank0]:W0613 15:21:19.436000 32429 torch/_dynamo/convert_frame.py:961] [0/8] torch._dynamo hit config.recompile_limit (8)
[rank0]:W0613 15:21:19.436000 32429 torch/_dynamo/convert_frame.py:961] [0/8]    function: 'forward' (<eval_with_key>.1480:4)
[rank0]:W0613 15:21:19.436000 32429 torch/_dynamo/convert_frame.py:961] [0/8]    last reason: 0/7: tensor 'L['input_ids']' size mismatch at index 0. expected 32, actual 40
[rank0]:W0613 15:21:19.436000 32429 torch/_dynamo/convert_frame.py:961] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
[rank0]:W0613 15:21:19.436000 32429 torch/_dynamo/convert_frame.py:961] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.
[06/13/2025-15:21:19] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 48
[06/13/2025-15:21:20] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 56
[06/13/2025-15:21:20] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 64
[06/13/2025-15:21:21] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 72
[06/13/2025-15:21:21] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 80
[06/13/2025-15:21:21] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 88
[06/13/2025-15:21:22] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 96
[06/13/2025-15:21:22] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 104
[06/13/2025-15:21:22] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 112
[06/13/2025-15:21:23] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 120
[06/13/2025-15:21:23] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 128
[06/13/2025-15:21:24] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 256
[06/13/2025-15:21:24] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 512
[06/13/2025-15:21:24] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 1024
[06/13/2025-15:21:25] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Capturing graph for batch size: 2048
[06/13/2025-15:21:25] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Compile time with backend torch-opt: 37.251077 seconds
[06/13/2025-15:21:25] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Using fake cache manager with head_dim=0 and num pages: 6021
[TensorRT-LLM][INFO] Max KV cache pages per sequence: 4 [window size=256]
[TensorRT-LLM][INFO] Number of tokens per block: 64.
[TensorRT-LLM][INFO] [MemUsageChange] Allocated 0.00 GiB for max tokens in paged KV cache (385344).
[06/13/2025-15:21:26] [TRT-LLM] [I] Setting up for warmup...
[06/13/2025-15:21:26] [TRT-LLM] [I] Running warmup.
[06/13/2025-15:21:26] [TRT-LLM] [I] Starting benchmarking async task.
[06/13/2025-15:21:26] [TRT-LLM] [I] Starting benchmark...
[06/13/2025-15:21:26] [TRT-LLM] [I] Request submission complete. [count=2, time=0.0000s, rate=186081.13 req/s]
[06/13/2025-15:21:28] [TRT-LLM] [I] Benchmark complete.
[06/13/2025-15:21:28] [TRT-LLM] [I] Stopping LLM backend.
[06/13/2025-15:21:28] [TRT-LLM] [I] Cancelling all 0 tasks to complete.
[06/13/2025-15:21:28] [TRT-LLM] [I] All tasks cancelled.
[06/13/2025-15:21:28] [TRT-LLM] [I] LLM Backend stopped.
[06/13/2025-15:21:28] [TRT-LLM] [I] Worker task cancelled.
[06/13/2025-15:21:28] [TRT-LLM] [I] Warmup done.
[06/13/2025-15:21:28] [TRT-LLM] [I] No log path provided, skipping logging.
[06/13/2025-15:21:28] [TRT-LLM] [I] Starting benchmarking async task.
[06/13/2025-15:21:28] [TRT-LLM] [I] Starting benchmark...
[06/13/2025-15:21:28] [TRT-LLM] [I] Request submission complete. [count=1000, time=0.0005s, rate=2009634.19 req/s]
[06/13/2025-15:21:37] [TRT-LLM] [I] Benchmark complete.
[06/13/2025-15:21:37] [TRT-LLM] [I] Stopping LLM backend.
[06/13/2025-15:21:37] [TRT-LLM] [I] Cancelling all 0 tasks to complete.
[06/13/2025-15:21:37] [TRT-LLM] [I] All tasks cancelled.
[06/13/2025-15:21:37] [TRT-LLM] [I] LLM Backend stopped.
[06/13/2025-15:21:37] [TRT-LLM] [I] Worker task cancelled.
[06/13/2025-15:21:37] [TRT-LLM] [I] Benchmark done. Reporting results...
[06/13/2025-15:21:37] [TRT-LLM] [I] Validating KV Cache config against kv_cache_dtype="auto"
[06/13/2025-15:21:37] [TRT-LLM] [I] KV cache quantization set to "auto". Using checkpoint KV quantization.
[06/13/2025-15:21:37] [TRT-LLM] [I]

===========================================================
= PYTORCH BACKEND
===========================================================
Model:                  meta-llama/Llama-3.1-8B
Model Path:             None
TensorRT-LLM Version:   0.21.0rc1
Dtype:                  bfloat16
KV Cache Dtype:         None
Quantization:           None

===========================================================
= REQUEST DETAILS
===========================================================
Number of requests:             1000
Number of concurrent requests:  952.1662
Average Input Length (tokens):  128.0000
Average Output Length (tokens): 128.0000
===========================================================
= WORLD + RUNTIME INFORMATION
===========================================================
TP Size:                1
PP Size:                1
EP Size:                None
Max Runtime Batch Size: 2048
Max Runtime Tokens:     4096
Scheduling Policy:      GUARANTEED_NO_EVICT
KV Memory Percentage:   90.00%
Issue Rate (req/sec):   4.5100E+13

===========================================================
= PERFORMANCE OVERVIEW
===========================================================
Request Throughput (req/sec):                     116.1554
Total Output Throughput (tokens/sec):             14867.8898
Total Token Throughput (tokens/sec):              29735.7797
Total Latency (ms):                               8609.1571
Average request latency (ms):                     8197.3482
Per User Output Throughput [w/ ctx] (tps/user):   15.6332
Per GPU Output Throughput (tps/gpu):              14867.8898

-- Request Latency Breakdown (ms) -----------------------

[Latency] P50    : 8253.4719
[Latency] P90    : 8530.1304
[Latency] P95    : 8547.8302
[Latency] P99    : 8557.0758
[Latency] MINIMUM: 7631.9599
[Latency] MAXIMUM: 8564.2734
[Latency] AVERAGE: 8197.3482

===========================================================
= DATASET DETAILS
===========================================================
Dataset Path:         /tmp/synthetic_128_128.txt
Number of Sequences:  1000

-- Percentiles statistics ---------------------------------

        Input              Output           Seq. Length
-----------------------------------------------------------
MIN:   128.0000           128.0000           256.0000
MAX:   128.0000           128.0000           256.0000
AVG:   128.0000           128.0000           256.0000
P50:   128.0000           128.0000           256.0000
P90:   128.0000           128.0000           256.0000
P95:   128.0000           128.0000           256.0000
P99:   128.0000           128.0000           256.0000
===========================================================

[06/13/2025-15:21:37] [TRT-LLM] [I] Thread proxy_dispatch_result_thread stopped.
[06/13/2025-15:21:37] [TRT-LLM] [I] Thread proxy_dispatch_kv_cache_events_thread stopped.
[06/13/2025-15:21:37] [TRT-LLM] [I] Thread proxy_dispatch_stats_thread stopped.
[06/13/2025-15:21:37] [TRT-LLM] [I] Thread await_response_thread stopped.
[06/13/2025-15:21:38] [TRT-LLM] [I] Thread dispatch_stats_thread stopped.
[06/13/2025-15:21:38] [TRT-LLM] [I] Thread dispatch_kv_cache_events_thread stopped.
[06/13/2025-15:21:38] [TRT-LLM AUTO-DEPLOY] [RANK 0] [I] Destroying process group
root@4316cdb72fd3:/app/tensorrt_llm/TensorRT-LLM#
No results found