Created
January 23, 2026 04:08
-
-
Save vanbasten23/03a9580d499fa3205ee30762e527b528 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] Traceback (most recent call last): | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/v1/engine/core.py", line 926, in run_engine_core | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/v1/engine/core.py", line 691, in __init__ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] super().__init__( | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/v1/engine/core.py", line 105, in __init__ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self.model_executor = executor_class(vllm_config) | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/v1/executor/abstract.py", line 101, in __init__ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self._init_executor() | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/v1/executor/uniproc_executor.py", line 48, in _init_executor | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self.driver_worker.load_model() | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/tpu_inference/tpu_inference/worker/tpu_worker.py", line 357, in load_model | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self.model_runner.load_model() | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/tpu_inference/tpu_inference/runner/tpu_runner.py", line 498, in load_model | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self.model_fn, self.compute_logits_fn, self.combine_hidden_states_fn, multimodal_fns, self.state, self.lora_manager, self.model = get_model( | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/tpu_inference/tpu_inference/models/common/model_loader.py", line 394, in get_model | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] return get_vllm_model(vllm_config, rng, mesh) | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/tpu_inference/tpu_inference/models/common/model_loader.py", line 354, in get_vllm_model | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] params, lora_manager = model.load_weights() | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/tpu_inference/tpu_inference/models/vllm/vllm_model_wrapper.py", line 166, in load_weights | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] vllm_model = vllm_get_model(vllm_config=vllm_config_for_load) | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/model_loader/__init__.py", line 135, in get_model | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] return loader.load_model( | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/model_loader/base_loader.py", line 50, in load_model | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] model = initialize_model( | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/model_loader/utils.py", line 48, in initialize_model | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] return model_class(vllm_config=vllm_config, prefix=prefix) | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/models/qwen3_moe.py", line 657, in __init__ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self.model = Qwen3MoeModel( | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/compilation/decorators.py", line 306, in __init__ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] old_init(self, **kwargs) | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/models/qwen3_moe.py", line 410, in __init__ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self.start_layer, self.end_layer, self.layers = make_layers( | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/models/utils.py", line 707, in make_layers | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}")) | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/models/qwen3_moe.py", line 412, in <lambda> | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] lambda prefix: Qwen3MoeDecoderLayer(vllm_config=vllm_config, prefix=prefix), | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/models/qwen3_moe.py", line 350, in __init__ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self.mlp = Qwen3MoeSparseMoeBlock( | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/models/qwen3_moe.py", line 162, in __init__ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self.experts = FusedMoE( | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/layers/fused_moe/layer.py", line 603, in __init__ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self.quant_method: FusedMoEMethodBase = _get_quant_method() | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/layers/fused_moe/layer.py", line 595, in _get_quant_method | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] quant_method = self.quant_config.get_quant_method(self, prefix) | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/tpu_inference/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py", line 131, in get_quant_method | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] return VllmCompressedTensorsMoEMethod.get_moe_method( | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/tpu_inference/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py", line 78, in get_moe_method | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] return VllmCompressedTensorsW8A8Fp8MoEMethod( | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/tpu_inference/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py", line 94, in __init__ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] super().__init__(weight_quant, input_quant, moe) | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py", line 755, in __init__ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self.fp8_backend, self.experts_cls = select_fp8_moe_backend( | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/layers/fused_moe/oracle/fp8.py", line 333, in select_fp8_moe_backend | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] raise NotImplementedError( | |
| (EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] NotImplementedError: No FP8 MoE backend supports the deployment configuration. | |
| (EngineCore_DP0 pid=467) Process EngineCore_DP0: | |
| (EngineCore_DP0 pid=467) Traceback (most recent call last): | |
| (EngineCore_DP0 pid=467) File "/usr/local/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap | |
| (EngineCore_DP0 pid=467) self.run() | |
| (EngineCore_DP0 pid=467) File "/usr/local/lib/python3.12/multiprocessing/process.py", line 108, in run | |
| (EngineCore_DP0 pid=467) self._target(*self._args, **self._kwargs) | |
| (EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/v1/engine/core.py", line 939, in run_engine_core | |
| (EngineCore_DP0 pid=467) raise e | |
| (EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/v1/engine/core.py", line 926, in run_engine_core | |
| (EngineCore_DP0 pid=467) engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) | |
| (EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/v1/engine/core.py", line 691, in __init__ | |
| (EngineCore_DP0 pid=467) super().__init__( | |
| (EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/v1/engine/core.py", line 105, in __init__ | |
| (EngineCore_DP0 pid=467) self.model_executor = executor_class(vllm_config) | |
| (EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/v1/executor/abstract.py", line 101, in __init__ | |
| (EngineCore_DP0 pid=467) self._init_executor() | |
| (EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/v1/executor/uniproc_executor.py", line 48, in _init_executor | |
| (EngineCore_DP0 pid=467) self.driver_worker.load_model() | |
| (EngineCore_DP0 pid=467) File "/workspace/tpu_inference/tpu_inference/worker/tpu_worker.py", line 357, in load_model | |
| (EngineCore_DP0 pid=467) self.model_runner.load_model() | |
| (EngineCore_DP0 pid=467) File "/workspace/tpu_inference/tpu_inference/runner/tpu_runner.py", line 498, in load_model | |
| (EngineCore_DP0 pid=467) self.model_fn, self.compute_logits_fn, self.combine_hidden_states_fn, multimodal_fns, self.state, self.lora_manager, self.model = get_model( | |
| (EngineCore_DP0 pid=467) ^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) File "/workspace/tpu_inference/tpu_inference/models/common/model_loader.py", line 394, in get_model | |
| (EngineCore_DP0 pid=467) return get_vllm_model(vllm_config, rng, mesh) | |
| (EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) File "/workspace/tpu_inference/tpu_inference/models/common/model_loader.py", line 354, in get_vllm_model | |
| (EngineCore_DP0 pid=467) params, lora_manager = model.load_weights() | |
| (EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) File "/workspace/tpu_inference/tpu_inference/models/vllm/vllm_model_wrapper.py", line 166, in load_weights | |
| (EngineCore_DP0 pid=467) vllm_model = vllm_get_model(vllm_config=vllm_config_for_load) | |
| (EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/model_loader/__init__.py", line 135, in get_model | |
| (EngineCore_DP0 pid=467) return loader.load_model( | |
| (EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/model_loader/base_loader.py", line 50, in load_model | |
| (EngineCore_DP0 pid=467) model = initialize_model( | |
| (EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/model_loader/utils.py", line 48, in initialize_model | |
| (EngineCore_DP0 pid=467) return model_class(vllm_config=vllm_config, prefix=prefix) | |
| (EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/models/qwen3_moe.py", line 657, in __init__ | |
| (EngineCore_DP0 pid=467) self.model = Qwen3MoeModel( | |
| (EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/compilation/decorators.py", line 306, in __init__ | |
| (EngineCore_DP0 pid=467) old_init(self, **kwargs) | |
| (EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/models/qwen3_moe.py", line 410, in __init__ | |
| (EngineCore_DP0 pid=467) self.start_layer, self.end_layer, self.layers = make_layers( | |
| (EngineCore_DP0 pid=467) ^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/models/utils.py", line 707, in make_layers | |
| (EngineCore_DP0 pid=467) maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}")) | |
| (EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/models/qwen3_moe.py", line 412, in <lambda> | |
| (EngineCore_DP0 pid=467) lambda prefix: Qwen3MoeDecoderLayer(vllm_config=vllm_config, prefix=prefix), | |
| (EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/models/qwen3_moe.py", line 350, in __init__ | |
| (EngineCore_DP0 pid=467) self.mlp = Qwen3MoeSparseMoeBlock( | |
| (EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/models/qwen3_moe.py", line 162, in __init__ | |
| (EngineCore_DP0 pid=467) self.experts = FusedMoE( | |
| (EngineCore_DP0 pid=467) ^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/layers/fused_moe/layer.py", line 603, in __init__ | |
| (EngineCore_DP0 pid=467) self.quant_method: FusedMoEMethodBase = _get_quant_method() | |
| (EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/layers/fused_moe/layer.py", line 595, in _get_quant_method | |
| (EngineCore_DP0 pid=467) quant_method = self.quant_config.get_quant_method(self, prefix) | |
| (EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) File "/workspace/tpu_inference/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py", line 131, in get_quant_method | |
| (EngineCore_DP0 pid=467) return VllmCompressedTensorsMoEMethod.get_moe_method( | |
| (EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) File "/workspace/tpu_inference/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py", line 78, in get_moe_method | |
| (EngineCore_DP0 pid=467) return VllmCompressedTensorsW8A8Fp8MoEMethod( | |
| (EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) File "/workspace/tpu_inference/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py", line 94, in __init__ | |
| (EngineCore_DP0 pid=467) super().__init__(weight_quant, input_quant, moe) | |
| (EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py", line 755, in __init__ | |
| (EngineCore_DP0 pid=467) self.fp8_backend, self.experts_cls = select_fp8_moe_backend( | |
| (EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/layers/fused_moe/oracle/fp8.py", line 333, in select_fp8_moe_backend | |
| (EngineCore_DP0 pid=467) raise NotImplementedError( | |
| (EngineCore_DP0 pid=467) NotImplementedError: No FP8 MoE backend supports the deployment configuration. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment