Skip to content

Instantly share code, notes, and snippets.

@vanbasten23
Created January 23, 2026 04:08
Show Gist options
  • Select an option

  • Save vanbasten23/03a9580d499fa3205ee30762e527b528 to your computer and use it in GitHub Desktop.

Select an option

Save vanbasten23/03a9580d499fa3205ee30762e527b528 to your computer and use it in GitHub Desktop.
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] Traceback (most recent call last):
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/v1/engine/core.py", line 926, in run_engine_core
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/v1/engine/core.py", line 691, in __init__
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] super().__init__(
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/v1/engine/core.py", line 105, in __init__
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self.model_executor = executor_class(vllm_config)
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/v1/executor/abstract.py", line 101, in __init__
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self._init_executor()
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/v1/executor/uniproc_executor.py", line 48, in _init_executor
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self.driver_worker.load_model()
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/tpu_inference/tpu_inference/worker/tpu_worker.py", line 357, in load_model
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self.model_runner.load_model()
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/tpu_inference/tpu_inference/runner/tpu_runner.py", line 498, in load_model
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self.model_fn, self.compute_logits_fn, self.combine_hidden_states_fn, multimodal_fns, self.state, self.lora_manager, self.model = get_model(
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/tpu_inference/tpu_inference/models/common/model_loader.py", line 394, in get_model
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] return get_vllm_model(vllm_config, rng, mesh)
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/tpu_inference/tpu_inference/models/common/model_loader.py", line 354, in get_vllm_model
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] params, lora_manager = model.load_weights()
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/tpu_inference/tpu_inference/models/vllm/vllm_model_wrapper.py", line 166, in load_weights
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] vllm_model = vllm_get_model(vllm_config=vllm_config_for_load)
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/model_loader/__init__.py", line 135, in get_model
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] return loader.load_model(
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/model_loader/base_loader.py", line 50, in load_model
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] model = initialize_model(
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/model_loader/utils.py", line 48, in initialize_model
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] return model_class(vllm_config=vllm_config, prefix=prefix)
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/models/qwen3_moe.py", line 657, in __init__
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self.model = Qwen3MoeModel(
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/compilation/decorators.py", line 306, in __init__
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] old_init(self, **kwargs)
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/models/qwen3_moe.py", line 410, in __init__
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self.start_layer, self.end_layer, self.layers = make_layers(
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/models/utils.py", line 707, in make_layers
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/models/qwen3_moe.py", line 412, in <lambda>
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] lambda prefix: Qwen3MoeDecoderLayer(vllm_config=vllm_config, prefix=prefix),
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/models/qwen3_moe.py", line 350, in __init__
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self.mlp = Qwen3MoeSparseMoeBlock(
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/models/qwen3_moe.py", line 162, in __init__
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self.experts = FusedMoE(
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/layers/fused_moe/layer.py", line 603, in __init__
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self.quant_method: FusedMoEMethodBase = _get_quant_method()
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/layers/fused_moe/layer.py", line 595, in _get_quant_method
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] quant_method = self.quant_config.get_quant_method(self, prefix)
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/tpu_inference/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py", line 131, in get_quant_method
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] return VllmCompressedTensorsMoEMethod.get_moe_method(
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/tpu_inference/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py", line 78, in get_moe_method
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] return VllmCompressedTensorsW8A8Fp8MoEMethod(
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/tpu_inference/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py", line 94, in __init__
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] super().__init__(weight_quant, input_quant, moe)
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py", line 755, in __init__
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] self.fp8_backend, self.experts_cls = select_fp8_moe_backend(
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] ^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] File "/workspace/vllm/vllm/model_executor/layers/fused_moe/oracle/fp8.py", line 333, in select_fp8_moe_backend
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] raise NotImplementedError(
(EngineCore_DP0 pid=467) ERROR 01-23 03:35:58 [core.py:935] NotImplementedError: No FP8 MoE backend supports the deployment configuration.
(EngineCore_DP0 pid=467) Process EngineCore_DP0:
(EngineCore_DP0 pid=467) Traceback (most recent call last):
(EngineCore_DP0 pid=467) File "/usr/local/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
(EngineCore_DP0 pid=467) self.run()
(EngineCore_DP0 pid=467) File "/usr/local/lib/python3.12/multiprocessing/process.py", line 108, in run
(EngineCore_DP0 pid=467) self._target(*self._args, **self._kwargs)
(EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/v1/engine/core.py", line 939, in run_engine_core
(EngineCore_DP0 pid=467) raise e
(EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/v1/engine/core.py", line 926, in run_engine_core
(EngineCore_DP0 pid=467) engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
(EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/v1/engine/core.py", line 691, in __init__
(EngineCore_DP0 pid=467) super().__init__(
(EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/v1/engine/core.py", line 105, in __init__
(EngineCore_DP0 pid=467) self.model_executor = executor_class(vllm_config)
(EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/v1/executor/abstract.py", line 101, in __init__
(EngineCore_DP0 pid=467) self._init_executor()
(EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/v1/executor/uniproc_executor.py", line 48, in _init_executor
(EngineCore_DP0 pid=467) self.driver_worker.load_model()
(EngineCore_DP0 pid=467) File "/workspace/tpu_inference/tpu_inference/worker/tpu_worker.py", line 357, in load_model
(EngineCore_DP0 pid=467) self.model_runner.load_model()
(EngineCore_DP0 pid=467) File "/workspace/tpu_inference/tpu_inference/runner/tpu_runner.py", line 498, in load_model
(EngineCore_DP0 pid=467) self.model_fn, self.compute_logits_fn, self.combine_hidden_states_fn, multimodal_fns, self.state, self.lora_manager, self.model = get_model(
(EngineCore_DP0 pid=467) ^^^^^^^^^^
(EngineCore_DP0 pid=467) File "/workspace/tpu_inference/tpu_inference/models/common/model_loader.py", line 394, in get_model
(EngineCore_DP0 pid=467) return get_vllm_model(vllm_config, rng, mesh)
(EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) File "/workspace/tpu_inference/tpu_inference/models/common/model_loader.py", line 354, in get_vllm_model
(EngineCore_DP0 pid=467) params, lora_manager = model.load_weights()
(EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) File "/workspace/tpu_inference/tpu_inference/models/vllm/vllm_model_wrapper.py", line 166, in load_weights
(EngineCore_DP0 pid=467) vllm_model = vllm_get_model(vllm_config=vllm_config_for_load)
(EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/model_loader/__init__.py", line 135, in get_model
(EngineCore_DP0 pid=467) return loader.load_model(
(EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/model_loader/base_loader.py", line 50, in load_model
(EngineCore_DP0 pid=467) model = initialize_model(
(EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/model_loader/utils.py", line 48, in initialize_model
(EngineCore_DP0 pid=467) return model_class(vllm_config=vllm_config, prefix=prefix)
(EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/models/qwen3_moe.py", line 657, in __init__
(EngineCore_DP0 pid=467) self.model = Qwen3MoeModel(
(EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/compilation/decorators.py", line 306, in __init__
(EngineCore_DP0 pid=467) old_init(self, **kwargs)
(EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/models/qwen3_moe.py", line 410, in __init__
(EngineCore_DP0 pid=467) self.start_layer, self.end_layer, self.layers = make_layers(
(EngineCore_DP0 pid=467) ^^^^^^^^^^^^
(EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/models/utils.py", line 707, in make_layers
(EngineCore_DP0 pid=467) maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
(EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/models/qwen3_moe.py", line 412, in <lambda>
(EngineCore_DP0 pid=467) lambda prefix: Qwen3MoeDecoderLayer(vllm_config=vllm_config, prefix=prefix),
(EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/models/qwen3_moe.py", line 350, in __init__
(EngineCore_DP0 pid=467) self.mlp = Qwen3MoeSparseMoeBlock(
(EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/models/qwen3_moe.py", line 162, in __init__
(EngineCore_DP0 pid=467) self.experts = FusedMoE(
(EngineCore_DP0 pid=467) ^^^^^^^^^
(EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/layers/fused_moe/layer.py", line 603, in __init__
(EngineCore_DP0 pid=467) self.quant_method: FusedMoEMethodBase = _get_quant_method()
(EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/layers/fused_moe/layer.py", line 595, in _get_quant_method
(EngineCore_DP0 pid=467) quant_method = self.quant_config.get_quant_method(self, prefix)
(EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) File "/workspace/tpu_inference/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py", line 131, in get_quant_method
(EngineCore_DP0 pid=467) return VllmCompressedTensorsMoEMethod.get_moe_method(
(EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) File "/workspace/tpu_inference/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py", line 78, in get_moe_method
(EngineCore_DP0 pid=467) return VllmCompressedTensorsW8A8Fp8MoEMethod(
(EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) File "/workspace/tpu_inference/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py", line 94, in __init__
(EngineCore_DP0 pid=467) super().__init__(weight_quant, input_quant, moe)
(EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py", line 755, in __init__
(EngineCore_DP0 pid=467) self.fp8_backend, self.experts_cls = select_fp8_moe_backend(
(EngineCore_DP0 pid=467) ^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=467) File "/workspace/vllm/vllm/model_executor/layers/fused_moe/oracle/fp8.py", line 333, in select_fp8_moe_backend
(EngineCore_DP0 pid=467) raise NotImplementedError(
(EngineCore_DP0 pid=467) NotImplementedError: No FP8 MoE backend supports the deployment configuration.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment