tests/layers/vllm/test_unquantized.py::test_fused_moe[False-silu-False-2-8-128-1024-8-1-True] FAILED
=================================== FAILURES ===================================
____________ test_fused_moe[False-silu-False-2-8-128-1024-8-1-True] ____________

use_ep = True, num_devices = 1, num_tokens = 8, intermediate_size = 1024
hidden_size = 128, num_experts = 8, topk = 2, has_bias = False
activation = 'silu', enable_attn_dp = False
    @pytest.mark.parametrize("use_ep", [True, False])
    @pytest.mark.parametrize("num_devices", [1, jax.local_device_count()])
    @pytest.mark.parametrize("num_tokens", [8])
    @pytest.mark.parametrize("intermediate_size", [1024, 2048])
    @pytest.mark.parametrize("hidden_size", [128, 512])
    @pytest.mark.parametrize("num_experts", [8])
    @pytest.mark.parametrize("topk", [2])
    @pytest.mark.parametrize("has_bias", [False, True])
    @pytest.mark.parametrize("activation", ["silu", "swigluoai"])
    @pytest.mark.parametrize("enable_attn_dp", [False, True])
    def test_fused_moe(use_ep, num_devices, num_tokens, intermediate_size,
                       hidden_size, num_experts, topk, has_bias, activation,
                       enable_attn_dp):
        # Skip if enable_attn_dp is True but we don't have enough devices
        if enable_attn_dp and num_devices < 2:
            pytest.skip("enable_attn_dp requires at least 2 devices")
        mesh = test_utils.get_spmd_mesh(num_devices, enable_attn_dp)
        torch.manual_seed(42)
        dtype = torch.bfloat16
        a = torch.randn((num_tokens, hidden_size), dtype=dtype) / 10
        w1 = torch.randn(
            (num_experts, 2 * intermediate_size, hidden_size), dtype=dtype) / 10
        w2 = torch.randn(
            (num_experts, hidden_size, intermediate_size), dtype=dtype) / 10
        score = torch.randn((num_tokens, num_experts), dtype=dtype)
        w1_bias = w2_bias = None
        if has_bias:
            w1_bias = torch.randn(
                (num_experts, 2 * intermediate_size), dtype=dtype) / 10
            w2_bias = torch.randn((num_experts, hidden_size), dtype=dtype) / 10
        engine_args = EngineArgs(
            model="Qwen/Qwen2-1.5B-Instruct",
            max_model_len=64,
            max_num_batched_tokens=64,
            max_num_seqs=4,
        )
        vllm_config = engine_args.create_engine_config()
        vllm_config.model_config.dtype = dtype
        vllm_config.parallel_config = ParallelConfig(
            tensor_parallel_size=mesh.devices.size, enable_expert_parallel=use_ep)
        quant_config = get_tpu_quantization_config(vllm_config, mesh)
        with set_current_vllm_config(vllm_config):
>           vllm_fused_moe = FusedMoE(
                num_experts=num_experts,
                top_k=topk,
                hidden_size=hidden_size,
                intermediate_size=intermediate_size,
                reduce_results=False,
                renormalize=False,
                tp_size=1,
                dp_size=1,
                quant_config=quant_config,
                has_bias=has_bias,
                activation=activation,
            )
tests/layers/vllm/test_unquantized.py:486:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
../vllm/vllm/model_executor/layers/fused_moe/layer.py:620: in __init__
    self.quant_method: FusedMoEMethodBase = _get_quant_method()
                                            ^^^^^^^^^^^^^^^^^^^
../vllm/vllm/model_executor/layers/fused_moe/layer.py:612: in _get_quant_method
    quant_method = self.quant_config.get_quant_method(self, prefix)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
tpu_inference/layers/vllm/quantization/unquantized.py:87: in get_quant_method
    return VllmUnquantizedFusedMoEMethod(moe_config, self.mesh)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
tpu_inference/layers/vllm/quantization/unquantized.py:209: in __init__
    super().__init__(moe)
../vllm/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py:55: in __init__
    self.unquantized_backend = select_unquantized_moe_backend(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
use_ep = True, use_dp = False

    def select_unquantized_moe_backend(
        use_ep: bool,
        use_dp: bool,
    ) -> UnquantizedMoeBackend:
        """
        Select the primary FP8 MoE backend
        Note: Shape-specific fallbacks may still occur at runtime.
        """
        def _make_log_backend(backend: UnquantizedMoeBackend):
            return f"Using {backend.value} backend for Unquantized MoE"
        rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
        # FlashInfer CUTLASS MoE is only supported on Hopper and later GPUS
        flashinfer_cutlass_moe_enabled = (
            has_flashinfer_cutlass_fused_moe()
            and envs.VLLM_USE_FLASHINFER_MOE_FP16
            and use_ep
            and (not use_dp)
            and current_platform.get_device_capability()[0] >= 9
        )
        if current_platform.is_rocm():
            if rocm_aiter_moe_enabled:
                backend = UnquantizedMoeBackend.AITER
            else:
                backend = UnquantizedMoeBackend.TRITON
        if current_platform.is_cuda():
            if flashinfer_cutlass_moe_enabled:
                backend = UnquantizedMoeBackend.FLASHINFER_CUTLASS
            else:
                if use_ep and (not use_dp):
                    logger.info_once(
                        "FlashInfer CUTLASS MoE is available for EP"
                        " but not enabled, consider setting"
                        " VLLM_USE_FLASHINFER_MOE_FP16=1 to enable it.",
                        scope="local",
                    )
                elif use_dp:
                    logger.info_once(
                        "FlashInfer CUTLASS MoE is currently not available for DP.",
                        scope="local",
                    )
                backend = UnquantizedMoeBackend.TRITON
        if current_platform.is_xpu():
            backend = UnquantizedMoeBackend.XPU
        if current_platform.is_cpu():
            backend = UnquantizedMoeBackend.CPU
>       logger.info_once(_make_log_backend(backend), scope="local")
                                           ^^^^^^^
E       UnboundLocalError: cannot access local variable 'backend' where it is not associated with a value

../vllm/vllm/model_executor/layers/fused_moe/oracle/unquantized.py:95: UnboundLocalError
=============================== warnings summary ===============================
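
Root cause, for context: select_unquantized_moe_backend only assigns `backend` inside the `is_rocm()` / `is_cuda()` / `is_xpu()` / `is_cpu()` branches, so on a platform that matches none of them (the TPU path exercised by tpu_inference in this test) the final `logger.info_once(_make_log_backend(backend), ...)` reads an unbound local. The snippet below is a minimal, self-contained sketch of that pattern, not the real vllm code: the enum member names and control flow are taken from the traceback above, while the boolean platform flags, the `select_backend_sketch` name, and the enum string values are hypothetical stand-ins.

from enum import Enum


class UnquantizedMoeBackend(Enum):
    # Member names match the traceback; the string values are placeholders.
    AITER = "aiter"
    TRITON = "triton"
    FLASHINFER_CUTLASS = "flashinfer_cutlass"
    XPU = "xpu"
    CPU = "cpu"


def select_backend_sketch(is_rocm: bool, is_cuda: bool,
                          is_xpu: bool, is_cpu: bool) -> UnquantizedMoeBackend:
    # Mirrors the shape of select_unquantized_moe_backend: `backend` is only
    # bound inside platform-specific branches.
    if is_rocm:
        backend = UnquantizedMoeBackend.TRITON
    if is_cuda:
        backend = UnquantizedMoeBackend.FLASHINFER_CUTLASS
    if is_xpu:
        backend = UnquantizedMoeBackend.XPU
    if is_cpu:
        backend = UnquantizedMoeBackend.CPU
    # When every check above is False (the TPU case here), `backend` was never
    # assigned, so reading it raises UnboundLocalError, matching the failure.
    return backend


if __name__ == "__main__":
    try:
        select_backend_sketch(is_rocm=False, is_cuda=False,
                              is_xpu=False, is_cpu=False)
    except UnboundLocalError as exc:
        print(f"reproduced: {exc}")

A possible mitigation, offered only as an assumption rather than a confirmed upstream fix, would be to initialize `backend` before the platform checks (or add an explicit TPU/default branch) and raise a descriptive error when no backend applies, so unsupported platforms fail with a clear message instead of an UnboundLocalError.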