@vanbasten23
Created January 16, 2026 00:34
tests/layers/vllm/test_unquantized.py::test_fused_moe[False-silu-False-2-8-128-1024-8-1-True] FAILED
=================================== FAILURES ===================================
____________ test_fused_moe[False-silu-False-2-8-128-1024-8-1-True] ____________
use_ep = True, num_devices = 1, num_tokens = 8, intermediate_size = 1024
hidden_size = 128, num_experts = 8, topk = 2, has_bias = False
activation = 'silu', enable_attn_dp = False
@pytest.mark.parametrize("use_ep", [True, False])
@pytest.mark.parametrize("num_devices", [1, jax.local_device_count()])
@pytest.mark.parametrize("num_tokens", [8])
@pytest.mark.parametrize("intermediate_size", [1024, 2048])
@pytest.mark.parametrize("hidden_size", [128, 512])
@pytest.mark.parametrize("num_experts", [8])
@pytest.mark.parametrize("topk", [2])
@pytest.mark.parametrize("has_bias", [False, True])
@pytest.mark.parametrize("activation", ["silu", "swigluoai"])
@pytest.mark.parametrize("enable_attn_dp", [False, True])
def test_fused_moe(use_ep, num_devices, num_tokens, intermediate_size,
                   hidden_size, num_experts, topk, has_bias, activation,
                   enable_attn_dp):
    # Skip if enable_attn_dp is True but we don't have enough devices
    if enable_attn_dp and num_devices < 2:
        pytest.skip("enable_attn_dp requires at least 2 devices")
    mesh = test_utils.get_spmd_mesh(num_devices, enable_attn_dp)
    torch.manual_seed(42)
    dtype = torch.bfloat16
    a = torch.randn((num_tokens, hidden_size), dtype=dtype) / 10
    w1 = torch.randn(
        (num_experts, 2 * intermediate_size, hidden_size), dtype=dtype) / 10
    w2 = torch.randn(
        (num_experts, hidden_size, intermediate_size), dtype=dtype) / 10
    score = torch.randn((num_tokens, num_experts), dtype=dtype)
    w1_bias = w2_bias = None
    if has_bias:
        w1_bias = torch.randn(
            (num_experts, 2 * intermediate_size), dtype=dtype) / 10
        w2_bias = torch.randn((num_experts, hidden_size), dtype=dtype) / 10
    engine_args = EngineArgs(
        model="Qwen/Qwen2-1.5B-Instruct",
        max_model_len=64,
        max_num_batched_tokens=64,
        max_num_seqs=4,
    )
    vllm_config = engine_args.create_engine_config()
    vllm_config.model_config.dtype = dtype
    vllm_config.parallel_config = ParallelConfig(
        tensor_parallel_size=mesh.devices.size, enable_expert_parallel=use_ep)
    quant_config = get_tpu_quantization_config(vllm_config, mesh)
    with set_current_vllm_config(vllm_config):
>       vllm_fused_moe = FusedMoE(
            num_experts=num_experts,
            top_k=topk,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            reduce_results=False,
            renormalize=False,
            tp_size=1,
            dp_size=1,
            quant_config=quant_config,
            has_bias=has_bias,
            activation=activation,
        )
tests/layers/vllm/test_unquantized.py:486:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
../vllm/vllm/model_executor/layers/fused_moe/layer.py:620: in __init__
    self.quant_method: FusedMoEMethodBase = _get_quant_method()
                                            ^^^^^^^^^^^^^^^^^^^
../vllm/vllm/model_executor/layers/fused_moe/layer.py:612: in _get_quant_method
    quant_method = self.quant_config.get_quant_method(self, prefix)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
tpu_inference/layers/vllm/quantization/unquantized.py:87: in get_quant_method
    return VllmUnquantizedFusedMoEMethod(moe_config, self.mesh)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
tpu_inference/layers/vllm/quantization/unquantized.py:209: in __init__
    super().__init__(moe)
../vllm/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py:55: in __init__
    self.unquantized_backend = select_unquantized_moe_backend(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
use_ep = True, use_dp = False
def select_unquantized_moe_backend(
    use_ep: bool,
    use_dp: bool,
) -> UnquantizedMoeBackend:
    """
    Select the primary FP8 MoE backend
    Note: Shape-specific fallbacks may still occur at runtime.
    """
    def _make_log_backend(backend: UnquantizedMoeBackend):
        return f"Using {backend.value} backend for Unquantized MoE"
    rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
    # FlashInfer CUTLASS MoE is only supported on Hopper and later GPUS
    flashinfer_cutlass_moe_enabled = (
        has_flashinfer_cutlass_fused_moe()
        and envs.VLLM_USE_FLASHINFER_MOE_FP16
        and use_ep
        and (not use_dp)
        and current_platform.get_device_capability()[0] >= 9
    )
    if current_platform.is_rocm():
        if rocm_aiter_moe_enabled:
            backend = UnquantizedMoeBackend.AITER
        else:
            backend = UnquantizedMoeBackend.TRITON
    if current_platform.is_cuda():
        if flashinfer_cutlass_moe_enabled:
            backend = UnquantizedMoeBackend.FLASHINFER_CUTLASS
        else:
            if use_ep and (not use_dp):
                logger.info_once(
                    "FlashInfer CUTLASS MoE is available for EP"
                    " but not enabled, consider setting"
                    " VLLM_USE_FLASHINFER_MOE_FP16=1 to enable it.",
                    scope="local",
                )
            elif use_dp:
                logger.info_once(
                    "FlashInfer CUTLASS MoE is currently not available for DP.",
                    scope="local",
                )
            backend = UnquantizedMoeBackend.TRITON
    if current_platform.is_xpu():
        backend = UnquantizedMoeBackend.XPU
    if current_platform.is_cpu():
        backend = UnquantizedMoeBackend.CPU
>   logger.info_once(_make_log_backend(backend), scope="local")
                                       ^^^^^^^
E   UnboundLocalError: cannot access local variable 'backend' where it is not associated with a value
../vllm/vllm/model_executor/layers/fused_moe/oracle/unquantized.py:95: UnboundLocalError
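
Root cause, reading the traceback above: on TPU none of current_platform.is_rocm(), is_cuda(), is_xpu(), or is_cpu() is true, so no branch of select_unquantized_moe_backend ever assigns backend, and the trailing logger.info_once(_make_log_backend(backend), scope="local") call raises UnboundLocalError. Below is a minimal sketch of that fall-through pattern and one way to guard it; the names (UnquantizedMoeBackendSketch, select_backend_sketch) and the NotImplementedError handling are illustrative assumptions, not the actual vLLM or tpu_inference fix.

from enum import Enum


class UnquantizedMoeBackendSketch(Enum):
    # Members mirror names visible in the traceback; the set is illustrative.
    TRITON = "triton"
    XPU = "xpu"
    CPU = "cpu"


def select_backend_sketch(platform: str) -> UnquantizedMoeBackendSketch:
    # Explicit default instead of relying on every platform branch to assign
    # `backend`; the original selector has no branch that matches TPU.
    backend = None
    if platform in ("rocm", "cuda"):
        backend = UnquantizedMoeBackendSketch.TRITON
    elif platform == "xpu":
        backend = UnquantizedMoeBackendSketch.XPU
    elif platform == "cpu":
        backend = UnquantizedMoeBackendSketch.CPU
    if backend is None:
        # e.g. platform == "tpu": fail with a clear message instead of letting
        # a later use of `backend` raise UnboundLocalError.
        raise NotImplementedError(
            f"No unquantized MoE backend registered for platform {platform!r}")
    return backend

Either the upstream selector needs a default or TPU-aware branch along these lines, or VllmUnquantizedFusedMoEMethod.__init__ in tpu_inference needs to avoid calling into the upstream backend selection; both are inferences from the traceback, not confirmed fixes.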
=============================== warnings summary ===============================