@vanbasten23
Created January 16, 2026 00:34
tests/layers/vllm/test_unquantized.py::test_fused_moe[False-silu-False-2-8-128-1024-8-1-True] FAILED
=================================== FAILURES ===================================
____________ test_fused_moe[False-silu-False-2-8-128-1024-8-1-True] ____________
use_ep = True, num_devices = 1, num_tokens = 8, intermediate_size = 1024
hidden_size = 128, num_experts = 8, topk = 2, has_bias = False
activation = 'silu', enable_attn_dp = False
@pytest.mark.parametrize("use_ep", [True, False])
@pytest.mark.parametrize("num_devices", [1, jax.local_device_count()])
@pytest.mark.parametrize("num_tokens", [8])
@pytest.mark.parametrize("intermediate_size", [1024, 2048])
@pytest.mark.parametrize("hidden_size", [128, 512])
@pytest.mark.parametrize("num_experts", [8])
@pytest.mark.parametrize("topk", [2])
@pytest.mark.parametrize("has_bias", [False, True])
@pytest.mark.parametrize("activation", ["silu", "swigluoai"])
@pytest.mark.parametrize("enable_attn_dp", [False, True])
def test_fused_moe(use_ep, num_devices, num_tokens, intermediate_size,
                   hidden_size, num_experts, topk, has_bias, activation,
                   enable_attn_dp):
    # Skip if enable_attn_dp is True but we don't have enough devices
    if enable_attn_dp and num_devices < 2:
        pytest.skip("enable_attn_dp requires at least 2 devices")
    mesh = test_utils.get_spmd_mesh(num_devices, enable_attn_dp)
    torch.manual_seed(42)
    dtype = torch.bfloat16
    a = torch.randn((num_tokens, hidden_size), dtype=dtype) / 10
    w1 = torch.randn(
        (num_experts, 2 * intermediate_size, hidden_size), dtype=dtype) / 10
    w2 = torch.randn(
        (num_experts, hidden_size, intermediate_size), dtype=dtype) / 10
    score = torch.randn((num_tokens, num_experts), dtype=dtype)
    w1_bias = w2_bias = None
    if has_bias:
        w1_bias = torch.randn(
            (num_experts, 2 * intermediate_size), dtype=dtype) / 10
        w2_bias = torch.randn((num_experts, hidden_size), dtype=dtype) / 10
    engine_args = EngineArgs(
        model="Qwen/Qwen2-1.5B-Instruct",
        max_model_len=64,
        max_num_batched_tokens=64,
        max_num_seqs=4,
    )
    vllm_config = engine_args.create_engine_config()
    vllm_config.model_config.dtype = dtype
    vllm_config.parallel_config = ParallelConfig(
        tensor_parallel_size=mesh.devices.size, enable_expert_parallel=use_ep)
    quant_config = get_tpu_quantization_config(vllm_config, mesh)
    with set_current_vllm_config(vllm_config):
>       vllm_fused_moe = FusedMoE(
            num_experts=num_experts,
            top_k=topk,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            reduce_results=False,
            renormalize=False,
            tp_size=1,
            dp_size=1,
            quant_config=quant_config,
            has_bias=has_bias,
            activation=activation,
        )
tests/layers/vllm/test_unquantized.py:486:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
../vllm/vllm/model_executor/layers/fused_moe/layer.py:620: in __init__
    self.quant_method: FusedMoEMethodBase = _get_quant_method()
                                            ^^^^^^^^^^^^^^^^^^^
../vllm/vllm/model_executor/layers/fused_moe/layer.py:612: in _get_quant_method
    quant_method = self.quant_config.get_quant_method(self, prefix)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
tpu_inference/layers/vllm/quantization/unquantized.py:87: in get_quant_method
    return VllmUnquantizedFusedMoEMethod(moe_config, self.mesh)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
tpu_inference/layers/vllm/quantization/unquantized.py:209: in __init__
    super().__init__(moe)
../vllm/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py:55: in __init__
    self.unquantized_backend = select_unquantized_moe_backend(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
use_ep = True, use_dp = False
def select_unquantized_moe_backend(
    use_ep: bool,
    use_dp: bool,
) -> UnquantizedMoeBackend:
    """
    Select the primary FP8 MoE backend
    Note: Shape-specific fallbacks may still occur at runtime.
    """
    def _make_log_backend(backend: UnquantizedMoeBackend):
        return f"Using {backend.value} backend for Unquantized MoE"
    rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
    # FlashInfer CUTLASS MoE is only supported on Hopper and later GPUS
    flashinfer_cutlass_moe_enabled = (
        has_flashinfer_cutlass_fused_moe()
        and envs.VLLM_USE_FLASHINFER_MOE_FP16
        and use_ep
        and (not use_dp)
        and current_platform.get_device_capability()[0] >= 9
    )
    if current_platform.is_rocm():
        if rocm_aiter_moe_enabled:
            backend = UnquantizedMoeBackend.AITER
        else:
            backend = UnquantizedMoeBackend.TRITON
    if current_platform.is_cuda():
        if flashinfer_cutlass_moe_enabled:
            backend = UnquantizedMoeBackend.FLASHINFER_CUTLASS
        else:
            if use_ep and (not use_dp):
                logger.info_once(
                    "FlashInfer CUTLASS MoE is available for EP"
                    " but not enabled, consider setting"
                    " VLLM_USE_FLASHINFER_MOE_FP16=1 to enable it.",
                    scope="local",
                )
            elif use_dp:
                logger.info_once(
                    "FlashInfer CUTLASS MoE is currently not available for DP.",
                    scope="local",
                )
            backend = UnquantizedMoeBackend.TRITON
    if current_platform.is_xpu():
        backend = UnquantizedMoeBackend.XPU
    if current_platform.is_cpu():
        backend = UnquantizedMoeBackend.CPU
>   logger.info_once(_make_log_backend(backend), scope="local")
                                       ^^^^^^^
E   UnboundLocalError: cannot access local variable 'backend' where it is not associated with a value
../vllm/vllm/model_executor/layers/fused_moe/oracle/unquantized.py:95: UnboundLocalError
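
Root cause, reading the traceback above: on TPU none of current_platform.is_rocm(), is_cuda(), is_xpu(), or is_cpu() is true, so no branch of select_unquantized_moe_backend ever assigns backend, and the trailing logger.info_once(_make_log_backend(backend), scope="local") call raises UnboundLocalError. Below is a minimal sketch of that fall-through pattern and one way to guard it; the names (UnquantizedMoeBackendSketch, select_backend_sketch) and the NotImplementedError handling are illustrative assumptions, not the actual vLLM or tpu_inference fix.

from enum import Enum


class UnquantizedMoeBackendSketch(Enum):
    # Members mirror names visible in the traceback; the set is illustrative.
    TRITON = "triton"
    XPU = "xpu"
    CPU = "cpu"


def select_backend_sketch(platform: str) -> UnquantizedMoeBackendSketch:
    # Explicit default instead of relying on every platform branch to assign
    # `backend`; the original selector has no branch that matches TPU.
    backend = None
    if platform in ("rocm", "cuda"):
        backend = UnquantizedMoeBackendSketch.TRITON
    elif platform == "xpu":
        backend = UnquantizedMoeBackendSketch.XPU
    elif platform == "cpu":
        backend = UnquantizedMoeBackendSketch.CPU
    if backend is None:
        # e.g. platform == "tpu": fail with a clear message instead of letting
        # a later use of `backend` raise UnboundLocalError.
        raise NotImplementedError(
            f"No unquantized MoE backend registered for platform {platform!r}")
    return backend

Either the upstream selector needs a default or TPU-aware branch along these lines, or VllmUnquantizedFusedMoEMethod.__init__ in tpu_inference needs to avoid calling into the upstream backend selection; both are inferences from the traceback, not confirmed fixes.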
=============================== warnings summary ===============================