- skip sdpa
{
"mode": "QUANTIZE",
"observer": "maxabs",
"scale_method": "ACT_MAXABS_HW_WEIGHTS_PCS_MAXABS_POW2",
"scale_format": "const",
"allowlist": {
"types": [],
"names": []| model_path="/dataset/auto-round/qwen_moe/" | |
| taskname=gsm8k | |
| taskname=longbench_hotpotqa | |
| timestamp=$(date +%Y%m%d_%H%M%S) | |
| model_path="/storage/yiliu7/meta-llama/Llama-3.1-8B-Instruct" | |
| output_log_file_name="${taskname}_${timestamp}" | |
| MAX_MODEL_LEN=40960 | |
| max_length=${MAX_MODEL_LEN} | |
| taskname=gsm8k |
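These variables look like the inputs to an evaluation run; the sketch below shows how they might drive lm-evaluation-harness with its vLLM backend (using lm_eval here, and the max_model_len argument, are assumptions; the original script's actual invocation is not shown):

# Assumed usage only: evaluate the model on gsm8k through lm-evaluation-harness.
import lm_eval

model_path = "/storage/yiliu7/meta-llama/Llama-3.1-8B-Instruct"
max_model_len = 40960

results = lm_eval.simple_evaluate(
    model="vllm",
    model_args=f"pretrained={model_path},max_model_len={max_model_len}",
    tasks=["gsm8k"],
)
print(results["results"])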
import os
from functools import wraps

# from vllm import envs
from loguru import logger


def with_thread_limits():
    """
    Decorator to temporarily set OMP_NUM_THREADS and PyTorch threads,
    and restore them after the function call.
    """
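The decorator body is cut off above; below is a minimal completion sketch, assuming the limit is passed in as a num_threads argument (that parameter, its default of 1, and the logger.debug call are assumptions, not part of the original):

import os
from functools import wraps

import torch
from loguru import logger


def with_thread_limits(num_threads: int = 1):
    """Temporarily cap OMP_NUM_THREADS and PyTorch threads, restoring them afterwards."""
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            prev_omp = os.environ.get("OMP_NUM_THREADS")
            prev_torch = torch.get_num_threads()
            os.environ["OMP_NUM_THREADS"] = str(num_threads)
            torch.set_num_threads(num_threads)
            logger.debug(f"thread limit temporarily set to {num_threads}")
            try:
                return func(*args, **kwargs)
            finally:
                # Restore the previous settings; drop the env var if it was unset before.
                if prev_omp is None:
                    os.environ.pop("OMP_NUM_THREADS", None)
                else:
                    os.environ["OMP_NUM_THREADS"] = prev_omp
                torch.set_num_threads(prev_torch)
        return wrapper
    return decorator

# Usage: @with_thread_limits(num_threads=8) above any CPU-heavy function.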
## install uv on OS
curl -LsSf https://astral.sh/uv/install.sh | sh
## create new project
uv init myproj
## install packages
uv add django requests "pandas>=2.3"
## remove package
uv remove requests
diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
index 69c03d8efb8..f3668018c43 100755
--- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
+++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
@@ -930,6 +930,22 @@ class PatchedVllmMixtureOfExpertsOp(PatchedModuleBase):
 router_weights,
 permuted_weights=True,
 activation="silu"):
+ enable_moe_chunk = hasattr(self.orig_mod, "enable_moe_chunk") and self.orig_mod.enable_moe_chunk
+ if not enable_moe_chunk:
#!/bin/bash
#
# https://docs.docker.com/build/buildkit/
# https://github.com/docker/buildx/releases/
# https://github.com/docker/buildx
## docker builder prune --all
## docker buildx du --verbose
## For Ubuntu 24.04 try: sudo apt install docker-buildx
diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
index 69c03d8efb8..f3668018c43 100755
--- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
+++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
@@ -930,6 +930,22 @@ class PatchedVllmMixtureOfExpertsOp(PatchedModuleBase):
 router_weights,
 permuted_weights=True,
 activation="silu"):
+ enable_moe_chunk = hasattr(self.orig_mod, "enable_moe_chunk") and self.orig_mod.enable_moe_chunk
+ if enable_moe_chunk:
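For context, both diff variants gate expert execution on an enable_moe_chunk flag read from the original module; the sketch below shows the generic token-chunking pattern such a flag typically enables (chunked_moe_forward, experts_forward, and chunk_size are hypothetical names for illustration, not the neural_compressor implementation):

# Illustrative only: run the expert computation in fixed-size token chunks to bound
# peak memory. `experts_forward` is a stand-in for whatever op produces expert outputs.
import torch


def chunked_moe_forward(hidden_states: torch.Tensor, experts_forward, chunk_size: int = 4096) -> torch.Tensor:
    outputs = [experts_forward(chunk) for chunk in torch.split(hidden_states, chunk_size, dim=0)]
    return torch.cat(outputs, dim=0)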
#!/bin/bash
# Check if a model name is passed as an argument, otherwise use the default model path
if [ -z "$1" ]; then
    model_path="Meta-Llama-3-8B-Instruct-W4A16-G128-AutoRound"
else
    model_path="$1"
fi
tp_size=1
model_name=$(basename ${model_path})
import ctypes
import torch
import time


def nvrtc_compile(source: str) -> str:
    from ctypes import CDLL, c_void_p, c_char_p, c_size_t, byref, create_string_buffer
    libnvrtc = CDLL('libnvrtc.so')

    def get_error_string(result) -> str:
        # nvrtcGetErrorString takes the status code and returns a const char* description.
        libnvrtc.nvrtcGetErrorString.restype = c_char_p
        return libnvrtc.nvrtcGetErrorString(result).decode()
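The nvrtc_compile body is truncated here; below is a self-contained sketch of how the compile path typically continues with the NVRTC C API (nvrtcCreateProgram, nvrtcCompileProgram, nvrtcGetPTX). The helper name nvrtc_compile_sketch and the error handling are assumptions about the missing code, not the original:

from ctypes import CDLL, c_void_p, c_char_p, c_size_t, byref, create_string_buffer


def nvrtc_compile_sketch(source: str) -> str:
    """Compile CUDA C source to PTX via NVRTC (illustrative continuation of the snippet above)."""
    libnvrtc = CDLL('libnvrtc.so')
    libnvrtc.nvrtcGetErrorString.restype = c_char_p

    def check(status: int) -> None:
        if status != 0:
            raise RuntimeError(f"NVRTC error: {libnvrtc.nvrtcGetErrorString(status).decode()}")

    prog = c_void_p()
    # nvrtcCreateProgram(prog, src, name, numHeaders, headers, includeNames)
    check(libnvrtc.nvrtcCreateProgram(byref(prog), source.encode(), b"kernel.cu", 0, None, None))
    check(libnvrtc.nvrtcCompileProgram(prog, 0, None))
    ptx_size = c_size_t()
    check(libnvrtc.nvrtcGetPTXSize(prog, byref(ptx_size)))
    ptx = create_string_buffer(ptx_size.value)
    check(libnvrtc.nvrtcGetPTX(prog, ptx))
    check(libnvrtc.nvrtcDestroyProgram(byref(prog)))
    return ptx.value.decode()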
| Run 1: | |
| Auto-configed device: cuda | |
| WARNING:sglang.srt.server_args:Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel. | |
| WARNING:sglang.srt.server_args:TensorRT-LLM MHA only supports page_size of 16, 32 or 64, changing page_size from None to 64. | |
| [2025-09-06 08:26:09] server_args=ServerArgs(model_path='/home/yiliu7/models/openai/gpt-oss-120b', tokenizer_path='/home/yiliu7/models/openai/gpt-oss-120b', tokenizer_mode='auto', tokenizer_worker_num=1, skip_tokenizer_init=False, load_format='auto', model_loader_extra_config='{}', trust_remote_code=False, context_length=None, is_embedding=False, enable_multimodal=None, revision=None, model_impl='auto', host='127.0.0.1', port=8400, skip_server_warmup=False, warmups=None, nccl_port=None, dtype='bfloat16', quantization=None, quantization_param_path=None, kv_cache_dtype='auto', mem_fraction_static=0.93, max_running_requests=None, max_queued_requests=9223372036854775807, max_total_tokens=None, chunked_prefill_size=16384, max_p |