@bartowski1182
Created February 23, 2026 17:01
AFMoE llm compressor
#!/usr/bin/env python3
"""
FP8 Dynamic Quantization for Trinity AFMOE

Usage:
    python compress_trinity_fp8.py \
        --model_path /path/to/trinity \
        --output_path ./trinity-FP8
"""
import argparse

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier


def parse_args():
    parser = argparse.ArgumentParser(
        description="FP8 Dynamic Quantization for Trinity AFMOE"
    )
    parser.add_argument("--model_path", type=str, required=True,
                        help="Path to HF-format Trinity checkpoint")
    parser.add_argument("--output_path", type=str, required=True,
                        help="Output path for quantized model")
    return parser.parse_args()


def main():
    args = parse_args()

    print("=" * 60)
    print("Trinity FP8 Dynamic Quantization")
    print("=" * 60)
    print(f"Model: {args.model_path}")
    print(f"Output: {args.output_path}")
    print("Scheme: FP8_DYNAMIC (FP8 weights + FP8 activations)")
    print("=" * 60)

    # Load model
    print("\n[1/3] Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        args.model_path,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
        low_cpu_mem_usage=True,
    )
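    # Note: device_map="auto" shards the bf16 weights across available devices
    # via accelerate; the quantization pass below then runs in place on them.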
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)

    # Build recipe
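    # Router gates, norms, and lm_head are small but numerically sensitive;
    # skipping self_attn as well is a conservative, quality-first choice. In
    # effect only the remaining Linear layers (MoE/MLP projections) get quantized.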
    ignore_patterns = [
        "re:.*mlp\\.router\\.gate.*",  # Router gates
        "lm_head",                     # Output projection
        "re:.*norm.*",                 # Normalization layers
        "re:.*self_attn.*",            # Attention layers
    ]
    recipe = QuantizationModifier(
        targets="Linear",
        scheme="FP8_DYNAMIC",
        ignore=ignore_patterns,
    )
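    # FP8_DYNAMIC computes activation scales dynamically per token at runtime,
    # so the oneshot pass below needs no calibration dataset.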

    # Apply quantization
    print("\n[2/3] Applying FP8 quantization...")
    oneshot(model=model, recipe=recipe)

    # Save model
    print("\n[3/3] Saving quantized model...")
    model.save_pretrained(args.output_path, save_compressed=True)
    tokenizer.save_pretrained(args.output_path)
print("\n" + "=" * 60)
print("Quantization complete!")
print("=" * 60)
print(f"\nQuantized model saved to: {args.output_path}")
print("\nTo use with vLLM:")
print(f' from vllm import LLM')
print(f' model = LLM("{args.output_path}")')
if __name__ == "__main__":
main()
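
As a quick end-to-end check of the quantized checkpoint, something like the following works (a minimal sketch: it assumes vLLM is installed, and the prompt and sampling settings are placeholders to adjust):

# smoke_test.py: load the FP8 checkpoint with vLLM and run one greedy generation.
# "./trinity-FP8" mirrors --output_path from the script above.
from vllm import LLM, SamplingParams

llm = LLM(model="./trinity-FP8", trust_remote_code=True)
params = SamplingParams(temperature=0.0, max_tokens=64)
outputs = llm.generate(["The capital of France is"], params)
print(outputs[0].outputs[0].text)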