Created
February 23, 2026 17:01
-
-
Save bartowski1182/b7e05f6c96735ec5d03f234d37e11e4d to your computer and use it in GitHub Desktop.
AFMoE FP8 quantization script for LLM Compressor
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| FP8 Dynamic Quantization for Trinity AFMOE | |
| Usage: | |
| python compress_trinity_fp8.py \ | |
| --model_path /path/to/trinity \ | |
| --output_path ./trinity-FP8 | |
| """ | |
| import argparse | |
| import torch | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from llmcompressor import oneshot | |
| from llmcompressor.modifiers.quantization import QuantizationModifier | |
def parse_args():
    """Parse and return the command-line options for this script.

    Returns:
        argparse.Namespace with ``model_path`` and ``output_path``.
    """
    parser = argparse.ArgumentParser(
        description="FP8 Dynamic Quantization for Trinity AFMOE",
    )
    parser.add_argument(
        "--model_path",
        type=str,
        required=True,
        help="Path to HF-format Trinity checkpoint",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="Output path for quantized model",
    )
    return parser.parse_args()
def main():
    """Quantize a Trinity AFMOE checkpoint to FP8-dynamic and save it.

    Three stages: load the bf16 model + tokenizer, run the llm-compressor
    oneshot pass with an FP8_DYNAMIC recipe, then write the compressed
    checkpoint and tokenizer to ``--output_path``.
    """
    args = parse_args()

    # Banner / run summary.
    separator = "=" * 60
    print(separator)
    print("Trinity FP8 Dynamic Quantization")
    print(separator)
    print(f"Model: {args.model_path}")
    print(f"Output: {args.output_path}")
    print(f"Scheme: FP8_DYNAMIC (FP8 weights + FP8 activations)")
    print(separator)

    # Stage 1: load the full-precision model and its tokenizer.
    print("\n[1/3] Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        args.model_path,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
        low_cpu_mem_usage=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)

    # Stage 2: build the recipe. Layers matched below are kept unquantized —
    # presumably because router gates, norms, and attention are the most
    # precision-sensitive parts of this MoE (TODO confirm against eval runs).
    ignore_patterns = [
        "re:.*mlp\\.router\\.gate.*",  # Router gates
        "lm_head",  # Output projection
        "re:.*norm.*",  # Normalization layers
        "re:.*self_attn.*",  # Attention layers
    ]
    recipe = QuantizationModifier(
        targets="Linear",
        scheme="FP8_DYNAMIC",
        ignore=ignore_patterns,
    )

    print("\n[2/3] Applying FP8 quantization...")
    oneshot(model=model, recipe=recipe)

    # Stage 3: persist compressed weights plus tokenizer files.
    print("\n[3/3] Saving quantized model...")
    model.save_pretrained(args.output_path, save_compressed=True)
    tokenizer.save_pretrained(args.output_path)

    # Closing banner with a vLLM usage hint.
    print("\n" + separator)
    print("Quantization complete!")
    print(separator)
    print(f"\nQuantized model saved to: {args.output_path}")
    print("\nTo use with vLLM:")
    print(f' from vllm import LLM')
    print(f' model = LLM("{args.output_path}")')
# Entry point: run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment