Qwen3EmbeddingONNXExporter
# transformers==4.52.4
# pytorch nightly
from __future__ import annotations

import ast
import logging
import typing
from pathlib import Path

import onnx_ir as ir
import onnxscript.rewriter.ort_fusions
import torch
import torch.onnx.testing
from onnx_ir.passes import PassResult
from onnx_ir.passes.common import ClearMetadataAndDocStringPass
from transformers import AutoModel, AutoTokenizer

logger = logging.getLogger(__name__)


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
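

# Illustrative shape check for repeat_kv (a sketch, not part of the export flow):
#     kv = torch.zeros(2, 4, 16, 64)    # (batch, num_key_value_heads, seqlen, head_dim)
#     repeat_kv(kv, 2).shape            # -> torch.Size([2, 8, 16, 64]): GQA heads expanded to MHA layout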


def sdpa_attention_forward(
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    dropout: float = 0.0,
    scaling: float | None = None,
    is_causal: bool | None = None,
    **kwargs,
) -> tuple[torch.Tensor, None]:
    if hasattr(module, "num_key_value_groups"):
        key = repeat_kv(key, module.num_key_value_groups)
        value = repeat_kv(value, module.num_key_value_groups)

    causal_mask = attention_mask
    query = query.contiguous()
    key = key.contiguous()
    value = value.contiguous()

    attn_output = torch.nn.functional.scaled_dot_product_attention(
        query,
        key,
        value,
        attn_mask=causal_mask,
        dropout_p=dropout,
        scale=scaling,
    )
    attn_output = attn_output.transpose(1, 2).contiguous()
    return attn_output, None


# Patch SDPA attention for transformers
import transformers.integrations.sdpa_attention

transformers.integrations.sdpa_attention.sdpa_attention_forward = sdpa_attention_forward
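# The patch above swaps in a simplified SDPA forward (explicit repeat_kv, no
# is_causal/enable_gqa branching), which presumably keeps the traced graph free of
# data-dependent control flow so the later ORT attention fusions can match it.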


def _get_scoped_prefix(name_scopes: list[str]) -> str:
    # Remove common prefixes between consecutive scopes
    processed_scopes = []
    for i, scope in enumerate(name_scopes):
        if i == 0:
            processed_scopes.append(scope)
        else:
            prev_scope = name_scopes[i - 1]
            processed_scopes.append(scope.removeprefix(prev_scope).lstrip("."))
    return "/".join(processed_scopes)


class AssignNamesPass(ir.passes.InPlacePass):
    def call(self, model: ir.Model) -> PassResult:
        modified = False
        for node in model.graph.all_nodes():
            if "pkg.torch.onnx.name_scopes" in node.metadata_props:
                name_scopes = typing.cast(
                    "list[str]",
                    ast.literal_eval(node.metadata_props["pkg.torch.onnx.name_scopes"]),
                )
                name_scopes.pop()  # Remove self name
                prefix = _get_scoped_prefix(name_scopes)
                # Rename node
                if prefix:
                    node.name = f"{prefix}/{node.name}"
                    modified = True
                # Rename outputs
                for output in node.outputs:
                    if (
                        not output.is_graph_output()
                        and output.name is not None
                        and output.name != ""
                    ):
                        if prefix:
                            scoped_name = f"{prefix}/{output.name}"
                            logger.debug("Renaming %r to %r", output.name, scoped_name)
                            output.name = scoped_name
                            modified = True
        return PassResult(model, modified)
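

# Standalone usage sketch for AssignNamesPass (hypothetical paths; assumes the model
# still carries the exporter's "pkg.torch.onnx.name_scopes" metadata):
#     model = ir.load("exported_model.onnx")
#     result = AssignNamesPass()(model)
#     if result.modified:
#         ir.save(result.model, "exported_model_named.onnx")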


class Qwen3EmbeddingONNXExporter:
    def __init__(self, model_id="Qwen/Qwen3-Embedding-0.6B"):
        self.model_id = model_id
        self.model = None
        self.tokenizer = None

    def load_model(self):
        """Load the Qwen3 model and tokenizer"""
        print(f"Loading {self.model_id}...")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
        self.model = AutoModel.from_pretrained(self.model_id, trust_remote_code=True)
        self.model.eval()
        print("Model loaded successfully!")

    def create_dummy_inputs(self, batch_size=2, seq_length=128):
        """Create dummy inputs for ONNX export"""
        dummy_text = ["This is a sample text for ONNX export"] * batch_size
        inputs = self.tokenizer(
            dummy_text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=seq_length,
        )
        return inputs

    def export_to_onnx(self, output_dir="./qwen3-onnx"):
        """Export model to ONNX format"""
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Save tokenizer and config
        print("Saving tokenizer and config...")
        self.tokenizer.save_pretrained(output_dir)

        # Create dummy inputs
        # NOTE(justinchuby): Batch size must be greater than 1 to be captured as dynamic
        dummy_inputs = self.create_dummy_inputs()
        input_ids = dummy_inputs["input_ids"]
        attention_mask = dummy_inputs["attention_mask"]

        # Export to ONNX
        print("Exporting to ONNX...")

        # Wrap the model WITHOUT pooling - TEI will handle pooling
        class ModelWrapper(torch.nn.Module):
            def __init__(self, model):
                super().__init__()
                self.model = model

            def forward(self, input_ids, attention_mask):
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                if hasattr(outputs, "last_hidden_state"):
                    return outputs.last_hidden_state
                else:
                    return outputs[0]

        wrapped_model = ModelWrapper(self.model)
        wrapped_model.eval()

        onnx_program = torch.onnx.export(
            wrapped_model,
            (input_ids, attention_mask),
            input_names=["input_ids", "attention_mask"],
            output_names=["last_hidden_state"],
            dynamic_shapes={
                "input_ids": {0: "batch", 1: "seq"},
                "attention_mask": {0: "batch", 1: "seq"},
            },
            opset_version=21,
        )
        AssignNamesPass()(onnx_program.model)
        onnx_program.save(output_path / "model_pre_fusion.onnx")
        torch.onnx.testing.assert_onnx_program(onnx_program, atol=1e-4, rtol=1e-4)

        # Optimize for ORT
        model, fusion = onnxscript.rewriter.ort_fusions.optimize_for_ort(onnx_program.model)
        print(fusion)

        # For production, remove metadata:
        result = ClearMetadataAndDocStringPass()(model)
        onnx_program.model = result.model
        onnx_program.save(output_path / "model.onnx")
        torch.onnx.testing.assert_onnx_program(onnx_program, atol=1e-3, rtol=1e-3)
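

# Optional sanity check (an illustrative sketch, not wired into main()): run the
# exported model with onnxruntime and compare it against the PyTorch hidden states.
# Assumes onnxruntime is installed; the function name and tolerances are illustrative.
def check_with_onnxruntime(exporter: Qwen3EmbeddingONNXExporter, onnx_path: str) -> None:
    import numpy as np
    import onnxruntime as ort

    inputs = exporter.create_dummy_inputs(batch_size=2, seq_length=32)
    session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
    (ort_hidden,) = session.run(
        ["last_hidden_state"],
        {
            "input_ids": inputs["input_ids"].numpy(),
            "attention_mask": inputs["attention_mask"].numpy(),
        },
    )
    with torch.no_grad():
        torch_hidden = exporter.model(
            input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
        ).last_hidden_state
    # Compare ORT output with eager PyTorch output within a loose tolerance
    np.testing.assert_allclose(ort_hidden, torch_hidden.numpy(), atol=1e-3, rtol=1e-3)
    print("onnxruntime output matches PyTorch within tolerance")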


def main():
    exporter = Qwen3EmbeddingONNXExporter()
    exporter.load_model()
    exporter.export_to_onnx(output_dir="./qwen3-onnx-1028")


if __name__ == "__main__":
    main()
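

# Running this file directly (with the pinned transformers release and a recent PyTorch
# nightly installed) writes model_pre_fusion.onnx, model.onnx, and the tokenizer files
# to ./qwen3-onnx-1028, as configured in main() above.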