nph4rd/processory_mutability.py

## processory_mutability.py
# With `transformers==4.49.0`, calling the processor has a side effect on its input.
# This example shows the case for Qwen2.5-VL.

from transformers import AutoProcessor
from qwen_vl_utils import process_vision_info

# Processor bundled with the Qwen2.5-VL checkpoint used throughout the demo.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")


def _user_turn(prompt):
    """Return a single-turn conversation: the demo image followed by *prompt*."""
    image_part = {
        "type": "image",
        "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
    }
    text_part = {"type": "text", "text": prompt}
    return [{"role": "user", "content": [image_part, text_part]}]


def _render(conversation):
    """Render *conversation* to prompt text (not tokenized, generation prompt added)."""
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )


def _encode(text, images, videos):
    """Run the processor on *text* with the given vision inputs.

    *text* is forwarded as the very same object the caller holds, so any
    in-place change made during the call stays observable by the caller.
    """
    return processor(
        text=text,
        images=images,
        videos=videos,
        padding=True,
        return_tensors="pt",
    )


messages1 = _user_turn("Describe this image.")
messages2 = _user_turn("is there a person here?")
messages = [messages1, messages2]

banner = "=" * 40

# Case 1: a list of prompts. Take a shallow snapshot before the call and
# compare afterwards to see whether the list contents were changed.
print(banner)
print("MULTIPLE MESSAGES")
texts = [_render(conversation) for conversation in messages]
image_inputs, video_inputs = process_vision_info(messages)
texts_copy = list(texts)
inputs = _encode(texts, image_inputs, video_inputs)
print(f"modified `texts`? {texts != texts_copy}")

# Case 2: a single prompt string. `str` is immutable, so a plain alias is
# enough as the "before" value for the comparison.
print(banner)
print("SINGLE MESSAGE")
text = _render(messages1)
image_inputs, video_inputs = process_vision_info(messages1)
text_copy = text
inputs = _encode(text, image_inputs, video_inputs)
print(f"modified `text`? {text != text_copy}")
	# With `transformers==4.49.0`, calling the processor has a side effect on its input.
	# This example shows the case for Qwen2.5-VL.

	from transformers import AutoProcessor
	from qwen_vl_utils import process_vision_info

	# NOTE(review): everything from here on repeats the script above statement
	# for statement, with one leading tab per line and the nesting indentation
	# flattened -- it looks like a duplicated paste; confirm whether it is needed.
	# Load the processor for the Qwen2.5-VL checkpoint.
	processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

	# First conversation: one user turn with the demo image and a prompt.
	messages1 = [
	{
	"role": "user",
	"content": [
	{
	"type": "image",
	"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
	},
	{"type": "text", "text": "Describe this image."},
	],
	}
	]
	# Second conversation: same image, different prompt.
	messages2 = [
	{
	"role": "user",
	"content": [
	{
	"type": "image",
	"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
	},
	{"type": "text", "text": "is there a person here?"},
	],
	}
	]

	# Batch of both conversations for the multi-message case.
	messages = [messages1, messages2]

	# Case 1: pass a list of prompt strings and check whether the list changed.
	print("="*40)
	print("MULTIPLE MESSAGES")
	texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
	image_inputs, video_inputs = process_vision_info(messages)
	# Shallow snapshot of the list taken before the processor call.
	texts_copy = texts.copy()
	inputs = processor(
	text=texts,
	images=image_inputs,
	videos=video_inputs,
	padding=True,
	return_tensors="pt",
	)
	# Reports True when the list contents differ from the snapshot.
	print(f"modified `texts`? {not texts == texts_copy}")

	# Case 2: pass a single prompt string instead of a list.
	print("="*40)
	print("SINGLE MESSAGE")
	text = processor.apply_chat_template(messages1, tokenize=False, add_generation_prompt=True)
	image_inputs, video_inputs = process_vision_info(messages1)
	# Plain alias rather than a copy; `str` is immutable, so this suffices.
	text_copy = text
	inputs = processor(
	text=text,
	images=image_inputs,
	videos=video_inputs,
	padding=True,
	return_tensors="pt",
	)
	print(f"modified `text`? {not text == text_copy}")
No results found