| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "id": "231c110c-41f3-46a5-a74e-7dae8d218fc2", | |
| "metadata": {}, | |
| "source": [ | |
| "#### Introduction to Multi-Modal Inference with small VLM's on Intel GPUs\n", | |
| "\n", | |
| "Welcome to our workshop on leveraging Intel's data center GPUs for multi-modal inference. We begin by spending 2 minutes to understand how a VLM works and then let's set up our environment to utilize Intel's extensions for PyTorch, ensuring that we can access the GPU using PyTorch.\n", | |
| "___" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "40333ba4-dd56-45d3-a95c-a02a2ed267bc", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "\n", | |
| "<div style='text-align: center;'>\n", | |
| " <img src='vlm.png' />\n", | |
| "</div>\n" | |
| ], | |
| "text/plain": [ | |
| "<IPython.core.display.HTML object>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "from IPython.display import HTML\n", | |
| "\n", | |
| "html_content = \"\"\"\n", | |
| "<div style='text-align: center;'>\n", | |
| " <img src='vlm.png' />\n", | |
| "</div>\n", | |
| "\"\"\"\n", | |
| "display(HTML(html_content))" | |
| ] | |
| }, | |
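| { | |
| "cell_type": "markdown", | |
| "id": "vlm-flow-note-md", | |
| "metadata": {}, | |
| "source": [ | |
| "The diagram above captures the core idea: a small VLM chains a vision encoder, a projection layer, and a language model. The image is split into patches and encoded, the resulting features are projected into the language model's embedding space, and those image embeddings are placed in front of the prompt tokens before autoregressive decoding. The cell below is a toy sketch of this flow using stand-in `nn.Linear` layers with arbitrary dimensions; it is not Moondream's actual implementation." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "vlm-flow-sketch-code", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Toy sketch of the VLM inference flow (stand-in modules, arbitrary dimensions).\n", | |
| "import torch\n", | |
| "import torch.nn as nn\n", | |
| "\n", | |
| "vision_encoder = nn.Linear(64, 32)      # image patches -> vision features\n", | |
| "projector = nn.Linear(32, 16)           # vision features -> LLM embedding space\n", | |
| "text_embedding = nn.Embedding(100, 16)  # token ids -> LLM embeddings\n", | |
| "\n", | |
| "image_patches = torch.randn(1, 9, 64)          # one image, nine flattened patches\n", | |
| "prompt_tokens = torch.randint(0, 100, (1, 5))  # five prompt token ids\n", | |
| "\n", | |
| "image_embeds = projector(vision_encoder(image_patches))  # shape (1, 9, 16)\n", | |
| "prompt_embeds = text_embedding(prompt_tokens)            # shape (1, 5, 16)\n", | |
| "\n", | |
| "# The language model then decodes autoregressively over the concatenation.\n", | |
| "llm_input = torch.cat([image_embeds, prompt_embeds], dim=1)\n", | |
| "print(llm_input.shape)  # torch.Size([1, 14, 16])" | |
| ] | |
| }, | |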
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "3c491fd3-36df-42d8-a8bb-ce85894058e4", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import torch\n", | |
| "import intel_extension_for_pytorch as ipex" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "3ecad3d6-d5b1-4722-aca2-3acd8e8e4bee", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "torch.xpu.is_available()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "59f4eeda-71c2-4c18-b4aa-a1305b777955", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "('torch version: 2.1.0.post0+cxx11.abi', 'ipex version: 2.1.20+xpu')" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "\"torch version: \"+ torch.__version__, \"ipex version: \"+ ipex.__version__" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "57da83b6-2cd0-48fe-a76f-62af7e923f22", | |
| "metadata": {}, | |
| "source": [ | |
| "Let's import accelerate and transformers and check their versions. Ensure your transformers version is compatible with your model. If necessary, follow the example below to upgrade the libraries." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "1e1b596c-e3b6-4ad4-9a10-b76ed12ae425", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import transformers\n", | |
| "import accelerate" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "4fe0c161-1c07-4b89-b6a7-d2784eef64fc", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "('transformers: 4.40.2', 'acclerate version: 0.33.0')" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "\"transformers: \"+ transformers.__version__, \"acclerate version: \"+accelerate.__version__" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "827c85b1-ff0c-4490-8360-ea34451a44b1", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import sys\n", | |
| "\n", | |
| "# upgrading if its required, after package is installed, need to restart the kernel\n", | |
| "#!{sys.executable} -m pip install -U \"transformers==4.40.2\" trl accelerate\n", | |
| "#!{sys.executable} -m pip install -U einops" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "ff248e5f-1566-4273-bd58-367501ea9853", | |
| "metadata": {}, | |
| "source": [ | |
| "Le't select the `xpu` device if its available." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "99b1846d-a0d5-49a1-96f1-c030adcb6084", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "('device used: ', 'xpu')" | |
| ] | |
| }, | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "device = \"xpu\" if torch.xpu.is_available() else \"cpu\"\n", | |
| "\"device used: \", device" | |
| ] | |
| }, | |
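| { | |
| "cell_type": "markdown", | |
| "id": "xpu-device-info-md", | |
| "metadata": {}, | |
| "source": [ | |
| "Optionally, we can inspect the selected device. IPEX exposes a CUDA-style device API under `torch.xpu`; the exact attributes available can vary between IPEX releases, so treat the cell below as a quick, optional check." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "xpu-device-info-code", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Optional: print basic information about the available XPU device(s).\n", | |
| "if torch.xpu.is_available():\n", | |
| "    print(\"device count:\", torch.xpu.device_count())\n", | |
| "    print(\"device name :\", torch.xpu.get_device_name(0))\n", | |
| "else:\n", | |
| "    print(\"No XPU found, falling back to CPU.\")" | |
| ] | |
| }, | |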
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "912e57be-9792-490e-af22-f6bf45aab8a0", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from transformers import AutoModelForCausalLM, AutoTokenizer\n", | |
| "from transformers import TextIteratorStreamer # optional" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "id": "bb8ac1e3-55e2-4f56-9235-4e9d4b8c4b28", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/home/uda71657c448970b6dcb6e71e8683151/.local/lib/python3.9/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", | |
| " warnings.warn(\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "model_id = \"vikhyatk/moondream2\"\n", | |
| "revision = \"2024-05-08\"\n", | |
| "model = AutoModelForCausalLM.from_pretrained(model_id, revision=revision, trust_remote_code=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "id": "45a64f33-f72f-4f67-b75d-30a7c6bd23f2", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/opt/intel/oneapi/intelpython/envs/pytorch-gpu/lib/python3.9/site-packages/intel_extension_for_pytorch/frontend.py:465: UserWarning: Conv BatchNorm folding failed during the optimize process.\n", | |
| " warnings.warn(\n", | |
| "/opt/intel/oneapi/intelpython/envs/pytorch-gpu/lib/python3.9/site-packages/intel_extension_for_pytorch/frontend.py:472: UserWarning: Linear BatchNorm folding failed during the optimize process.\n", | |
| " warnings.warn(\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "[2024-07-30 17:17:38,569] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to xpu (auto detect)\n", | |
| "OptimizedModule(\n", | |
| " (_orig_mod): Moondream(\n", | |
| " (vision_encoder): VisionEncoder(\n", | |
| " (encoder): EncoderWrapper(\n", | |
| " (model): ModuleDict(\n", | |
| " (visual): VisionTransformer(\n", | |
| " (patch_embed): LinearPatchEmbedding(\n", | |
| " (linear): Linear(in_features=588, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (blocks): Sequential(\n", | |
| " (0): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (1): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (2): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (3): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (4): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (5): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (6): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (7): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (8): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (9): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (10): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (11): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (12): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (13): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (14): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (15): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (16): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (17): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (18): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (19): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (20): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (21): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (22): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (23): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (24): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (25): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " (26): VitBlock(\n", | |
| " (attn): Attention(\n", | |
| " (qkv): Linear(in_features=1152, out_features=3456, bias=True)\n", | |
| " (proj): Linear(in_features=1152, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", | |
| " )\n", | |
| " (norm1): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " (norm2): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " )\n", | |
| " (norm): LayerNorm((1152,), eps=1e-05, elementwise_affine=True)\n", | |
| " )\n", | |
| " )\n", | |
| " )\n", | |
| " (projection): VisionProjection(\n", | |
| " (mlp): MLP(\n", | |
| " (fc1): Linear(in_features=1152, out_features=8192, bias=True)\n", | |
| " (act): GELU(approximate='tanh')\n", | |
| " (fc2): Linear(in_features=8192, out_features=2048, bias=True)\n", | |
| " )\n", | |
| " )\n", | |
| " (preprocess): Compose(\n", | |
| " Resize(size=[378, 378], interpolation=InterpolationMode.BICUBIC, antialias=warn)\n", | |
| " ToImage()\n", | |
| " ToDtype(scale=True)\n", | |
| " Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=False)\n", | |
| " )\n", | |
| " )\n", | |
| " (text_model): PhiForCausalLM(\n", | |
| " (transformer): PhiModel(\n", | |
| " (embd): Embedding(\n", | |
| " (wte): Embedding(51200, 2048)\n", | |
| " )\n", | |
| " (embed_dropout): Identity()\n", | |
| " (h): ModuleList(\n", | |
| " (0-23): 24 x PhiDecoderLayer(\n", | |
| " (mixer): PhiAttention(\n", | |
| " (Wqkv): Linear(in_features=2048, out_features=6144, bias=True)\n", | |
| " (out_proj): Linear(in_features=2048, out_features=2048, bias=True)\n", | |
| " (rotary_emb): PhiRotaryEmbedding()\n", | |
| " )\n", | |
| " (mlp): PhiMLP(\n", | |
| " (activation_fn): NewGELUActivation()\n", | |
| " (fc1): Linear(in_features=2048, out_features=8192, bias=True)\n", | |
| " (fc2): Linear(in_features=8192, out_features=2048, bias=True)\n", | |
| " )\n", | |
| " (ln): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n", | |
| " (resid_dropout): Identity()\n", | |
| " )\n", | |
| " )\n", | |
| " )\n", | |
| " (lm_head): CausalLMHead(\n", | |
| " (ln): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)\n", | |
| " (linear): Linear(in_features=2048, out_features=51200, bias=True)\n", | |
| " )\n", | |
| " )\n", | |
| " )\n", | |
| ")\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "model = model.eval()\n", | |
| "model = model.to(device=device, dtype=torch.bfloat16)\n", | |
| "\n", | |
| "model = ipex.optimize(model=model, dtype=torch.bfloat16) # optional\n", | |
| "model = torch.compile(model)\n", | |
| "print(model)" | |
| ] | |
| }, | |
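| { | |
| "cell_type": "markdown", | |
| "id": "model-sanity-check-md", | |
| "metadata": {}, | |
| "source": [ | |
| "As a quick sanity check, we can confirm that the weights really ended up on the XPU in bfloat16. `torch.compile` wraps the model in an `OptimizedModule`, but attribute access such as `parameters()` is forwarded to the underlying module. Keep in mind that the first forward pass after `torch.compile` is slower, since it includes graph compilation." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "model-sanity-check-code", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Sanity check: the parameters should be bfloat16 tensors on the xpu device.\n", | |
| "p = next(model.parameters())\n", | |
| "print(p.device, p.dtype)" | |
| ] | |
| }, | |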
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "id": "9edfa02b-111b-4f7c-9b7c-fd00d6e24b50", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/home/uda71657c448970b6dcb6e71e8683151/.local/lib/python3.9/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", | |
| " warnings.warn(\n", | |
| "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "id": "fb5c7ba4-d5c8-44a4-8fbf-3c822d13cc66", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import torchvision.transforms as transforms\n", | |
| "from PIL import Image\n", | |
| "import requests\n", | |
| "from io import BytesIO\n", | |
| "\n", | |
| "# Cache for storing image embeddings\n", | |
| "image_cache = {}\n", | |
| "\n", | |
| "\n", | |
| "def preprocess_image(image_path_or_url):\n", | |
| " if image_path_or_url.startswith(\"http\"):\n", | |
| " response = requests.get(image_path_or_url)\n", | |
| " image = Image.open(BytesIO(response.content))\n", | |
| " else:\n", | |
| " image = Image.open(image_path_or_url)\n", | |
| " return image\n", | |
| "\n", | |
| "\n", | |
| "def encode_image(image_path_or_url):\n", | |
| " if image_path_or_url in image_cache:\n", | |
| " return image_cache[image_path_or_url]\n", | |
| " else:\n", | |
| " image = preprocess_image(image_path_or_url)\n", | |
| " image_embeds = model.encode_image(image)\n", | |
| " image_cache[image_path_or_url] = image_embeds\n", | |
| " return image_embeds\n", | |
| "\n", | |
| "\n", | |
| "def generate(image_embeds, prompt, max_new_tokens=128):\n", | |
| " generated_text = model.generate(\n", | |
| " image_embeds,\n", | |
| " prompt,\n", | |
| " tokenizer,\n", | |
| " max_new_tokens=max_new_tokens,\n", | |
| " )[0]\n", | |
| " return generated_text\n", | |
| "\n", | |
| "\n", | |
| "def answer_question(image_embeds, question):\n", | |
| " streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)\n", | |
| " for token in model.answer_question(\n", | |
| " image_embeds, question, tokenizer, streamer=streamer\n", | |
| " ):\n", | |
| " yield token" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "id": "95b54479-76e6-424b-9e9c-f62c56ccce34", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Tables\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<img src=\"https://images.unsplash.com/photo-1721820859051-6154b2852863\"/>" | |
| ], | |
| "text/plain": [ | |
| "<IPython.core.display.Image object>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Book_shelf\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<img src=\"https://images.unsplash.com/photo-1722182877533-7378b60bf1e8\"/>" | |
| ], | |
| "text/plain": [ | |
| "<IPython.core.display.Image object>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "from IPython.display import Image as IPyImage, display\n", | |
| "\n", | |
| "# Sample images\n", | |
| "image_urls = {#\"dice\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/280px-PNG_transparency_demonstration_1.png\",\n", | |
| " \"tables\": \"https://images.unsplash.com/photo-1721820859051-6154b2852863\",\n", | |
| " \"book_shelf\": \"https://images.unsplash.com/photo-1722182877533-7378b60bf1e8\"\n", | |
| " }\n", | |
| "for description, url in image_urls.items():\n", | |
| " print(description.capitalize())\n", | |
| " display(IPyImage(url=url))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "id": "d9184c37-0a5b-4c8f-8cb5-9177b45b9ce2", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Testing image encoding...\n", | |
| "tables encoded successfully in 0.00000811 sec, shape is: torch.Size([1, 729, 2048]).\n", | |
| "book_shelf encoded successfully in 0.00000262 sec, shape is: torch.Size([1, 729, 2048]).\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Test image encoding\n", | |
| "import time\n", | |
| "\n", | |
| "image_embeds = []\n", | |
| "print(\"Testing image encoding...\")\n", | |
| "for description, url in image_urls.items():\n", | |
| " t1 = time.time()\n", | |
| " image_embed = encode_image(url) # IMAGE PROJECTIONS\n", | |
| " image_embeds.append(image_embed)\n", | |
| " print(f\"{description} encoded successfully in {time.time() - t1: .8f} sec, shape is: {image_embed.shape}.\")" | |
| ] | |
| }, | |
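| { | |
| "cell_type": "markdown", | |
| "id": "encode-timing-note-md", | |
| "metadata": {}, | |
| "source": [ | |
| "The timings above are misleadingly small: `encode_image` returns a cached embedding when a URL has already been encoded, and, like CUDA kernels, XPU work is queued asynchronously, so the Python call may return before the device has finished. The cell below is a suggested measurement pattern, not part of the original workshop code: it clears the cache to force a real encode (including the image download) and synchronizes the device before stopping the clock." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "encode-timing-sketch-code", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Hedged sketch: time an uncached encode, synchronizing the XPU before reading the clock.\n", | |
| "import time\n", | |
| "\n", | |
| "image_cache.clear()  # force a real encode instead of a cache hit\n", | |
| "for description, url in image_urls.items():\n", | |
| "    t1 = time.time()\n", | |
| "    embeds = encode_image(url)  # includes the HTTP download and preprocessing\n", | |
| "    if torch.xpu.is_available():\n", | |
| "        torch.xpu.synchronize()  # wait for queued XPU work to finish\n", | |
| "    print(f\"{description}: {time.time() - t1:.3f} sec, shape {tuple(embeds.shape)}\")" | |
| ] | |
| }, | |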
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "id": "c71d5671-e912-4ed2-9819-7b972d6e5868", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "[tensor([[[ 3.4375, -3.1562, 0.5938, ..., 3.2188, -1.7734, -2.3125],\n", | |
| " [ 2.0781, 0.2695, 2.6719, ..., 2.5781, 1.8281, -2.1562],\n", | |
| " [-3.6094, 0.9922, 1.9297, ..., -2.3438, 3.0156, -0.8008],\n", | |
| " ...,\n", | |
| " [ 9.6250, 3.9219, 2.0469, ..., -2.8906, -0.6133, -2.2656],\n", | |
| " [ 1.1953, 1.5547, 2.7188, ..., 1.6016, -1.8359, -1.0625],\n", | |
| " [ 2.9219, -0.5859, -0.0879, ..., 0.9805, 0.6250, -1.3359]]],\n", | |
| " device='xpu:0', dtype=torch.bfloat16), tensor([[[ 3.2500, 0.7539, 0.9062, ..., 1.7969, -3.3281, -2.5469],\n", | |
| " [-3.2812, 1.1875, 1.4219, ..., -1.6641, 2.8438, -0.0996],\n", | |
| " [-2.8125, 1.1016, 1.6719, ..., -1.7578, 3.0000, -0.7500],\n", | |
| " ...,\n", | |
| " [ 0.0938, 0.8867, 0.1177, ..., 1.5859, -1.2500, -4.6250],\n", | |
| " [-0.4785, -1.6797, 0.1367, ..., -0.7734, 1.5234, -0.0273],\n", | |
| " [ 0.8398, -0.7422, 1.1797, ..., -1.1484, 0.4492, -0.4336]]],\n", | |
| " device='xpu:0', dtype=torch.bfloat16)]\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(image_embeds)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "id": "24cc8f8f-0fc0-4f5a-89cd-1958e9b5b044", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "\n", | |
| "Testing text generation...\n", | |
| "\n", | |
| "The image depicts a cozy dining room with a wooden table surrounded by six matching chairs. The table is set with a vase of flowers, adding a touch of elegance to the scene. The room is illuminated by a chandelier hanging from the ceiling, casting a warm glow over the space. The walls are painted white, creating a bright and airy atmosphere. On the right side of the room, there is a window with white curtains, allowing natural light to filter into the room. A hat is hanging on the wall, adding a personal touch to the space.\n", | |
| "\n", | |
| " ------------------------------------------------------------------------\n", | |
| "The image depicts a serene and dimly lit library scene. The focal point is a large wooden bookshelf filled with numerous books of various sizes and colors. The bookshelf is situated against a wall, with a desk in front of it. On the desk, there is a small book and a picture frame. The room is further illuminated by a single light source, casting a warm glow on the bookshelf and desk. In the background, there is a chair and a stool, both positioned against the wall.\n", | |
| "\n", | |
| " ------------------------------------------------------------------------\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(\"\\nTesting text generation...\\n\")\n", | |
| "#prompt = \"The image is a depiction of \"\n", | |
| "question = \"What is this image, give me the details of each object?\"\n", | |
| "\n", | |
| "for ie in image_embeds:\n", | |
| " for token in answer_question(ie, question):\n", | |
| " print(token, end='')\n", | |
| " print(\"\\n\\n\",\"-\"*72)" | |
| ] | |
| }, | |
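| { | |
| "cell_type": "markdown", | |
| "id": "generate-helper-usage-md", | |
| "metadata": {}, | |
| "source": [ | |
| "The `generate` helper defined earlier takes a free-form prompt instead of a question (see the prompt commented out above). As a final, illustrative example, here is one way to use it for a caption-style completion on the first encoded image; the prompt text and token budget are placeholders you can adjust." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "generate-helper-usage-code", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Illustrative use of the generate() helper defined above.\n", | |
| "caption = generate(image_embeds[0], \"The image is a depiction of\", max_new_tokens=64)\n", | |
| "print(caption)" | |
| ] | |
| }, | |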
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "7f3db31a-6d91-4a93-983f-6c84c0f49de6", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "PyTorch GPU", | |
| "language": "python", | |
| "name": "pytorch-gpu" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.9.18" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |