houmanka/tokeniser_mistakes.ipynb

## tokeniser_mistakes.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4",
      "authorship_tag": "ABX9TyOxJhTSk+di3Er3a3e1QjkS",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/houmanka/6b845afbad9cf106ae28245c98a2defd/tokeniser_mistakes.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "b5OJJ0mMPtsu"
      },
      "outputs": [],
      "source": [
        "from transformers import AutoTokenizer, AutoModel\n",
        "\n",
        "model_id = \"bert-base-uncased\"\n",
        "tok = AutoTokenizer.from_pretrained(model_id)\n",
        "model = AutoModel.from_pretrained(model_id)\n",
        "\n",
        "vocab = len(tok)\n",
        "emb = model.get_input_embeddings().num_embeddings"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Every id the tokenizer can emit needs an embedding row; the matrix may be larger, never smaller\n",
        "assert vocab <= emb, f\"Tokenizer/model mismatch: {vocab} vs {emb}\""
      ],
      "metadata": {
        "id": "nE-pUH0YQW9D"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from transformers import AutoTokenizer\n",
        "\n",
        "tokenizer = AutoTokenizer.from_pretrained(\"openai-community/gpt2\")\n",
        "\n",
        "# Let's tokenize a batch of sentences of different lengths\n",
        "sentences = [\"Hello!\", \"How are you doing today?\"]\n",
        "\n",
        "# GPT-2 ships without a pad token, so reuse EOS for padding\n",
        "tokenizer.pad_token = tokenizer.eos_token\n",
        "\n",
        "# Padding works because pad_token is set above; without it this call raises a ValueError\n",
        "batch = tokenizer(sentences, padding=True, return_tensors=\"pt\")\n",
        "print(batch)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Ejt1E9xKTft8",
        "outputId": "9ffb36cd-0132-46a4-c6fe-5ffd91439231"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'input_ids': tensor([[15496,     0, 50256, 50256, 50256, 50256],\n",
            "        [ 2437,   389,   345,  1804,  1909,    30]]), 'attention_mask': tensor([[1, 1, 0, 0, 0, 0],\n",
            "        [1, 1, 1, 1, 1, 1]])}\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
        "import torch\n",
        "\n",
        "model_id = \"gpt2\"\n",
        "\n",
        "# Base tokenizer (no extra tokens) and the model whose embeddings get resized below\n",
        "tok_base = AutoTokenizer.from_pretrained(model_id)\n",
        "model_base = AutoModelForCausalLM.from_pretrained(model_id).eval()\n",
        "\n",
        "# Extended tokenizer that knows about <pending> and <ignored>\n",
        "tok_ext = AutoTokenizer.from_pretrained(model_id)\n",
        "added = tok_ext.add_special_tokens({\"additional_special_tokens\": [\"<pending>\", \"<ignored>\"]})\n",
        "\n",
        "if added > 0:\n",
        "    # Grow the embedding matrix so the newly added token ids have rows to look up\n",
        "    model_base.resize_token_embeddings(len(tok_ext))\n",
        "\n",
        "sample = \"<pending> ACH debit 1234\"\n",
        "\n",
        "print(\"Before adding:\", tok_base.convert_ids_to_tokens(tok_base(sample)[\"input_ids\"]))\n",
        "print(\"After adding: \", tok_ext.convert_ids_to_tokens(tok_ext(sample)[\"input_ids\"]))\n",
        "print(\"additional_special_tokens:\", tok_ext.additional_special_tokens)\n",
        "print(\"additional_special_tokens_ids:\", tok_ext.additional_special_tokens_ids)\n",
        "print(\"Length of the token for before and after\", len(tok_base), len(tok_ext))\n",
        "\n",
        "enc = tok_ext(sample, return_tensors=\"pt\")\n",
        "\n",
        "# Fail early with a readable message if any id falls outside the embedding table\n",
        "num_rows = model_base.get_input_embeddings().num_embeddings\n",
        "assert int(enc[\"input_ids\"].max()) < num_rows, f\"Token id out of range for {num_rows} embedding rows\"\n",
        "\n",
        "with torch.no_grad():\n",
        "    _ = model_base(**enc)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "5ZIttQ--ZLI_",
        "outputId": "edf8b781-4325-4758-8724-c9c326d711c2"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Before adding: ['<', 'p', 'ending', '>', 'ĠA', 'CH', 'Ġdebit', 'Ġ12', '34']\n",
            "After adding:  ['<pending>', 'ĠA', 'CH', 'Ġdebit', 'Ġ12', '34']\n",
            "additional_special_tokens: ['<pending>', '<ignored>']\n",
            "additional_special_tokens_ids: [50257, 50258]\n",
            "Length of the token for before and after 50257 50259\n"
          ]
        }
      ]
    }
  ]
}
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"provenance": [],
	"gpuType": "T4",
	"authorship_tag": "ABX9TyOxJhTSk+di3Er3a3e1QjkS",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	},
	"accelerator": "GPU"
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/houmanka/6b845afbad9cf106ae28245c98a2defd/tokeniser_mistakes.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "b5OJJ0mMPtsu"
	},
	"outputs": [],
	"source": [
	"from transformers import AutoTokenizer, AutoModel\n",
	"\n",
	"model_id = \"bert-base-uncased\"\n",
	"tok = AutoTokenizer.from_pretrained(model_id)\n",
	"model = AutoModel.from_pretrained(model_id)\n",
	"\n",
	"vocab = len(tok)\n",
	"emb = model.get_input_embeddings().num_embeddings"
	]
	},
	{
	"cell_type": "code",
	"source": [
	"# Every id the tokenizer can emit needs an embedding row; the matrix may be larger, never smaller\n",
	"assert vocab <= emb, f\"Tokenizer/model mismatch: {vocab} vs {emb}\""
	],
	"metadata": {
	"id": "nE-pUH0YQW9D"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"from transformers import AutoTokenizer\n",
	"\n",
	"tokenizer = AutoTokenizer.from_pretrained(\"openai-community/gpt2\")\n",
	"\n",
	"# Let's tokenize a batch of sentences of different lengths\n",
	"sentences = [\"Hello!\", \"How are you doing today?\"]\n",
	"\n",
	"# GPT-2 ships without a pad token, so reuse EOS for padding\n",
	"tokenizer.pad_token = tokenizer.eos_token\n",
	"\n",
	"# Padding works because pad_token is set above; without it this call raises a ValueError\n",
	"batch = tokenizer(sentences, padding=True, return_tensors=\"pt\")\n",
	"print(batch)"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "Ejt1E9xKTft8",
	"outputId": "9ffb36cd-0132-46a4-c6fe-5ffd91439231"
	},
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"{'input_ids': tensor([[15496, 0, 50256, 50256, 50256, 50256],\n",
	" [ 2437, 389, 345, 1804, 1909, 30]]), 'attention_mask': tensor([[1, 1, 0, 0, 0, 0],\n",
	" [1, 1, 1, 1, 1, 1]])}\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"from transformers import AutoTokenizer, AutoModelForCausalLM\n",
	"import torch\n",
	"\n",
	"model_id = \"gpt2\"\n",
	"\n",
	"# Base tokenizer (no extra tokens) and the model whose embeddings get resized below\n",
	"tok_base = AutoTokenizer.from_pretrained(model_id)\n",
	"model_base = AutoModelForCausalLM.from_pretrained(model_id).eval()\n",
	"\n",
	"# Extended tokenizer that knows about <pending> and <ignored>\n",
	"tok_ext = AutoTokenizer.from_pretrained(model_id)\n",
	"added = tok_ext.add_special_tokens({\"additional_special_tokens\": [\"<pending>\", \"<ignored>\"]})\n",
	"\n",
	"if added > 0:\n",
	"    # Grow the embedding matrix so the newly added token ids have rows to look up\n",
	"    model_base.resize_token_embeddings(len(tok_ext))\n",
	"\n",
	"sample = \"<pending> ACH debit 1234\"\n",
	"\n",
	"print(\"Before adding:\", tok_base.convert_ids_to_tokens(tok_base(sample)[\"input_ids\"]))\n",
	"print(\"After adding: \", tok_ext.convert_ids_to_tokens(tok_ext(sample)[\"input_ids\"]))\n",
	"print(\"additional_special_tokens:\", tok_ext.additional_special_tokens)\n",
	"print(\"additional_special_tokens_ids:\", tok_ext.additional_special_tokens_ids)\n",
	"print(\"Length of the token for before and after\", len(tok_base), len(tok_ext))\n",
	"\n",
	"enc = tok_ext(sample, return_tensors=\"pt\")\n",
	"\n",
	"# Fail early with a readable message if any id falls outside the embedding table\n",
	"num_rows = model_base.get_input_embeddings().num_embeddings\n",
	"assert int(enc[\"input_ids\"].max()) < num_rows, f\"Token id out of range for {num_rows} embedding rows\"\n",
	"\n",
	"with torch.no_grad():\n",
	"    _ = model_base(**enc)"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "5ZIttQ--ZLI_",
	"outputId": "edf8b781-4325-4758-8724-c9c326d711c2"
	},
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Before adding: ['<', 'p', 'ending', '>', 'ĠA', 'CH', 'Ġdebit', 'Ġ12', '34']\n",
	"After adding:  ['<pending>', 'ĠA', 'CH', 'Ġdebit', 'Ġ12', '34']\n",
	"additional_special_tokens: ['<pending>', '<ignored>']\n",
	"additional_special_tokens_ids: [50257, 50258]\n",
	"Length of the token for before and after 50257 50259\n"
	]
	}
	]
	}
	]
	}
No results found