Created
September 8, 2025 00:59
-
-
Save houmanka/6b845afbad9cf106ae28245c98a2defd to your computer and use it in GitHub Desktop.
tokeniser_mistakes.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "provenance": [], | |
| "gpuType": "T4", | |
| "authorship_tag": "ABX9TyOxJhTSk+di3Er3a3e1QjkS", | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| }, | |
| "accelerator": "GPU" | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/houmanka/6b845afbad9cf106ae28245c98a2defd/tokeniser_mistakes.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "id": "b5OJJ0mMPtsu" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from transformers import AutoTokenizer, AutoModel\n", | |
| "\n", | |
| "model_id = \"bert-base-uncased\"\n", | |
| "tok = AutoTokenizer.from_pretrained(model_id)\n", | |
| "model = AutoModel.from_pretrained(model_id)\n", | |
| "\n", | |
| "vocab = len(tok)\n", | |
| "emb = model.get_input_embeddings().num_embeddings" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "assert vocab == emb or vocab <= emb, f\"Tokenizer/model mismatch: {vocab} vs {emb}\"" | |
| ], | |
| "metadata": { | |
| "id": "nE-pUH0YQW9D" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "from transformers import AutoTokenizer\n", | |
| "\n", | |
| "tokenizer = AutoTokenizer.from_pretrained(\"openai-community/gpt2\")\n", | |
| "\n", | |
| "# Let's tokenize a batch of sentences of different lengths\n", | |
| "sentences = [\"Hello!\", \"How are you doing today?\"]\n", | |
| "\n", | |
| "# set the padding\n", | |
| "tokenizer.pad_token = tokenizer.eos_token\n", | |
| "\n", | |
| "# This will cause a warning or error!\n", | |
| "batch = tokenizer(sentences, padding=True, return_tensors=\"pt\")\n", | |
| "print(batch)" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "Ejt1E9xKTft8", | |
| "outputId": "9ffb36cd-0132-46a4-c6fe-5ffd91439231" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "{'input_ids': tensor([[15496, 0, 50256, 50256, 50256, 50256],\n", | |
| " [ 2437, 389, 345, 1804, 1909, 30]]), 'attention_mask': tensor([[1, 1, 0, 0, 0, 0],\n", | |
| " [1, 1, 1, 1, 1, 1]])}\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "from transformers import AutoTokenizer, AutoModelForCausalLM\n", | |
| "import torch\n", | |
| "\n", | |
| "model_id = \"gpt2\"\n", | |
| "\n", | |
| "# Base tokenizer/model (no extra tokens)\n", | |
| "tok_base = AutoTokenizer.from_pretrained(model_id)\n", | |
| "model_base = AutoModelForCausalLM.from_pretrained(model_id).eval()\n", | |
| "\n", | |
| "# Extended tokenizer that knows about <pending> and <ignored>\n", | |
| "tok_ext = AutoTokenizer.from_pretrained(model_id)\n", | |
| "added = tok_ext.add_special_tokens({\"additional_special_tokens\": [\"<pending>\", \"<ignored>\"]})\n", | |
| "\n", | |
| "if added > 0:\n", | |
| " model_base.resize_token_embeddings(len(tok_ext)) # Resize model_base as well\n", | |
| "\n", | |
| "sample = \"<pending> ACH debit 1234\"\n", | |
| "\n", | |
| "print(\"Before adding:\", tok_base.convert_ids_to_tokens(tok_base(sample)[\"input_ids\"]))\n", | |
| "print(\"After adding: \", tok_ext.convert_ids_to_tokens(tok_ext(sample)[\"input_ids\"]))\n", | |
| "print(\"additional_special_tokens:\", tok_ext.additional_special_tokens)\n", | |
| "print(\"additional_special_tokens_ids:\", tok_ext.additional_special_tokens_ids)\n", | |
| "print(\"Length of the token for before and after\", len(tok_base), len(tok_ext))\n", | |
| "\n", | |
| "enc = tok_ext(sample, return_tensors=\"pt\")\n", | |
| "with torch.no_grad():\n", | |
| " _ = model_base(**enc)" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "5ZIttQ--ZLI_", | |
| "outputId": "edf8b781-4325-4758-8724-c9c326d711c2" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Before adding: ['<', 'p', 'ending', '>', 'ĠA', 'CH', 'Ġdebit', 'Ġ12', '34']\n", | |
| "After adding: ['<pending>', 'ĠA', 'CH', 'Ġdebit', 'Ġ12', '34']\n", | |
| "additional_special_tokens: ['<pending>', '<ignored>']\n", | |
| "additional_special_tokens_ids: [50257, 50258]\n", | |
| "Length of the token for before and after 50257 50259\n" | |
| ] | |
| } | |
| ] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment