@houmanka
Created September 8, 2025 00:59
tokeniser_mistakes.ipynb
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4",
"authorship_tag": "ABX9TyOxJhTSk+di3Er3a3e1QjkS",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/houmanka/6b845afbad9cf106ae28245c98a2defd/tokeniser_mistakes.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "b5OJJ0mMPtsu"
},
"outputs": [],
"source": [
"from transformers import AutoTokenizer, AutoModel\n",
"\n",
"model_id = \"bert-base-uncased\"\n",
"tok = AutoTokenizer.from_pretrained(model_id)\n",
"model = AutoModel.from_pretrained(model_id)\n",
"\n",
"vocab = len(tok)\n",
"emb = model.get_input_embeddings().num_embeddings"
]
},
{
"cell_type": "code",
"source": [
"assert vocab == emb or vocab <= emb, f\"Tokenizer/model mismatch: {vocab} vs {emb}\""
],
"metadata": {
"id": "nE-pUH0YQW9D"
},
"execution_count": null,
"outputs": []
},
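{
"cell_type": "markdown",
"metadata": {},
"source": [
"If the assertion fails because the tokenizer knows more IDs than the model has embedding rows, the usual fix is to grow the embedding matrix. A minimal sketch reusing `tok`, `model`, `vocab`, and `emb` from the cells above (`resize_token_embeddings` is the standard Transformers call for this):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# If len(tok) > num_embeddings, tokenizer IDs would index past the end of\n",
"# the embedding matrix and the forward pass would fail.\n",
"if vocab > emb:\n",
"    model.resize_token_embeddings(len(tok))\n",
"\n",
"print(\"Embedding rows now:\", model.get_input_embeddings().num_embeddings)"
]
},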
{
"cell_type": "code",
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"openai-community/gpt2\")\n",
"\n",
"# Let's tokenize a batch of sentences of different lengths\n",
"sentences = [\"Hello!\", \"How are you doing today?\"]\n",
"\n",
"# set the padding\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"# This will cause a warning or error!\n",
"batch = tokenizer(sentences, padding=True, return_tensors=\"pt\")\n",
"print(batch)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Ejt1E9xKTft8",
"outputId": "9ffb36cd-0132-46a4-c6fe-5ffd91439231"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"{'input_ids': tensor([[15496, 0, 50256, 50256, 50256, 50256],\n",
" [ 2437, 389, 345, 1804, 1909, 30]]), 'attention_mask': tensor([[1, 1, 0, 0, 0, 0],\n",
" [1, 1, 1, 1, 1, 1]])}\n"
]
}
]
},
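{
"cell_type": "markdown",
"metadata": {},
"source": [
"Padding only worked above because `pad_token` was set first. A second common slip: GPT-2 pads on the right by default, which is usually the wrong side when generating with a decoder-only model. A small sketch of the usual fix, reusing `sentences` from the previous cell (`tok_left` and `batch_left` are just illustrative names):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"tok_left = AutoTokenizer.from_pretrained(\"openai-community/gpt2\")\n",
"tok_left.pad_token = tok_left.eos_token\n",
"tok_left.padding_side = \"left\"  # pad on the left so generation continues from real text\n",
"\n",
"batch_left = tok_left(sentences, padding=True, return_tensors=\"pt\")\n",
"print(batch_left[\"input_ids\"])\n",
"print(batch_left[\"attention_mask\"])  # padding now sits at the start of the shorter sentence"
]
},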
{
"cell_type": "code",
"source": [
"from transformers import AutoTokenizer, AutoModelForCausalLM\n",
"import torch\n",
"\n",
"model_id = \"gpt2\"\n",
"\n",
"# Base tokenizer/model (no extra tokens)\n",
"tok_base = AutoTokenizer.from_pretrained(model_id)\n",
"model_base = AutoModelForCausalLM.from_pretrained(model_id).eval()\n",
"\n",
"# Extended tokenizer that knows about <pending> and <ignored>\n",
"tok_ext = AutoTokenizer.from_pretrained(model_id)\n",
"added = tok_ext.add_special_tokens({\"additional_special_tokens\": [\"<pending>\", \"<ignored>\"]})\n",
"\n",
"if added > 0:\n",
" model_base.resize_token_embeddings(len(tok_ext)) # Resize model_base as well\n",
"\n",
"sample = \"<pending> ACH debit 1234\"\n",
"\n",
"print(\"Before adding:\", tok_base.convert_ids_to_tokens(tok_base(sample)[\"input_ids\"]))\n",
"print(\"After adding: \", tok_ext.convert_ids_to_tokens(tok_ext(sample)[\"input_ids\"]))\n",
"print(\"additional_special_tokens:\", tok_ext.additional_special_tokens)\n",
"print(\"additional_special_tokens_ids:\", tok_ext.additional_special_tokens_ids)\n",
"print(\"Length of the token for before and after\", len(tok_base), len(tok_ext))\n",
"\n",
"enc = tok_ext(sample, return_tensors=\"pt\")\n",
"with torch.no_grad():\n",
" _ = model_base(**enc)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "5ZIttQ--ZLI_",
"outputId": "edf8b781-4325-4758-8724-c9c326d711c2"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Before adding: ['<', 'p', 'ending', '>', 'ĠA', 'CH', 'Ġdebit', 'Ġ12', '34']\n",
"After adding: ['<pending>', 'ĠA', 'CH', 'Ġdebit', 'Ġ12', '34']\n",
"additional_special_tokens: ['<pending>', '<ignored>']\n",
"additional_special_tokens_ids: [50257, 50258]\n",
"Length of the token for before and after 50257 50259\n"
]
}
]
}
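,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The forward pass above only succeeds because `model_base` was resized to match `tok_ext`. As a sketch of the mistake itself, a fresh, un-resized GPT-2 fed the extended tokenizer's IDs typically fails with an `IndexError` on CPU (`model_unresized` and `enc_ext` below are just illustrative names):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForCausalLM\n",
"import torch\n",
"\n",
"# Fresh GPT-2 that has NOT been resized: 50257 embedding rows, while\n",
"# tok_ext can emit IDs 50257 (<pending>) and 50258 (<ignored>).\n",
"model_unresized = AutoModelForCausalLM.from_pretrained(\"gpt2\").eval()\n",
"\n",
"enc_ext = tok_ext(sample, return_tensors=\"pt\")\n",
"try:\n",
"    with torch.no_grad():\n",
"        _ = model_unresized(**enc_ext)\n",
"except (IndexError, RuntimeError) as err:  # IndexError on CPU, device-side assert on CUDA\n",
"    print(\"Out-of-range token id:\", err)"
]
}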
]
}