Skip to content

Instantly share code, notes, and snippets.

@alonsosilvaallende
Created January 21, 2026 06:50
Show Gist options
  • Select an option

  • Save alonsosilvaallende/e27dc02662364fa14df96de4befabdd5 to your computer and use it in GitHub Desktop.

Select an option

Save alonsosilvaallende/e27dc02662364fa14df96de4befabdd5 to your computer and use it in GitHub Desktop.
Understanding_Tokenizers.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyOKOd6oU+h/gOMSJPu4ArMn",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"0b574b40930348f391f76cb710a5fef0": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_9e6c507ef0d1480d957c574a24ae2be3",
"IPY_MODEL_affeaccb52ea4533914e0e609ac52896",
"IPY_MODEL_a5263be22bc942c6b0292acf7b84dde8"
],
"layout": "IPY_MODEL_293f22e1d6ac4a3b953283d67cd77a50"
}
},
"9e6c507ef0d1480d957c574a24ae2be3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_be10465a38ea4c5a800b42c3d66cbd64",
"placeholder": "​",
"style": "IPY_MODEL_4d61bfc6082c40f9b2e404c4b113423f",
"value": "tokenizer_config.json: "
}
},
"affeaccb52ea4533914e0e609ac52896": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_bced312cb926480c96b3a3925c8df7ad",
"max": 1,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_1d1fdfe84df6457e946511ac904d7e88",
"value": 1
}
},
"a5263be22bc942c6b0292acf7b84dde8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_9be8172f8f654d7d8e46ee1a8b2adacc",
"placeholder": "​",
"style": "IPY_MODEL_cf5c27b2fb5f44eca82ae420f09d56be",
"value": " 9.73k/? [00:00<00:00, 748kB/s]"
}
},
"293f22e1d6ac4a3b953283d67cd77a50": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"be10465a38ea4c5a800b42c3d66cbd64": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"4d61bfc6082c40f9b2e404c4b113423f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"bced312cb926480c96b3a3925c8df7ad": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": "20px"
}
},
"1d1fdfe84df6457e946511ac904d7e88": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"9be8172f8f654d7d8e46ee1a8b2adacc": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"cf5c27b2fb5f44eca82ae420f09d56be": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"989777ad00b047a8aacb29720f2089e2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_bb1107fb68034380a3b7772130ba9a4b",
"IPY_MODEL_05a78d4ad2a94e7392f7940bf194f6d9",
"IPY_MODEL_3f07bbd6313a49f3bbfe1fd15f275c1d"
],
"layout": "IPY_MODEL_fa1e711628d346c580e689187a5cba19"
}
},
"bb1107fb68034380a3b7772130ba9a4b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_54dade05b2324fe19a52a64be9029961",
"placeholder": "​",
"style": "IPY_MODEL_957cc851105f46a882da35e647610549",
"value": "vocab.json: "
}
},
"05a78d4ad2a94e7392f7940bf194f6d9": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_226ec5656834484ba97e606d2b0a2913",
"max": 1,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_db36b50df77f40d0bc9d7ba605b30006",
"value": 1
}
},
"3f07bbd6313a49f3bbfe1fd15f275c1d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_c570da8d73f440baaecc4d28ed2f172e",
"placeholder": "​",
"style": "IPY_MODEL_f62e532f460448f797fee8b62c48beaf",
"value": " 2.78M/? [00:00<00:00, 33.3MB/s]"
}
},
"fa1e711628d346c580e689187a5cba19": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"54dade05b2324fe19a52a64be9029961": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"957cc851105f46a882da35e647610549": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"226ec5656834484ba97e606d2b0a2913": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": "20px"
}
},
"db36b50df77f40d0bc9d7ba605b30006": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"c570da8d73f440baaecc4d28ed2f172e": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"f62e532f460448f797fee8b62c48beaf": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"8fcb03020e4a4f17a3c10db5f9c0f4ce": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_3ac30d12448a4776968d6c6afbb95373",
"IPY_MODEL_722f642c85f44485bb49bdbae30a9bb4",
"IPY_MODEL_4f11c5da0ebc4f71aaffd36f0b9cf926"
],
"layout": "IPY_MODEL_dcdf712cc5d24bdbac7b29cb6938eb50"
}
},
"3ac30d12448a4776968d6c6afbb95373": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_08607b5313484ca281244515722ddbb6",
"placeholder": "​",
"style": "IPY_MODEL_ba7cf666ba864da585c7bf6d0069a021",
"value": "merges.txt: "
}
},
"722f642c85f44485bb49bdbae30a9bb4": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_8322dd29d22448d39d2bbd58e4eaa5b6",
"max": 1,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_0efb1ff555b040a5ab33c4d44d691e59",
"value": 1
}
},
"4f11c5da0ebc4f71aaffd36f0b9cf926": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_da3642cb11394068a27c7242909780c1",
"placeholder": "​",
"style": "IPY_MODEL_6dd2f219748c433cb49427749a748807",
"value": " 1.67M/? [00:00<00:00, 46.1MB/s]"
}
},
"dcdf712cc5d24bdbac7b29cb6938eb50": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"08607b5313484ca281244515722ddbb6": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"ba7cf666ba864da585c7bf6d0069a021": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"8322dd29d22448d39d2bbd58e4eaa5b6": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": "20px"
}
},
"0efb1ff555b040a5ab33c4d44d691e59": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"da3642cb11394068a27c7242909780c1": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"6dd2f219748c433cb49427749a748807": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"4a035fa8367341c18ad3c8f90b95d4c8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_260bdcd75f1b4258a992585113ee588f",
"IPY_MODEL_3517d460e1a54b768c5aba83f826d9b5",
"IPY_MODEL_bcb053042f9b495e87414f419b4f0e1b"
],
"layout": "IPY_MODEL_0f0c9421c014416286e44e5229bcb31b"
}
},
"260bdcd75f1b4258a992585113ee588f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_8dfa6904989841bbacc2c59b64ecb09b",
"placeholder": "​",
"style": "IPY_MODEL_49ec998705474f638f066ede6ca9fc63",
"value": "tokenizer.json: 100%"
}
},
"3517d460e1a54b768c5aba83f826d9b5": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_b4801c75518d492eb894b895f990b300",
"max": 11422654,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_40f85013f5664c689114758f5bece86b",
"value": 11422654
}
},
"bcb053042f9b495e87414f419b4f0e1b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_50c25a3f033947da9375471a93b92b07",
"placeholder": "​",
"style": "IPY_MODEL_41dba62478274dac9bd00e68b24306b8",
"value": " 11.4M/11.4M [00:00<00:00, 91.7kB/s]"
}
},
"0f0c9421c014416286e44e5229bcb31b": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"8dfa6904989841bbacc2c59b64ecb09b": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"49ec998705474f638f066ede6ca9fc63": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"b4801c75518d492eb894b895f990b300": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"40f85013f5664c689114758f5bece86b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"50c25a3f033947da9375471a93b92b07": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"41dba62478274dac9bd00e68b24306b8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"e3216531346f4d2a83607855b11ef48c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_031f050b2b2845c086533089feaac1df",
"IPY_MODEL_59f4c1c62c734840981fd438249dfd88",
"IPY_MODEL_b87bd9c94c2d4fa5969ef49539355756"
],
"layout": "IPY_MODEL_d64a1730b67a48dea897e462ce7cd3b0"
}
},
"031f050b2b2845c086533089feaac1df": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_13e4655bda7c4457910a8672fa56d53e",
"placeholder": "​",
"style": "IPY_MODEL_abec60ab0d7541a8bb86d448615dcc5b",
"value": "tokenizer_config.json: "
}
},
"59f4c1c62c734840981fd438249dfd88": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_a287d86221e748cbb80c9f0902008b82",
"max": 1,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_c6b73a43593648f5ad6a652c2c1a2267",
"value": 1
}
},
"b87bd9c94c2d4fa5969ef49539355756": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_495cd8e87783407f89f7d058e0dd6e1e",
"placeholder": "​",
"style": "IPY_MODEL_4c1dec4b849e4db5bfd3fdc73912f59e",
"value": " 141k/? [00:00<00:00, 9.03MB/s]"
}
},
"d64a1730b67a48dea897e462ce7cd3b0": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"13e4655bda7c4457910a8672fa56d53e": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"abec60ab0d7541a8bb86d448615dcc5b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"a287d86221e748cbb80c9f0902008b82": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": "20px"
}
},
"c6b73a43593648f5ad6a652c2c1a2267": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"495cd8e87783407f89f7d058e0dd6e1e": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"4c1dec4b849e4db5bfd3fdc73912f59e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"6870bd6f061345779f6971d6f23858f9": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_94a137889d3748138cfbd115cab5a1b9",
"IPY_MODEL_ae7ef11818fb4062bf90bb1e23c6faaa",
"IPY_MODEL_1ef9a88a75624d5fac59495231cf66cd"
],
"layout": "IPY_MODEL_c2ec3d60a7d84334b55b6775c08b278b"
}
},
"94a137889d3748138cfbd115cab5a1b9": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_36aa549f24434103b31d8619962fc2b3",
"placeholder": "​",
"style": "IPY_MODEL_d0dc7cc5ecb9486e9e3485eafd47eb4b",
"value": "tokenizer.model: 100%"
}
},
"ae7ef11818fb4062bf90bb1e23c6faaa": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_02203f7643f049a5932ab7ea8049999d",
"max": 587404,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_fdeade0d22aa42fdac1bbfcbdb22437b",
"value": 587404
}
},
"1ef9a88a75624d5fac59495231cf66cd": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_11f821925f784f05871300de7dfb70f6",
"placeholder": "​",
"style": "IPY_MODEL_c40babea8e284b11ade33d7256e30d63",
"value": " 587k/587k [00:00<00:00, 1.46MB/s]"
}
},
"c2ec3d60a7d84334b55b6775c08b278b": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"36aa549f24434103b31d8619962fc2b3": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"d0dc7cc5ecb9486e9e3485eafd47eb4b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"02203f7643f049a5932ab7ea8049999d": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"fdeade0d22aa42fdac1bbfcbdb22437b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"11f821925f784f05871300de7dfb70f6": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"c40babea8e284b11ade33d7256e30d63": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"72ba5bc386494b54a00d6390610ae3ef": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_2a713f6e8b6746aba7435f99564384c8",
"IPY_MODEL_5004480cc683446b8c6bd5f2f8e4e3fc",
"IPY_MODEL_024c5d8848a2468c94609157f5b9915e"
],
"layout": "IPY_MODEL_43f05329275840d081cc344b5884f8ff"
}
},
"2a713f6e8b6746aba7435f99564384c8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_1b17344f5c3945699d3fb51aba7059ce",
"placeholder": "​",
"style": "IPY_MODEL_3e68ed03e8ed44f8aa3636c7452c6777",
"value": "tokenizer.json: "
}
},
"5004480cc683446b8c6bd5f2f8e4e3fc": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_8a0cb438d9ef4b01acc5de3124586ee8",
"max": 1,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_581003d4da184e2d89ede83d553540e8",
"value": 1
}
},
"024c5d8848a2468c94609157f5b9915e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_e7f889ce966d409b93a515a97abc51d1",
"placeholder": "​",
"style": "IPY_MODEL_426501375aa448f08017bf25fb357cfa",
"value": " 1.96M/? [00:00<00:00, 53.2MB/s]"
}
},
"43f05329275840d081cc344b5884f8ff": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"1b17344f5c3945699d3fb51aba7059ce": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"3e68ed03e8ed44f8aa3636c7452c6777": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"8a0cb438d9ef4b01acc5de3124586ee8": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": "20px"
}
},
"581003d4da184e2d89ede83d553540e8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"e7f889ce966d409b93a515a97abc51d1": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"426501375aa448f08017bf25fb357cfa": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"befc13c0530349f8a659b08a4eed4b5e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_a64cac507c7449b08b38c83446d9fb22",
"IPY_MODEL_7c0370e8bf12479287275a8fdfd7ac75",
"IPY_MODEL_69754a6b775d478ca5edd2717d1d04cc"
],
"layout": "IPY_MODEL_0e193233f20b4abca4b3c286a09492a0"
}
},
"a64cac507c7449b08b38c83446d9fb22": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_4c2f9112ef884759ac6d8e7915139ddd",
"placeholder": "​",
"style": "IPY_MODEL_2d606887b67643b1b29a198b5964cfc0",
"value": "special_tokens_map.json: 100%"
}
},
"7c0370e8bf12479287275a8fdfd7ac75": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_801b377b5750415ab88c629c7eb42b28",
"max": 414,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_f18644ff8add47bdb91e04f39efde5e5",
"value": 414
}
},
"69754a6b775d478ca5edd2717d1d04cc": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_e3a43c4674fc4529ae5c1f27a15b8588",
"placeholder": "​",
"style": "IPY_MODEL_ad13e3954d2c43c18ec1ddfc411a8302",
"value": " 414/414 [00:00<00:00, 44.4kB/s]"
}
},
"0e193233f20b4abca4b3c286a09492a0": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"4c2f9112ef884759ac6d8e7915139ddd": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"2d606887b67643b1b29a198b5964cfc0": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"801b377b5750415ab88c629c7eb42b28": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"f18644ff8add47bdb91e04f39efde5e5": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"e3a43c4674fc4529ae5c1f27a15b8588": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"ad13e3954d2c43c18ec1ddfc411a8302": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/alonsosilvaallende/e27dc02662364fa14df96de4befabdd5/understanding_tokenizers.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"Language Models don't work with words, they work with tokens. They take text, convert it into tokens (integers), then predict which tokens should come next.\n",
"\n",
"Let's look at a [tokenizer app](https://huggingface.co/spaces/alonsosilva/tokenizer) or [another tokenizer app](https://huggingface.co/spaces/Xenova/the-tokenizer-playground).\n",
"\n",
"Let's consider a text we want to tokenize:"
],
"metadata": {
"id": "jVdN5NTa-GLl"
}
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "FisHEQif988T"
},
"outputs": [],
"source": [
"text = \"The dog eats the apples.\""
]
},
{
"cell_type": "markdown",
"source": [
"Each language model has its own tokenizer, so we need to specify which model we are going to use:"
],
"metadata": {
"id": "f8737bmD-X9l"
}
},
{
"cell_type": "code",
"source": [
"MODEL_ID = \"Qwen/Qwen3-0.6B\""
],
"metadata": {
"id": "0UNZu9nj-T5h"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"We download the model's tokenizer:"
],
"metadata": {
"id": "KowfvdtaSQvp"
}
},
{
"cell_type": "code",
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 255,
"referenced_widgets": [
"0b574b40930348f391f76cb710a5fef0",
"9e6c507ef0d1480d957c574a24ae2be3",
"affeaccb52ea4533914e0e609ac52896",
"a5263be22bc942c6b0292acf7b84dde8",
"293f22e1d6ac4a3b953283d67cd77a50",
"be10465a38ea4c5a800b42c3d66cbd64",
"4d61bfc6082c40f9b2e404c4b113423f",
"bced312cb926480c96b3a3925c8df7ad",
"1d1fdfe84df6457e946511ac904d7e88",
"9be8172f8f654d7d8e46ee1a8b2adacc",
"cf5c27b2fb5f44eca82ae420f09d56be",
"989777ad00b047a8aacb29720f2089e2",
"bb1107fb68034380a3b7772130ba9a4b",
"05a78d4ad2a94e7392f7940bf194f6d9",
"3f07bbd6313a49f3bbfe1fd15f275c1d",
"fa1e711628d346c580e689187a5cba19",
"54dade05b2324fe19a52a64be9029961",
"957cc851105f46a882da35e647610549",
"226ec5656834484ba97e606d2b0a2913",
"db36b50df77f40d0bc9d7ba605b30006",
"c570da8d73f440baaecc4d28ed2f172e",
"f62e532f460448f797fee8b62c48beaf",
"8fcb03020e4a4f17a3c10db5f9c0f4ce",
"3ac30d12448a4776968d6c6afbb95373",
"722f642c85f44485bb49bdbae30a9bb4",
"4f11c5da0ebc4f71aaffd36f0b9cf926",
"dcdf712cc5d24bdbac7b29cb6938eb50",
"08607b5313484ca281244515722ddbb6",
"ba7cf666ba864da585c7bf6d0069a021",
"8322dd29d22448d39d2bbd58e4eaa5b6",
"0efb1ff555b040a5ab33c4d44d691e59",
"da3642cb11394068a27c7242909780c1",
"6dd2f219748c433cb49427749a748807",
"4a035fa8367341c18ad3c8f90b95d4c8",
"260bdcd75f1b4258a992585113ee588f",
"3517d460e1a54b768c5aba83f826d9b5",
"bcb053042f9b495e87414f419b4f0e1b",
"0f0c9421c014416286e44e5229bcb31b",
"8dfa6904989841bbacc2c59b64ecb09b",
"49ec998705474f638f066ede6ca9fc63",
"b4801c75518d492eb894b895f990b300",
"40f85013f5664c689114758f5bece86b",
"50c25a3f033947da9375471a93b92b07",
"41dba62478274dac9bd00e68b24306b8"
]
},
"id": "-SeOmcfw-oZ9",
"outputId": "a3cdb5cf-0a48-4ac3-84bc-9f2e3f97fea7"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:104: UserWarning: \n",
"Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.\n",
"You are not authenticated with the Hugging Face Hub in this notebook.\n",
"If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).\n",
" warnings.warn(\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"tokenizer_config.json: 0.00B [00:00, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "0b574b40930348f391f76cb710a5fef0"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"vocab.json: 0.00B [00:00, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "989777ad00b047a8aacb29720f2089e2"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"merges.txt: 0.00B [00:00, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "8fcb03020e4a4f17a3c10db5f9c0f4ce"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"tokenizer.json: 0%| | 0.00/11.4M [00:00<?, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "4a035fa8367341c18ad3c8f90b95d4c8"
}
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"source": [
"We can tokenize our text:"
],
"metadata": {
"id": "bJ3pnVaq_OxO"
}
},
{
"cell_type": "code",
"source": [
"tokenizer.encode(text)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "0DcoxqOK-9ty",
"outputId": "adb30cec-21cc-4f2c-8723-bafeb89b8c04"
},
"execution_count": 4,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[785, 5562, 49677, 279, 40676, 13]"
]
},
"metadata": {},
"execution_count": 4
}
]
},
{
"cell_type": "markdown",
"source": [
"This list of integers corresponds to the token IDs."
],
"metadata": {
"id": "ifzfMQL3_dBX"
}
},
{
"cell_type": "markdown",
"source": [
"You can encode other texts:"
],
"metadata": {
"id": "iEGW8QOn_nQg"
}
},
{
"cell_type": "code",
"source": [
"_text = \"Le chien mange les pommes.\"\n",
"tokenizer.encode(_text)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "aqJ8_KoA_Vz3",
"outputId": "93b1df0f-4131-4fa6-874a-85455a3ade49"
},
"execution_count": 9,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[2304, 521, 3591, 59434, 3541, 29484, 8828, 13]"
]
},
"metadata": {},
"execution_count": 9
}
]
},
{
"cell_type": "markdown",
"source": [
"You can also modify the `MODEL_ID` to see how the tokens change (search for other models on [Hugging Face](https://huggingface.co/models)). For example:"
],
"metadata": {
"id": "llCHblD7AGbN"
}
},
{
"cell_type": "code",
"source": [
"MISTRAL_MODEL_ID = \"mistralai/Mistral-7B-Instruct-v0.3\" # if it doesn't work try \"MaziyarPanahi/Mistral-7B-v0.3\"\n",
"mistral_tokenizer = AutoTokenizer.from_pretrained(MISTRAL_MODEL_ID)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 145,
"referenced_widgets": [
"e3216531346f4d2a83607855b11ef48c",
"031f050b2b2845c086533089feaac1df",
"59f4c1c62c734840981fd438249dfd88",
"b87bd9c94c2d4fa5969ef49539355756",
"d64a1730b67a48dea897e462ce7cd3b0",
"13e4655bda7c4457910a8672fa56d53e",
"abec60ab0d7541a8bb86d448615dcc5b",
"a287d86221e748cbb80c9f0902008b82",
"c6b73a43593648f5ad6a652c2c1a2267",
"495cd8e87783407f89f7d058e0dd6e1e",
"4c1dec4b849e4db5bfd3fdc73912f59e",
"6870bd6f061345779f6971d6f23858f9",
"94a137889d3748138cfbd115cab5a1b9",
"ae7ef11818fb4062bf90bb1e23c6faaa",
"1ef9a88a75624d5fac59495231cf66cd",
"c2ec3d60a7d84334b55b6775c08b278b",
"36aa549f24434103b31d8619962fc2b3",
"d0dc7cc5ecb9486e9e3485eafd47eb4b",
"02203f7643f049a5932ab7ea8049999d",
"fdeade0d22aa42fdac1bbfcbdb22437b",
"11f821925f784f05871300de7dfb70f6",
"c40babea8e284b11ade33d7256e30d63",
"72ba5bc386494b54a00d6390610ae3ef",
"2a713f6e8b6746aba7435f99564384c8",
"5004480cc683446b8c6bd5f2f8e4e3fc",
"024c5d8848a2468c94609157f5b9915e",
"43f05329275840d081cc344b5884f8ff",
"1b17344f5c3945699d3fb51aba7059ce",
"3e68ed03e8ed44f8aa3636c7452c6777",
"8a0cb438d9ef4b01acc5de3124586ee8",
"581003d4da184e2d89ede83d553540e8",
"e7f889ce966d409b93a515a97abc51d1",
"426501375aa448f08017bf25fb357cfa",
"befc13c0530349f8a659b08a4eed4b5e",
"a64cac507c7449b08b38c83446d9fb22",
"7c0370e8bf12479287275a8fdfd7ac75",
"69754a6b775d478ca5edd2717d1d04cc",
"0e193233f20b4abca4b3c286a09492a0",
"4c2f9112ef884759ac6d8e7915139ddd",
"2d606887b67643b1b29a198b5964cfc0",
"801b377b5750415ab88c629c7eb42b28",
"f18644ff8add47bdb91e04f39efde5e5",
"e3a43c4674fc4529ae5c1f27a15b8588",
"ad13e3954d2c43c18ec1ddfc411a8302"
]
},
"id": "_u_1oSse_704",
"outputId": "072506c7-b0fe-4677-af79-91fd375524cc"
},
"execution_count": 6,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"tokenizer_config.json: 0.00B [00:00, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "e3216531346f4d2a83607855b11ef48c"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"tokenizer.model: 0%| | 0.00/587k [00:00<?, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "6870bd6f061345779f6971d6f23858f9"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"tokenizer.json: 0.00B [00:00, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "72ba5bc386494b54a00d6390610ae3ef"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"special_tokens_map.json: 0%| | 0.00/414 [00:00<?, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "befc13c0530349f8a659b08a4eed4b5e"
}
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"source": [
"We tokenize our text:"
],
"metadata": {
"id": "JRo4QB8ETCYf"
}
},
{
"cell_type": "code",
"source": [
"text = \"The dog eats the apples.\"\n",
"mistral_tokenizer.encode(text)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "CBI2AxXpTPI7",
"outputId": "8270dab4-c4bb-4586-f1be-0442bc02242a"
},
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[1, 1183, 4682, 1085, 2217, 1040, 1747, 3583, 29491]"
]
},
"metadata": {},
"execution_count": 8
}
]
},
{
"cell_type": "markdown",
"source": [
"Even if the text is the same, the tokens are different."
],
"metadata": {
"id": "B1PqxQv8U2PB"
}
},
{
"cell_type": "markdown",
"source": [
"We can do the reverse operation. We take the tokens and convert them to text:"
],
"metadata": {
"id": "uTEvQDKiVMQu"
}
},
{
"cell_type": "code",
"source": [
"text = \"The dog eats the apples.\"\n",
"token_ids = tokenizer.encode(text)\n",
"token_ids"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "VM8IOoCJS1-2",
"outputId": "8899a90d-955e-4d72-9c4e-10ad2401483e"
},
"execution_count": 11,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[785, 5562, 49677, 279, 40676, 13]"
]
},
"metadata": {},
"execution_count": 11
}
]
},
{
"cell_type": "code",
"source": [
"tokenizer.decode(token_ids)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 36
},
"id": "4orhUh5AVizd",
"outputId": "9e54de58-a3fc-4a53-b8f8-1cdf164fb93b"
},
"execution_count": 12,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'The dog eats the apples.'"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
}
},
"metadata": {},
"execution_count": 12
}
]
},
{
"cell_type": "markdown",
"source": [
"Encoding a text and then decoding it should give the same original text."
],
"metadata": {
"id": "5L9Ojzk_V5Kc"
}
},
{
"cell_type": "markdown",
"source": [
"**Playing with tokenizers** reveals all sorts of interesting facts.\n",
"\n",
"Most common English words are assigned a single token. As demonstrated above:\n",
"\n",
"- \"The\": `785`\n",
"- \" dog\": `5562`\n",
"- \" eats\": `49677`\n",
"- \" the\": `279`\n",
"- \" apples\": `40676`\n",
"- \".\": `13`\n"
],
"metadata": {
"id": "sSNBz2bwWDgC"
}
},
{
"cell_type": "code",
"source": [
"text = \"The dog eats the apples.\"\n",
"token_ids = tokenizer.encode(text)\n",
"for token_id in token_ids:\n",
" print(f\"{tokenizer.decode(token_id).replace(\" \", \"_\")}: {token_id}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "FQJ5qq1yd5d7",
"outputId": "1a5c7e31-ae3a-41eb-b591-dcc83abd3cdf"
},
"execution_count": 33,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"The: 785\n",
"_dog: 5562\n",
"_eats: 49677\n",
"_the: 279\n",
"_apples: 40676\n",
".: 13\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"\n",
"Capitalization is important: \"The\" with a capital T corresponds to token `785`, but \"the\" with lowercase is `1782` and \" the\" with both a leading space and a lowercase t is token `279`.\n",
"\n",
"Many words also have a token that incorporates a leading space. This makes for much more efficient encoding of full sentences, since they can be encoded without needing to spend a token on each whitespace character."
],
"metadata": {
"id": "SZlLITyTYZQU"
}
},
{
"cell_type": "markdown",
"source": [
"Digits get their own tokens:\n",
"\n",
"- \"0\": `15`\n",
"- \"1\": `16`\n",
"- \"2\": `17`\n",
"- ...\n",
"- \"9\": `24`"
],
"metadata": {
"id": "CVTeVXe2YPmN"
}
},
{
"cell_type": "markdown",
"source": [
"Languages other than English suffer from less efficient tokenization."
],
"metadata": {
"id": "NT2aftGnYg0W"
}
},
{
"cell_type": "markdown",
"source": [
"\"Le chien mange les pommes.\" in French is encoded like this:\n",
"\n",
"- \"Le\": `2304`\n",
"- \" ch\": `521`\n",
"- \"ien\": `3591`\n",
"- \" mange\": `59434`\n",
"- \" les\": `3541`\n",
"- \" pom\": `29484`\n",
"- \"mes\": `8828`\n",
"- \".\" : `13`"
],
"metadata": {
"id": "Pffw4V24YHzO"
}
},
{
"cell_type": "code",
"source": [
"text = \"Le chien mange les pommes.\"\n",
"token_ids = tokenizer.encode(text)\n",
"for token_id in token_ids:\n",
" print(f\"{tokenizer.decode(token_id).replace(\" \", \"_\")}: {token_id}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "t-wx37_FYkfB",
"outputId": "51e1ce6d-c80c-408f-bc88-6e682e53544f"
},
"execution_count": 31,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Le: 2304\n",
"_ch: 521\n",
"ien: 3591\n",
"_mange: 59434\n",
"_les: 3541\n",
"_pom: 29484\n",
"mes: 8828\n",
".: 13\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"\"Il cane mangia le mele.\" in Italian is encoded like this:\n",
"\n",
"- \"Il\": `12050`\n",
"- \" cane\": `62235`\n",
"- \" mang\": `50196`\n",
"- \"ia\": `685`\n",
"- \" le\": `512`\n",
"- \" me\": `752`\n",
"- \"le\": `273`\n",
"- \".\" : `13`"
],
"metadata": {
"id": "R6PbOJMfZgqu"
}
},
{
"cell_type": "code",
"source": [
"text = \"Il cane mangia le mele.\"\n",
"token_ids = tokenizer.encode(text)\n",
"for token_id in token_ids:\n",
" print(f\"{tokenizer.decode(token_id).replace(\" \", \"_\")}: {token_id}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_skkUki-ZQ6T",
"outputId": "9202aa55-d0cf-4efd-b71b-d3a7a50325e7"
},
"execution_count": 32,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Il: 12050\n",
"_cane: 62235\n",
"_mang: 50196\n",
"ia: 685\n",
"_le: 512\n",
"_me: 752\n",
"le: 273\n",
".: 13\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"The majority of tokenizers are trained with the [byte-pair encoding algorithm](https://en.wikipedia.org/wiki/Byte-pair_encoding).\n",
"\n",
"We can obtain the number of tokens in the base vocabulary learned by the byte-pair encoding algorithm:"
],
"metadata": {
"id": "Zr0s1pFUXAqm"
}
},
{
"cell_type": "code",
"source": [
"tokenizer.vocab_size"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Q0KBkUq_VpnM",
"outputId": "8010fed1-5849-422e-d6e9-f04502b5c848"
},
"execution_count": 16,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"151643"
]
},
"metadata": {},
"execution_count": 16
}
]
},
{
"cell_type": "markdown",
"source": [
"The total number of tokens, including the added tokens, and the set of all tokens can be obtained:"
],
"metadata": {
"id": "IRL5H6JsXtOE"
}
},
{
"cell_type": "code",
"source": [
"len(tokenizer)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "k90EgleGWUPG",
"outputId": "75011ee9-07e8-4eb8-dca5-21b7657f0890"
},
"execution_count": 17,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"151669"
]
},
"metadata": {},
"execution_count": 17
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"\n",
"df = pd.DataFrame()\n",
"df[\"token_id\"] = range(len(tokenizer))\n",
"df[\"token\"] = [tokenizer.decode([i]) for i in range(len(tokenizer))]\n",
"df.head(20).style.hide(axis=\"index\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 676
},
"id": "7cNodk2TeVvf",
"outputId": "02f6ba84-5360-4919-8226-cdaf66b51d5e"
},
"execution_count": 36,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<pandas.io.formats.style.Styler at 0x7bb3693ad4c0>"
],
"text/html": [
"<style type=\"text/css\">\n",
"</style>\n",
"<table id=\"T_643c8\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th id=\"T_643c8_level0_col0\" class=\"col_heading level0 col0\" >token_id</th>\n",
" <th id=\"T_643c8_level0_col1\" class=\"col_heading level0 col1\" >token</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td id=\"T_643c8_row0_col0\" class=\"data row0 col0\" >0</td>\n",
" <td id=\"T_643c8_row0_col1\" class=\"data row0 col1\" >!</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_643c8_row1_col0\" class=\"data row1 col0\" >1</td>\n",
" <td id=\"T_643c8_row1_col1\" class=\"data row1 col1\" >\"</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_643c8_row2_col0\" class=\"data row2 col0\" >2</td>\n",
" <td id=\"T_643c8_row2_col1\" class=\"data row2 col1\" >#</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_643c8_row3_col0\" class=\"data row3 col0\" >3</td>\n",
" <td id=\"T_643c8_row3_col1\" class=\"data row3 col1\" >$</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_643c8_row4_col0\" class=\"data row4 col0\" >4</td>\n",
" <td id=\"T_643c8_row4_col1\" class=\"data row4 col1\" >%</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_643c8_row5_col0\" class=\"data row5 col0\" >5</td>\n",
" <td id=\"T_643c8_row5_col1\" class=\"data row5 col1\" >&</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_643c8_row6_col0\" class=\"data row6 col0\" >6</td>\n",
" <td id=\"T_643c8_row6_col1\" class=\"data row6 col1\" >'</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_643c8_row7_col0\" class=\"data row7 col0\" >7</td>\n",
" <td id=\"T_643c8_row7_col1\" class=\"data row7 col1\" >(</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_643c8_row8_col0\" class=\"data row8 col0\" >8</td>\n",
" <td id=\"T_643c8_row8_col1\" class=\"data row8 col1\" >)</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_643c8_row9_col0\" class=\"data row9 col0\" >9</td>\n",
" <td id=\"T_643c8_row9_col1\" class=\"data row9 col1\" >*</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_643c8_row10_col0\" class=\"data row10 col0\" >10</td>\n",
" <td id=\"T_643c8_row10_col1\" class=\"data row10 col1\" >+</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_643c8_row11_col0\" class=\"data row11 col0\" >11</td>\n",
" <td id=\"T_643c8_row11_col1\" class=\"data row11 col1\" >,</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_643c8_row12_col0\" class=\"data row12 col0\" >12</td>\n",
" <td id=\"T_643c8_row12_col1\" class=\"data row12 col1\" >-</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_643c8_row13_col0\" class=\"data row13 col0\" >13</td>\n",
" <td id=\"T_643c8_row13_col1\" class=\"data row13 col1\" >.</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_643c8_row14_col0\" class=\"data row14 col0\" >14</td>\n",
" <td id=\"T_643c8_row14_col1\" class=\"data row14 col1\" >/</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_643c8_row15_col0\" class=\"data row15 col0\" >15</td>\n",
" <td id=\"T_643c8_row15_col1\" class=\"data row15 col1\" >0</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_643c8_row16_col0\" class=\"data row16 col0\" >16</td>\n",
" <td id=\"T_643c8_row16_col1\" class=\"data row16 col1\" >1</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_643c8_row17_col0\" class=\"data row17 col0\" >17</td>\n",
" <td id=\"T_643c8_row17_col1\" class=\"data row17 col1\" >2</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_643c8_row18_col0\" class=\"data row18 col0\" >18</td>\n",
" <td id=\"T_643c8_row18_col1\" class=\"data row18 col1\" >3</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_643c8_row19_col0\" class=\"data row19 col0\" >19</td>\n",
" <td id=\"T_643c8_row19_col1\" class=\"data row19 col1\" >4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
]
},
"metadata": {},
"execution_count": 36
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment