Skip to content

Instantly share code, notes, and snippets.

@p208p2002
Last active February 22, 2024 01:09
Show Gist options
  • Select an option

  • Save p208p2002/adf1d4235b9567227d01315beb4b210e to your computer and use it in GitHub Desktop.

Select an option

Save p208p2002/adf1d4235b9567227d01315beb4b210e to your computer and use it in GitHub Desktop.
fix_chatglm_tokenizer.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyOPKPuHs7Qt6P83QIJIl8Y3",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"8fc8ba01fa094db1adf7c120cbbce6f9": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_c50dc3f195474640aa8b514719f1b578",
"IPY_MODEL_be40ede4ecc94fef81a79e23485172f8",
"IPY_MODEL_b95efb4c53994317909a0aeb42f605c6"
],
"layout": "IPY_MODEL_6663767588c64625ad7c8fc81f1b76ac"
}
},
"c50dc3f195474640aa8b514719f1b578": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_de54e2933f61453cb317f5043cb7a536",
"placeholder": "​",
"style": "IPY_MODEL_35fc132ba0a14a6297fb78e91d9bf2b4",
"value": "tokenizer_config.json: 100%"
}
},
"be40ede4ecc94fef81a79e23485172f8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_076a73f22cfc40f9afb4dc74ca450803",
"max": 518,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_2a55f8e77dba42eebdbe806cb34df623",
"value": 518
}
},
"b95efb4c53994317909a0aeb42f605c6": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_b9ea7ad532c143c39f7aa369dc2d753a",
"placeholder": "​",
"style": "IPY_MODEL_8accd65e27b84ea2b7d52604707d5e8b",
"value": " 518/518 [00:00<00:00, 1.33kB/s]"
}
},
"6663767588c64625ad7c8fc81f1b76ac": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"de54e2933f61453cb317f5043cb7a536": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"35fc132ba0a14a6297fb78e91d9bf2b4": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"076a73f22cfc40f9afb4dc74ca450803": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"2a55f8e77dba42eebdbe806cb34df623": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"b9ea7ad532c143c39f7aa369dc2d753a": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"8accd65e27b84ea2b7d52604707d5e8b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"51123fb3424c48cb89ea5ec161da7212": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_a72783ee830e4c4eb756cc0edb15b14c",
"IPY_MODEL_7edec9f615794365b49c2c5d4a1da4fe",
"IPY_MODEL_82f8a511328f428487f16b68065ad1c2"
],
"layout": "IPY_MODEL_1a4db688377445c4b5179d9e2b8f1820"
}
},
"a72783ee830e4c4eb756cc0edb15b14c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_0fbc9c6e3e6f4e95b3187688b4f8ab12",
"placeholder": "​",
"style": "IPY_MODEL_4b00803786d44c0f8a4b98ac32ed794a",
"value": "tokenization_chatglm.py: 100%"
}
},
"7edec9f615794365b49c2c5d4a1da4fe": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_c4e745d8c08e49f099a97a7f006c314b",
"max": 12998,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_55d0230a396845a28668ec8df80b1396",
"value": 12998
}
},
"82f8a511328f428487f16b68065ad1c2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_8802d289102b48fe8c010d631a1e364d",
"placeholder": "​",
"style": "IPY_MODEL_2c7ccd6c38e242e98d8f98bf2e97b7bb",
"value": " 13.0k/13.0k [00:00<00:00, 309kB/s]"
}
},
"1a4db688377445c4b5179d9e2b8f1820": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"0fbc9c6e3e6f4e95b3187688b4f8ab12": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"4b00803786d44c0f8a4b98ac32ed794a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"c4e745d8c08e49f099a97a7f006c314b": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"55d0230a396845a28668ec8df80b1396": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"8802d289102b48fe8c010d631a1e364d": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"2c7ccd6c38e242e98d8f98bf2e97b7bb": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"2864ec47e7f74f68abec73b447d4a0a8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_b1168308ad1b4276b3595d537e552784",
"IPY_MODEL_8d67a6fb824e4928ab2bb43a8d158067",
"IPY_MODEL_bb19057e154c4cdca8a756fc6689dbe0"
],
"layout": "IPY_MODEL_64a05f7232a144d2a6deb914a4093d70"
}
},
"b1168308ad1b4276b3595d537e552784": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_b4fdf2dd42a64080beae9b2f6cc9f105",
"placeholder": "​",
"style": "IPY_MODEL_a1ef1ea38b254de1962c9a7e1133795b",
"value": "tokenizer.model: 100%"
}
},
"8d67a6fb824e4928ab2bb43a8d158067": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_63516e4c953045aeb3de514ca61170ae",
"max": 1018370,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_0fb9ac5f385f4927982269659ee473ec",
"value": 1018370
}
},
"bb19057e154c4cdca8a756fc6689dbe0": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_99bc50c7d4fc4661934e242774c15e78",
"placeholder": "​",
"style": "IPY_MODEL_608d7e2902754cb6bb805d0607c4fede",
"value": " 1.02M/1.02M [00:00<00:00, 10.0MB/s]"
}
},
"64a05f7232a144d2a6deb914a4093d70": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"b4fdf2dd42a64080beae9b2f6cc9f105": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"a1ef1ea38b254de1962c9a7e1133795b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"63516e4c953045aeb3de514ca61170ae": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"0fb9ac5f385f4927982269659ee473ec": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"99bc50c7d4fc4661934e242774c15e78": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"608d7e2902754cb6bb805d0607c4fede": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/p208p2002/adf1d4235b9567227d01315beb4b210e/fix_chatglm_tokenizer.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"source": [
"from transformers import AutoTokenizer\n",
"from tokenizers import AddedToken"
],
"metadata": {
"id": "AjQFmx-Ywzz7"
},
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Reference input_ids produced by ChatGLM's official build_chat_input method\n",
"tokenizer = AutoTokenizer.from_pretrained(\"THUDM/chatglm3-6b\",trust_remote_code = True)\n",
"chatglm_official_input_ids = tokenizer.build_chat_input(\"hi\", [])[\"input_ids\"][0].tolist()\n",
"chatglm_official_input_tokens = tokenizer.convert_ids_to_tokens(chatglm_official_input_ids)\n",
"print(f\"{chatglm_official_input_ids=}\")\n",
"print(f\"{chatglm_official_input_tokens=}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 336,
"referenced_widgets": [
"8fc8ba01fa094db1adf7c120cbbce6f9",
"c50dc3f195474640aa8b514719f1b578",
"be40ede4ecc94fef81a79e23485172f8",
"b95efb4c53994317909a0aeb42f605c6",
"6663767588c64625ad7c8fc81f1b76ac",
"de54e2933f61453cb317f5043cb7a536",
"35fc132ba0a14a6297fb78e91d9bf2b4",
"076a73f22cfc40f9afb4dc74ca450803",
"2a55f8e77dba42eebdbe806cb34df623",
"b9ea7ad532c143c39f7aa369dc2d753a",
"8accd65e27b84ea2b7d52604707d5e8b",
"51123fb3424c48cb89ea5ec161da7212",
"a72783ee830e4c4eb756cc0edb15b14c",
"7edec9f615794365b49c2c5d4a1da4fe",
"82f8a511328f428487f16b68065ad1c2",
"1a4db688377445c4b5179d9e2b8f1820",
"0fbc9c6e3e6f4e95b3187688b4f8ab12",
"4b00803786d44c0f8a4b98ac32ed794a",
"c4e745d8c08e49f099a97a7f006c314b",
"55d0230a396845a28668ec8df80b1396",
"8802d289102b48fe8c010d631a1e364d",
"2c7ccd6c38e242e98d8f98bf2e97b7bb",
"2864ec47e7f74f68abec73b447d4a0a8",
"b1168308ad1b4276b3595d537e552784",
"8d67a6fb824e4928ab2bb43a8d158067",
"bb19057e154c4cdca8a756fc6689dbe0",
"64a05f7232a144d2a6deb914a4093d70",
"b4fdf2dd42a64080beae9b2f6cc9f105",
"a1ef1ea38b254de1962c9a7e1133795b",
"63516e4c953045aeb3de514ca61170ae",
"0fb9ac5f385f4927982269659ee473ec",
"99bc50c7d4fc4661934e242774c15e78",
"608d7e2902754cb6bb805d0607c4fede"
]
},
"id": "Bl04u95n5AGz",
"outputId": "08089504-9017-482e-a837-d81ef8884d49"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n",
"The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
"To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
"You will be able to reuse this secret in all of your notebooks.\n",
"Please note that authentication is recommended but still optional to access public models or datasets.\n",
" warnings.warn(\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"tokenizer_config.json: 0%| | 0.00/518 [00:00<?, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "8fc8ba01fa094db1adf7c120cbbce6f9"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"tokenization_chatglm.py: 0%| | 0.00/13.0k [00:00<?, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "51123fb3424c48cb89ea5ec161da7212"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"A new version of the following files was downloaded from https://huggingface.co/THUDM/chatglm3-6b:\n",
"- tokenization_chatglm.py\n",
". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"tokenizer.model: 0%| | 0.00/1.02M [00:00<?, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "2864ec47e7f74f68abec73b447d4a0a8"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"chatglm_official_input_ids=[64790, 64792, 64795, 30910, 13, 14980, 64796]\n",
"chatglm_official_input_tokens=['[gMASK]', 'sop', '<|user|>', '▁', '<0x0A>', '▁hi', '<|assistant|>']\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# input_ids from the unmodified tokenizer when using apply_chat_template\n",
"tokenizer = AutoTokenizer.from_pretrained(\"THUDM/chatglm3-6b\",trust_remote_code = True)\n",
"current_version_input_ids = tokenizer.apply_chat_template(\n",
" conversation=[\n",
" {\"role\": \"user\", \"content\":\"hi\"},\n",
" ],\n",
" add_generation_prompt=True\n",
")\n",
"current_version_input_tokens = tokenizer.convert_ids_to_tokens(current_version_input_ids)\n",
"print(f\"{current_version_input_ids}\")\n",
"print(f\"{current_version_input_tokens}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Hd2BlLXj5V2Z",
"outputId": "2feedcfc-0080-4f21-c6f1-53e69cc82c7e"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[790, 30927, 30944, 2080, 30984, 30996, 30917, 404, 31002, 31007, 4865, 31007, 30994, 30910, 13, 14980, 31002, 31007, 530, 18971, 31007, 30994]\n",
"['▁[', 'g', 'M', 'AS', 'K', ']', 's', 'op', '<', '|', 'user', '|', '>', '▁', '<0x0A>', '▁hi', '<', '|', 'ass', 'istant', '|', '>']\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# input_ids from apply_chat_template after adding the chat marker tokens to the tokenizer\n",
"tokenizer = AutoTokenizer.from_pretrained(\"THUDM/chatglm3-6b\",trust_remote_code = True)\n",
"\n",
"# fix the chat_template with extra white space ↓\n",
"tokenizer.chat_template = \"\"\"\n",
"{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n",
" {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n",
" {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}\n",
"\"\"\".strip()\n",
"\n",
"tokenizer.add_tokens(AddedToken(\"<|user|>\"))\n",
"tokenizer.add_tokens(\"<|assistant|>\")\n",
"tokenizer.add_tokens(\"[gMASK]\")\n",
"tokenizer.add_tokens(\"sop\")\n",
"\n",
"add_token_input_ids = tokenizer.apply_chat_template(\n",
" conversation=[\n",
" {\"role\": \"user\", \"content\":\"hi\"},\n",
" ],\n",
" add_generation_prompt=True\n",
")\n",
"\n",
"add_token_input_tokens = tokenizer.convert_ids_to_tokens(add_token_input_ids)\n",
"\n",
"print(f\"{add_token_input_ids=}\")\n",
"print(f\"{add_token_input_tokens}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "W7NjF4Qqw9jU",
"outputId": "88078a04-e2cd-4240-e5c6-766b12b6c2aa"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"add_token_input_ids=[64790, 64792, 64795, 30910, 13, 14980, 64796]\n",
"['[gMASK]', 'sop', '<|user|>', '▁', '<0x0A>', '▁hi', '<|assistant|>']\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"tokenizer.save_pretrained(\"fix_chatglm3_tokenizer\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "dNGANOX6_o5D",
"outputId": "b95fc5b1-992f-4749-b38f-160dcb93ecba"
},
"execution_count": 5,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"('fix_chatglm3_tokenizer/tokenizer_config.json',\n",
" 'fix_chatglm3_tokenizer/special_tokens_map.json',\n",
" 'fix_chatglm3_tokenizer/tokenizer.model',\n",
" 'fix_chatglm3_tokenizer/added_tokens.json')"
]
},
"metadata": {},
"execution_count": 5
}
]
},
{
"cell_type": "code",
"source": [
"assert str(chatglm_official_input_ids) == str(add_token_input_ids)"
],
"metadata": {
"id": "EF6Z-YT77TbK"
},
"execution_count": 6,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment