Last active
August 16, 2021 05:42
-
-
Save Beomi/972c6442a9c15a22dfd1903d0bb0f577 to your computer and use it in GitHub Desktop.
2021.03.15. KcBERT MLM Finetune with Petition Dataset
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "name": "2021.03.15. KcBERT MLM Finetune with Petition Dataset", | |
| "provenance": [], | |
| "collapsed_sections": [], | |
| "toc_visible": true, | |
| "machine_shape": "hm", | |
| "authorship_tag": "ABX9TyPThxv5ZZ2ERKRzbL6DStPo", | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "accelerator": "GPU", | |
| "widgets": { | |
| "application/vnd.jupyter.widget-state+json": { | |
| "24b919ba13f34f6aa148dc6435e450d5": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HBoxModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "HBoxView", | |
| "_dom_classes": [], | |
| "_model_name": "HBoxModel", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "box_style": "", | |
| "layout": "IPY_MODEL_e184b36746744b1ea8faf0fece80a377", | |
| "_model_module": "@jupyter-widgets/controls", | |
| "children": [ | |
| "IPY_MODEL_7f1d80c730ca40f1ab8609986f9b7079", | |
| "IPY_MODEL_72aad332eb6b4534b8d9bce460905849", | |
| "IPY_MODEL_41e24ce51d304a89bffe7b7be661ba54" | |
| ] | |
| } | |
| }, | |
| "e184b36746744b1ea8faf0fece80a377": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "7f1d80c730ca40f1ab8609986f9b7079": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "HTMLView", | |
| "style": "IPY_MODEL_100c672a20e7416bacbdecf6a7417111", | |
| "_dom_classes": [], | |
| "description": "", | |
| "_model_name": "HTMLModel", | |
| "placeholder": "", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": "100%", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_ece5002e3e064a73ba3be461907b4a9e" | |
| } | |
| }, | |
| "72aad332eb6b4534b8d9bce460905849": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "FloatProgressModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "ProgressView", | |
| "style": "IPY_MODEL_ef854ae062a3411aa407fc8ef36c2fa9", | |
| "_dom_classes": [], | |
| "description": "", | |
| "_model_name": "FloatProgressModel", | |
| "bar_style": "success", | |
| "max": 20, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": 20, | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "orientation": "horizontal", | |
| "min": 0, | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_e69ee077a90e4d93852685ec424f6b1f" | |
| } | |
| }, | |
| "41e24ce51d304a89bffe7b7be661ba54": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "HTMLView", | |
| "style": "IPY_MODEL_12837fed7e2a46fea6eb1b2175c35c2f", | |
| "_dom_classes": [], | |
| "description": "", | |
| "_model_name": "HTMLModel", | |
| "placeholder": "", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": " 20/20 [00:09<00:00, 2.08it/s]", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_00cebb6154394ee884baf32814059c1e" | |
| } | |
| }, | |
| "100c672a20e7416bacbdecf6a7417111": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "DescriptionStyleModel", | |
| "description_width": "", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "ece5002e3e064a73ba3be461907b4a9e": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "ef854ae062a3411aa407fc8ef36c2fa9": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "ProgressStyleModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "ProgressStyleModel", | |
| "description_width": "", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "bar_color": null, | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "e69ee077a90e4d93852685ec424f6b1f": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "12837fed7e2a46fea6eb1b2175c35c2f": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "DescriptionStyleModel", | |
| "description_width": "", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "00cebb6154394ee884baf32814059c1e": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "a619d5c077494c628bfd589032f0057b": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HBoxModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "HBoxView", | |
| "_dom_classes": [], | |
| "_model_name": "HBoxModel", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "box_style": "", | |
| "layout": "IPY_MODEL_b687225b255a4b0a9945fb5a5b219f48", | |
| "_model_module": "@jupyter-widgets/controls", | |
| "children": [ | |
| "IPY_MODEL_ec0bea3e70644604b3e8253d0d2ae528", | |
| "IPY_MODEL_bb0da81e63fb4b1b8f29d0b80d86248d", | |
| "IPY_MODEL_211d736187bf49dd930bf6c61e3619b6" | |
| ] | |
| } | |
| }, | |
| "b687225b255a4b0a9945fb5a5b219f48": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "ec0bea3e70644604b3e8253d0d2ae528": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "HTMLView", | |
| "style": "IPY_MODEL_9e88a788fead47a1a90af692d94942ff", | |
| "_dom_classes": [], | |
| "description": "", | |
| "_model_name": "HTMLModel", | |
| "placeholder": "", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": " 0%", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_3b7c4bc39fca494dba816c92c0f32cc8" | |
| } | |
| }, | |
| "bb0da81e63fb4b1b8f29d0b80d86248d": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "FloatProgressModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "ProgressView", | |
| "style": "IPY_MODEL_fb9c04093d6d4850b5a8f35da73ecb56", | |
| "_dom_classes": [], | |
| "description": "", | |
| "_model_name": "FloatProgressModel", | |
| "bar_style": "danger", | |
| "max": 3704, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": 13, | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "orientation": "horizontal", | |
| "min": 0, | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_91e2e116a06849079b045fe5e916c54f" | |
| } | |
| }, | |
| "211d736187bf49dd930bf6c61e3619b6": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "HTMLView", | |
| "style": "IPY_MODEL_36bc81bec3644b8b99f98524eb7492b1", | |
| "_dom_classes": [], | |
| "description": "", | |
| "_model_name": "HTMLModel", | |
| "placeholder": "", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": " 13/3704 [00:01<13:01, 4.72it/s]", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_7713dc447df04ebe8c42fbe717a3767b" | |
| } | |
| }, | |
| "9e88a788fead47a1a90af692d94942ff": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "DescriptionStyleModel", | |
| "description_width": "", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "3b7c4bc39fca494dba816c92c0f32cc8": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "fb9c04093d6d4850b5a8f35da73ecb56": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "ProgressStyleModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "ProgressStyleModel", | |
| "description_width": "", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "bar_color": null, | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "91e2e116a06849079b045fe5e916c54f": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "36bc81bec3644b8b99f98524eb7492b1": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "DescriptionStyleModel", | |
| "description_width": "", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "7713dc447df04ebe8c42fbe717a3767b": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/Beomi/972c6442a9c15a22dfd1903d0bb0f577/2021-03-15-kcbert-mlm-finetune-with-petition-dataset.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "XxaDw3JIXrDf" | |
| }, | |
| "source": [ | |
| "# 필요한 패키지 설치\n", | |
| "\n", | |
| "- Korpora: 데이터셋 다운로드\n", | |
| "- emoji: 이모지코드\n", | |
| "- soynlp: Preprocessing\n", | |
| "- kss: 한국어 문장 분리기\n", | |
| "- transformers: MLM 학습 및 데이터셋\n", | |
| " - datasets\n", | |
| " - protobuf\n", | |
| " - sentencepiece" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "lqA2SU2sWqjR", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "outputId": "a592d3c8-0d98-4871-ba23-41bf6bd95c12" | |
| }, | |
| "source": [ | |
| "!pip install -q Korpora emoji soynlp \"kss<2.6\" transformers \"datasets >= 1.1.3\" \"sentencepiece != 0.1.92\" protobuf" | |
| ], | |
| "execution_count": 26, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "\u001b[?25l\r\u001b[K |█████ | 10 kB 31.3 MB/s eta 0:00:01\r\u001b[K |██████████ | 20 kB 18.5 MB/s eta 0:00:01\r\u001b[K |███████████████ | 30 kB 11.0 MB/s eta 0:00:01\r\u001b[K |███████████████████▉ | 40 kB 8.8 MB/s eta 0:00:01\r\u001b[K |████████████████████████▉ | 51 kB 5.0 MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▉ | 61 kB 5.5 MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 65 kB 2.5 MB/s \n", | |
| "\u001b[?25h" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "0nAbFcVFXzV4" | |
| }, | |
| "source": [ | |
| "# 예시용 데이터셋 다운로드\n", | |
| "\n", | |
| "- 여기서는 Korean petitions dataset(국민청원 데이터셋)을 사용\n", | |
| "- 전체 중 동의 수가 1000건 초과인 본문만 사용" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "3Z7ji3QnW7JG" | |
| }, | |
| "source": [ | |
| "from Korpora import Korpora" | |
| ], | |
| "execution_count": 27, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "erTvWvZCW91J", | |
| "outputId": "54739e2f-aa45-48ab-bcf4-859005a8ed0b" | |
| }, | |
| "source": [ | |
| "Korpora.fetch('korean_petitions', root_dir='./Korpora')" | |
| ], | |
| "execution_count": 28, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-08\n", | |
| "[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-09\n", | |
| "[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-10\n", | |
| "[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-11\n", | |
| "[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-12\n", | |
| "[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-01\n", | |
| "[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-02\n", | |
| "[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-03\n", | |
| "[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-04\n", | |
| "[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-05\n", | |
| "[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-06\n", | |
| "[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-07\n", | |
| "[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-08\n", | |
| "[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-09\n", | |
| "[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-10\n", | |
| "[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-11\n", | |
| "[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-12\n", | |
| "[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2019-01\n", | |
| "[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2019-02\n", | |
| "[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2019-03\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "Gk4EXcX3W-LO" | |
| }, | |
| "source": [ | |
| "from glob import glob" | |
| ], | |
| "execution_count": 29, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "XPQDC74FXAxb", | |
| "outputId": "85fbe915-26c5-45f0-d91a-1a3835d00e21" | |
| }, | |
| "source": [ | |
| "dataset = glob('./Korpora/korean_petitions/petitions*')\n", | |
| "dataset" | |
| ], | |
| "execution_count": 30, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "['./Korpora/korean_petitions/petitions_2018-06',\n", | |
| " './Korpora/korean_petitions/petitions_2018-05',\n", | |
| " './Korpora/korean_petitions/petitions_2018-01',\n", | |
| " './Korpora/korean_petitions/petitions_2017-11',\n", | |
| " './Korpora/korean_petitions/petitions_2017-09',\n", | |
| " './Korpora/korean_petitions/petitions_2018-10',\n", | |
| " './Korpora/korean_petitions/petitions_2019-03',\n", | |
| " './Korpora/korean_petitions/petitions_2019-02',\n", | |
| " './Korpora/korean_petitions/petitions_2018-03',\n", | |
| " './Korpora/korean_petitions/petitions_2017-08',\n", | |
| " './Korpora/korean_petitions/petitions_2019-01',\n", | |
| " './Korpora/korean_petitions/petitions_2018-11',\n", | |
| " './Korpora/korean_petitions/petitions_2018-09',\n", | |
| " './Korpora/korean_petitions/petitions_2018-07',\n", | |
| " './Korpora/korean_petitions/petitions_2017-10',\n", | |
| " './Korpora/korean_petitions/petitions_2018-12',\n", | |
| " './Korpora/korean_petitions/petitions_2018-02',\n", | |
| " './Korpora/korean_petitions/petitions_2018-08',\n", | |
| " './Korpora/korean_petitions/petitions_2018-04',\n", | |
| " './Korpora/korean_petitions/petitions_2017-12']" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 30 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "Ssdr7HBMX8Xo" | |
| }, | |
| "source": [ | |
| "# 데이터 로딩\n", | |
| "\n", | |
| "- pandas로 `content` 부분만 읽어 파일로 만들기\n", | |
| "- kss로 각 청원 게시글 내 문장 분리 \n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "eS_NOWeWXB-x" | |
| }, | |
| "source": [ | |
| "import pandas as pd\n", | |
| "from tqdm.auto import tqdm" | |
| ], | |
| "execution_count": 31, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 49, | |
| "referenced_widgets": [ | |
| "24b919ba13f34f6aa148dc6435e450d5", | |
| "e184b36746744b1ea8faf0fece80a377", | |
| "7f1d80c730ca40f1ab8609986f9b7079", | |
| "72aad332eb6b4534b8d9bce460905849", | |
| "41e24ce51d304a89bffe7b7be661ba54", | |
| "100c672a20e7416bacbdecf6a7417111", | |
| "ece5002e3e064a73ba3be461907b4a9e", | |
| "ef854ae062a3411aa407fc8ef36c2fa9", | |
| "e69ee077a90e4d93852685ec424f6b1f", | |
| "12837fed7e2a46fea6eb1b2175c35c2f", | |
| "00cebb6154394ee884baf32814059c1e" | |
| ] | |
| }, | |
| "id": "oXnOcNi8XDC8", | |
| "outputId": "0a3b73c5-973a-44bd-899b-5fe559d63523" | |
| }, | |
| "source": [ | |
| "df = pd.concat([pd.read_json(i, lines=True) for i in tqdm(dataset)])" | |
| ], | |
| "execution_count": 32, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "24b919ba13f34f6aa148dc6435e450d5", | |
| "version_minor": 0, | |
| "version_major": 2 | |
| }, | |
| "text/plain": [ | |
| " 0%| | 0/20 [00:00<?, ?it/s]" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| } | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 204 | |
| }, | |
| "id": "yH94XSQfXEHU", | |
| "outputId": "6154d041-bc63-4f8f-dc76-082c0e1bc94b" | |
| }, | |
| "source": [ | |
| "df.head()" | |
| ], | |
| "execution_count": 33, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>category</th>\n", | |
| " <th>begin</th>\n", | |
| " <th>end</th>\n", | |
| " <th>content</th>\n", | |
| " <th>num_agree</th>\n", | |
| " <th>petition_idx</th>\n", | |
| " <th>status</th>\n", | |
| " <th>title</th>\n", | |
| " <th>replies</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>미래</td>\n", | |
| " <td>2018-06-01</td>\n", | |
| " <td>2018-07-01</td>\n", | |
| " <td>중국과 5년간 일때문에 교류를 하면서, 통역을 맡은 아가씨들이 모두 조선족아가씨 였...</td>\n", | |
| " <td>9</td>\n", | |
| " <td>257860</td>\n", | |
| " <td>청원종료</td>\n", | |
| " <td>조선족은 중국사람 입니다!</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>기타</td>\n", | |
| " <td>2018-06-01</td>\n", | |
| " <td>2018-07-01</td>\n", | |
| " <td>안녕하세요 저는 18세 남자입니다 요즘 페북이나 청와대 홈페이지를 통해 청원들을 보...</td>\n", | |
| " <td>3</td>\n", | |
| " <td>257861</td>\n", | |
| " <td>청원종료</td>\n", | |
| " <td>청원의 문제점</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>문화/예술/체육/언론</td>\n", | |
| " <td>2018-06-01</td>\n", | |
| " <td>2018-07-01</td>\n", | |
| " <td>어제 오늘 중계보고 있으려니 속터지네요 감독작전도 무대책 ᆢ리시브 기본도 없는 선수...</td>\n", | |
| " <td>0</td>\n", | |
| " <td>257862</td>\n", | |
| " <td>청원종료</td>\n", | |
| " <td>여자대표 철수해주세요</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>정치개혁</td>\n", | |
| " <td>2018-06-01</td>\n", | |
| " <td>2018-07-01</td>\n", | |
| " <td>대통령님 덕분에 우리나라가 좋아졌다는걸 느껴요 항상 국민 옆에 계셔야 해요!</td>\n", | |
| " <td>0</td>\n", | |
| " <td>257863</td>\n", | |
| " <td>청원종료</td>\n", | |
| " <td>문대통령님 오랫동안 대통령하시면 안될까요?</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>육아/교육</td>\n", | |
| " <td>2018-06-01</td>\n", | |
| " <td>2018-07-01</td>\n", | |
| " <td>급식이 너무 맛이 없고 가격도 비싸서 재원생의 원성이 자자합니다. 알고보니 급식업체...</td>\n", | |
| " <td>34</td>\n", | |
| " <td>257864</td>\n", | |
| " <td>청원종료</td>\n", | |
| " <td>시대인재학원 급식 좀 맛있게 해주세요</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " category begin end ... status title replies\n", | |
| "0 미래 2018-06-01 2018-07-01 ... 청원종료 조선족은 중국사람 입니다! NaN\n", | |
| "1 기타 2018-06-01 2018-07-01 ... 청원종료 청원의 문제점 NaN\n", | |
| "2 문화/예술/체육/언론 2018-06-01 2018-07-01 ... 청원종료 여자대표 철수해주세요 NaN\n", | |
| "3 정치개혁 2018-06-01 2018-07-01 ... 청원종료 문대통령님 오랫동안 대통령하시면 안될까요? NaN\n", | |
| "4 육아/교육 2018-06-01 2018-07-01 ... 청원종료 시대인재학원 급식 좀 맛있게 해주세요 NaN\n", | |
| "\n", | |
| "[5 rows x 9 columns]" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 33 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "_EmRjLbOXFHX", | |
| "outputId": "3b8fe782-d0ce-450e-aabe-8fde6ce995c5" | |
| }, | |
| "source": [ | |
| "len(df)" | |
| ], | |
| "execution_count": 34, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "433631" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 34 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "PZAqRMwBXG1i" | |
| }, | |
| "source": [ | |
| "agreed_df = df[df['num_agree'] > 1000]" | |
| ], | |
| "execution_count": 35, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "O5eeT1UZXUUn", | |
| "outputId": "5fadd314-f8db-4145-8253-a1b1f574a50e" | |
| }, | |
| "source": [ | |
| "len(agreed_df)" | |
| ], | |
| "execution_count": 36, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "3704" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 36 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "1T8OvUwDXVM3" | |
| }, | |
| "source": [ | |
| "import re\n", | |
| "import emoji\n", | |
| "from soynlp.normalizer import repeat_normalize\n", | |
| "\n", | |
| "emojis = ''.join(emoji.UNICODE_EMOJI.keys())\n", | |
| "pattern = re.compile(f'[^ .,?!/@$%~%·∼()\\x00-\\x7Fㄱ-ㅣ가-힣{emojis}]+')\n", | |
| "url_pattern = re.compile(\n", | |
| " r'https?:\\/\\/(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b([-a-zA-Z0-9()@:%_\\+.~#?&//=]*)')\n", | |
| "\n", | |
| "def clean(x):\n", | |
| " x = pattern.sub(' ', x)\n", | |
| " x = url_pattern.sub('', x)\n", | |
| " x = x.strip()\n", | |
| " x = repeat_normalize(x, num_repeats=2)\n", | |
| " return x" | |
| ], | |
| "execution_count": 37, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "z59q9G4gXWVh" | |
| }, | |
| "source": [ | |
| "contents = agreed_df['content'].map(clean).to_list()" | |
| ], | |
| "execution_count": 38, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "V-VI74f3XXV-" | |
| }, | |
| "source": [ | |
| "from kss import split_sentences" | |
| ], | |
| "execution_count": 39, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "fJtIEVBpY58Z" | |
| }, | |
| "source": [ | |
| "import os" | |
| ], | |
| "execution_count": 40, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "0H8sIa3CehbT", | |
| "outputId": "c84a9404-d592-4d16-a21e-9a175c8ccec1", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| } | |
| }, | |
| "source": [ | |
| "split_sentences(\"안녕하세요. 오늘은 날씨가 좋더라구요.\")" | |
| ], | |
| "execution_count": 41, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "['안녕하세요.', '오늘은 날씨가 좋더라구요.']" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 41 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 322, | |
| "referenced_widgets": [ | |
| "a619d5c077494c628bfd589032f0057b", | |
| "b687225b255a4b0a9945fb5a5b219f48", | |
| "ec0bea3e70644604b3e8253d0d2ae528", | |
| "bb0da81e63fb4b1b8f29d0b80d86248d", | |
| "211d736187bf49dd930bf6c61e3619b6", | |
| "9e88a788fead47a1a90af692d94942ff", | |
| "3b7c4bc39fca494dba816c92c0f32cc8", | |
| "fb9c04093d6d4850b5a8f35da73ecb56", | |
| "91e2e116a06849079b045fe5e916c54f", | |
| "36bc81bec3644b8b99f98524eb7492b1", | |
| "7713dc447df04ebe8c42fbe717a3767b" | |
| ] | |
| }, | |
| "id": "i0RriVk2XY05", | |
| "outputId": "ef307a1d-123c-4682-ae72-d2d4d04ddde7" | |
| }, | |
| "source": [ | |
| "with open('korean_petitions_safe.txt', 'w') as f:\n", | |
| " for doc in tqdm(contents):\n", | |
| " if doc:\n", | |
| " for line in split_sentences(doc):\n", | |
| " f.write(line+'\\n')\n", | |
| " f.write('\\n')\n", | |
| " f.close()" | |
| ], | |
| "execution_count": 43, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "a619d5c077494c628bfd589032f0057b", | |
| "version_minor": 0, | |
| "version_major": 2 | |
| }, | |
| "text/plain": [ | |
| " 0%| | 0/3704 [00:00<?, ?it/s]" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| } | |
| }, | |
| { | |
| "output_type": "error", | |
| "ename": "TypeError", | |
| "evalue": "ignored", | |
| "traceback": [ | |
| "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
| "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", | |
| "\u001b[0;32m<ipython-input-43-c1df52e7c6de>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mdoc\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcontents\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdoc\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msplit_sentences\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdoc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m'\\n'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'\\n'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/kss/kss.py\u001b[0m in \u001b[0;36msplit_sentences\u001b[0;34m(text, use_heuristic, max_recover_step, max_recover_length, ignore_quotes_or_brackets)\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0ms\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mneed_to_replace_zwsp\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34mf\"\\u200b{s}\\u200b\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 162\u001b[0;31m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 163\u001b[0m \u001b[0mprev_1\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0mprev_2\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/kss/kss.py\u001b[0m in \u001b[0;36m_split_sentences\u001b[0;34m(text, use_heuristic, max_recover_step, max_recover_length, ignore_quotes_or_brackets, recover_step)\u001b[0m\n\u001b[1;32m 321\u001b[0m \u001b[0mcur_sentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdo_trim_sent_push_results\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcur_sentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 322\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 323\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mTable\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcur_stat\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mprev_1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m&\u001b[0m \u001b[0mID\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mNEXT1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 324\u001b[0m \u001b[0mcur_sentence\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mprev_1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 325\u001b[0m \u001b[0mdo_trim_sent_push_results\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcur_sentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/kss/base.py\u001b[0m in \u001b[0;36mprocess\u001b[0;34m(self, cur_chr, prev_1, prev_2, prev_3, prev_4, single_stack, double_stack)\u001b[0m\n", | |
| "\u001b[0;31mTypeError\u001b[0m: do_push_pop_symbol() missing 1 required positional argument: 'current_ch'" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "9KWyr7_LYE8q" | |
| }, | |
| "source": [ | |
| "# KcBERT-base MLM Finetune 학습하기 (GPU)\n", | |
| "\n", | |
| "- Huggingface Transformers에서 제공하는 `run_mlm.py` 스크립트를 사용해 KcBERT weight과 vocab으로 MLM 학습을 진행합니다." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "q9Rta_FPXZ0L", | |
| "outputId": "458e40e2-bd56-4696-bac0-f96e1b21aa74" | |
| }, | |
| "source": [ | |
| "!mkdir -p ./test-mlm" | |
| ], | |
| "execution_count": 17, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "mkdir: cannot create directory ‘./test-mlm’: File exists\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "bumVyJhPXcZe", | |
| "outputId": "ca4b7f32-5b78-4235-eae8-d7982c5c0c4c" | |
| }, | |
| "source": [ | |
| "!wget -O run_mlm.py https://raw.githubusercontent.com/huggingface/transformers/72aee83ced5f31302c5e331d896412737287f976/examples/pytorch/language-modeling/run_mlm.py" | |
| ], | |
| "execution_count": 18, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "--2021-08-16 05:39:13-- https://raw.githubusercontent.com/huggingface/transformers/72aee83ced5f31302c5e331d896412737287f976/examples/pytorch/language-modeling/run_mlm.py\n", | |
| "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", | |
| "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", | |
| "HTTP request sent, awaiting response... 200 OK\n", | |
| "Length: 24078 (24K) [text/plain]\n", | |
| "Saving to: ‘run_mlm.py’\n", | |
| "\n", | |
| "\rrun_mlm.py 0%[ ] 0 --.-KB/s \rrun_mlm.py 100%[===================>] 23.51K --.-KB/s in 0.002s \n", | |
| "\n", | |
| "2021-08-16 05:39:14 (15.1 MB/s) - ‘run_mlm.py’ saved [24078/24078]\n", | |
| "\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "Q5E3GLcOYs9y", | |
| "outputId": "169e7894-4f23-4b83-deaa-9711260add5b" | |
| }, | |
| "source": [ | |
| "!nvidia-smi" | |
| ], | |
| "execution_count": 19, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Mon Aug 16 05:39:14 2021 \n", | |
| "+-----------------------------------------------------------------------------+\n", | |
| "| NVIDIA-SMI 470.42.01 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", | |
| "|-------------------------------+----------------------+----------------------+\n", | |
| "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", | |
| "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", | |
| "| | | MIG M. |\n", | |
| "|===============================+======================+======================|\n", | |
| "| 0 Tesla P100-PCIE... Off | 00000000:00:04.0 Off | 0 |\n", | |
| "| N/A 40C P0 27W / 250W | 0MiB / 16280MiB | 0% Default |\n", | |
| "| | | N/A |\n", | |
| "+-------------------------------+----------------------+----------------------+\n", | |
| " \n", | |
| "+-----------------------------------------------------------------------------+\n", | |
| "| Processes: |\n", | |
| "| GPU GI CI PID Type Process name GPU Memory |\n", | |
| "| ID ID Usage |\n", | |
| "|=============================================================================|\n", | |
| "| No running processes found |\n", | |
| "+-----------------------------------------------------------------------------+\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "eYJkBneJd9wd", | |
| "outputId": "37d181a6-2c10-41b2-ba97-918566f4c8bd" | |
| }, | |
| "source": [ | |
| "!ls" | |
| ], | |
| "execution_count": 20, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "korean_petitions_safe.txt\tKorpora sample_data\n", | |
| "korean_petitions_safe.txt.lock\trun_mlm.py test-mlm\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "4763DVvUeBPm" | |
| }, | |
| "source": [ | |
| "!head -n 10 korean_petitions_safe.txt" | |
| ], | |
| "execution_count": 21, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "ikFSD-VzXbVS", | |
| "outputId": "9bc3b559-6771-46d8-a8a9-55376e8653b3" | |
| }, | |
| "source": [ | |
| "!python run_mlm.py \\\n", | |
| " --model_name_or_path beomi/kcbert-base \\\n", | |
| " --train_file korean_petitions_safe.txt \\\n", | |
| " --do_train \\\n", | |
| " --output_dir ./test-mlm" | |
| ], | |
| "execution_count": 22, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "2021-08-16 05:39:15.893093: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", | |
| "08/16/2021 05:39:17 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n", | |
| "08/16/2021 05:39:17 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n", | |
| "_n_gpu=1,\n", | |
| "adafactor=False,\n", | |
| "adam_beta1=0.9,\n", | |
| "adam_beta2=0.999,\n", | |
| "adam_epsilon=1e-08,\n", | |
| "dataloader_drop_last=False,\n", | |
| "dataloader_num_workers=0,\n", | |
| "dataloader_pin_memory=True,\n", | |
| "ddp_find_unused_parameters=None,\n", | |
| "debug=[],\n", | |
| "deepspeed=None,\n", | |
| "disable_tqdm=False,\n", | |
| "do_eval=False,\n", | |
| "do_predict=False,\n", | |
| "do_train=True,\n", | |
| "eval_accumulation_steps=None,\n", | |
| "eval_steps=None,\n", | |
| "evaluation_strategy=IntervalStrategy.NO,\n", | |
| "fp16=False,\n", | |
| "fp16_backend=auto,\n", | |
| "fp16_full_eval=False,\n", | |
| "fp16_opt_level=O1,\n", | |
| "gradient_accumulation_steps=1,\n", | |
| "greater_is_better=None,\n", | |
| "group_by_length=False,\n", | |
| "ignore_data_skip=False,\n", | |
| "label_names=None,\n", | |
| "label_smoothing_factor=0.0,\n", | |
| "learning_rate=5e-05,\n", | |
| "length_column_name=length,\n", | |
| "load_best_model_at_end=False,\n", | |
| "local_rank=-1,\n", | |
| "log_level=-1,\n", | |
| "log_level_replica=-1,\n", | |
| "log_on_each_node=True,\n", | |
| "logging_dir=./test-mlm/runs/Aug16_05-39-17_cb60fe2c1a57,\n", | |
| "logging_first_step=False,\n", | |
| "logging_steps=500,\n", | |
| "logging_strategy=IntervalStrategy.STEPS,\n", | |
| "lr_scheduler_type=SchedulerType.LINEAR,\n", | |
| "max_grad_norm=1.0,\n", | |
| "max_steps=-1,\n", | |
| "metric_for_best_model=None,\n", | |
| "mp_parameters=,\n", | |
| "no_cuda=False,\n", | |
| "num_train_epochs=3.0,\n", | |
| "output_dir=./test-mlm,\n", | |
| "overwrite_output_dir=False,\n", | |
| "past_index=-1,\n", | |
| "per_device_eval_batch_size=8,\n", | |
| "per_device_train_batch_size=8,\n", | |
| "prediction_loss_only=False,\n", | |
| "push_to_hub=False,\n", | |
| "push_to_hub_model_id=test-mlm,\n", | |
| "push_to_hub_organization=None,\n", | |
| "push_to_hub_token=None,\n", | |
| "remove_unused_columns=True,\n", | |
| "report_to=['tensorboard'],\n", | |
| "resume_from_checkpoint=None,\n", | |
| "run_name=./test-mlm,\n", | |
| "save_on_each_node=False,\n", | |
| "save_steps=500,\n", | |
| "save_strategy=IntervalStrategy.STEPS,\n", | |
| "save_total_limit=None,\n", | |
| "seed=42,\n", | |
| "sharded_ddp=[],\n", | |
| "skip_memory_metrics=True,\n", | |
| "tpu_metrics_debug=False,\n", | |
| "tpu_num_cores=None,\n", | |
| "use_legacy_prediction_loop=False,\n", | |
| "warmup_ratio=0.0,\n", | |
| "warmup_steps=0,\n", | |
| "weight_decay=0.0,\n", | |
| ")\n", | |
| "08/16/2021 05:39:18 - WARNING - datasets.builder - Using custom data configuration default-dd91b8b7dab8cd99\n", | |
| "08/16/2021 05:39:18 - INFO - datasets.builder - Generating dataset text (/root/.cache/huggingface/datasets/text/default-dd91b8b7dab8cd99/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)\n", | |
| "Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/text/default-dd91b8b7dab8cd99/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...\n", | |
| "100% 1/1 [00:00<00:00, 9020.01it/s]\n", | |
| "08/16/2021 05:39:18 - INFO - datasets.utils.download_manager - Downloading took 0.0 min\n", | |
| "08/16/2021 05:39:18 - INFO - datasets.utils.download_manager - Checksum Computation took 0.0 min\n", | |
| "100% 1/1 [00:00<00:00, 1340.03it/s]\n", | |
| "08/16/2021 05:39:18 - INFO - datasets.utils.info_utils - Unable to verify checksums.\n", | |
| "08/16/2021 05:39:18 - INFO - datasets.builder - Generating split train\n", | |
| "Traceback (most recent call last):\n", | |
| " File \"run_mlm.py\", line 550, in <module>\n", | |
| " main()\n", | |
| " File \"run_mlm.py\", line 287, in main\n", | |
| " raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)\n", | |
| " File \"/usr/local/lib/python3.7/dist-packages/datasets/load.py\", line 852, in load_dataset\n", | |
| " use_auth_token=use_auth_token,\n", | |
| " File \"/usr/local/lib/python3.7/dist-packages/datasets/builder.py\", line 616, in download_and_prepare\n", | |
| " dl_manager=dl_manager, verify_infos=verify_infos, **download_and_prepare_kwargs\n", | |
| " File \"/usr/local/lib/python3.7/dist-packages/datasets/builder.py\", line 693, in _download_and_prepare\n", | |
| " self._prepare_split(split_generator, **prepare_split_kwargs)\n", | |
| " File \"/usr/local/lib/python3.7/dist-packages/datasets/builder.py\", line 1166, in _prepare_split\n", | |
| " num_examples, num_bytes = writer.finalize()\n", | |
| " File \"/usr/local/lib/python3.7/dist-packages/datasets/arrow_writer.py\", line 425, in finalize\n", | |
| " raise ValueError(\"Please pass `features` or at least one example when writing data\")\n", | |
| "ValueError: Please pass `features` or at least one example when writing data\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "81n_IaCRdP2w" | |
| }, | |
| "source": [ | |
| "### 학습 완료 후 아래 에러는 무시하셔도 됩니다.\n", | |
| "\n", | |
| "- 학습 완료된 파일들은 `test-mlm` 폴더 내에 들어있습니다 :)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "wDXIj3uadNYh" | |
| }, | |
| "source": [ | |
| "" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "fDq7EV6XYTho" | |
| }, | |
| "source": [ | |
| "" | |
| ], | |
| "execution_count": 22, | |
| "outputs": [] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment