Created
September 21, 2021 07:35
-
-
Save changjonathanc/7387724e61d915e8d6fb46b9028fe648 to your computer and use it in GitHub Desktop.
parse-c4-date-from-url.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 5, | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.8.8" | |
| }, | |
| "colab": { | |
| "name": "parse-c4-date-from-url.ipynb", | |
| "provenance": [], | |
| "collapsed_sections": [], | |
| "include_colab_link": true | |
| }, | |
| "widgets": { | |
| "application/vnd.jupyter.widget-state+json": { | |
| "c8d31cebbeeb4ddbaa0e7883f5d1fd54": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HBoxModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "HBoxView", | |
| "_dom_classes": [], | |
| "_model_name": "HBoxModel", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "box_style": "", | |
| "layout": "IPY_MODEL_58d455318a1d4923863c0c5eb6074218", | |
| "_model_module": "@jupyter-widgets/controls", | |
| "children": [ | |
| "IPY_MODEL_d947e6bff04943528e354ac20aa97d38", | |
| "IPY_MODEL_53643ac0a96341be8dec1d5b69b63d8f", | |
| "IPY_MODEL_edd30876b2ed4e448f4fa76e529e493c" | |
| ] | |
| } | |
| }, | |
| "58d455318a1d4923863c0c5eb6074218": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "d947e6bff04943528e354ac20aa97d38": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "HTMLView", | |
| "style": "IPY_MODEL_0b392d0d3b73412ea44ff542c2da23a3", | |
| "_dom_classes": [], | |
| "description": "", | |
| "_model_name": "HTMLModel", | |
| "placeholder": "", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": "100%", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_e202bb21626141aba9217c195a0e036c" | |
| } | |
| }, | |
| "53643ac0a96341be8dec1d5b69b63d8f": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "FloatProgressModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "ProgressView", | |
| "style": "IPY_MODEL_280a0b1a29564f748dd88bfad7bce35b", | |
| "_dom_classes": [], | |
| "description": "", | |
| "_model_name": "FloatProgressModel", | |
| "bar_style": "success", | |
| "max": 1, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": 1, | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "orientation": "horizontal", | |
| "min": 0, | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_4fced6bc1a0d4bd8bf27ef22172a2e27" | |
| } | |
| }, | |
| "edd30876b2ed4e448f4fa76e529e493c": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "HTMLView", | |
| "style": "IPY_MODEL_abe2de1f80474ade996bbeead3004fb4", | |
| "_dom_classes": [], | |
| "description": "", | |
| "_model_name": "HTMLModel", | |
| "placeholder": "", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": " 1/1 [00:01<00:00, 1.10s/it]", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_3bd2e03b895044c9932f5d9c775231c4" | |
| } | |
| }, | |
| "0b392d0d3b73412ea44ff542c2da23a3": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "DescriptionStyleModel", | |
| "description_width": "", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "e202bb21626141aba9217c195a0e036c": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "280a0b1a29564f748dd88bfad7bce35b": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "ProgressStyleModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "ProgressStyleModel", | |
| "description_width": "", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "bar_color": null, | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "4fced6bc1a0d4bd8bf27ef22172a2e27": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "abe2de1f80474ade996bbeead3004fb4": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "DescriptionStyleModel", | |
| "description_width": "", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "3bd2e03b895044c9932f5d9c775231c4": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "b896fc2ef3c1449dab71e73755050a53": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HBoxModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "HBoxView", | |
| "_dom_classes": [], | |
| "_model_name": "HBoxModel", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "box_style": "", | |
| "layout": "IPY_MODEL_027ef8ff9cd441dd86159909fdd92a95", | |
| "_model_module": "@jupyter-widgets/controls", | |
| "children": [ | |
| "IPY_MODEL_d03660e7c0364cc28652568b9031cf3a", | |
| "IPY_MODEL_0b7431c9df444f5c92046b5b5b6bcb16", | |
| "IPY_MODEL_3fcba8cc9da14d49a286cdb7e00185e8" | |
| ] | |
| } | |
| }, | |
| "027ef8ff9cd441dd86159909fdd92a95": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "d03660e7c0364cc28652568b9031cf3a": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "HTMLView", | |
| "style": "IPY_MODEL_f11fee74c0a843f2ad0e32f2a9c1acca", | |
| "_dom_classes": [], | |
| "description": "", | |
| "_model_name": "HTMLModel", | |
| "placeholder": "", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": "convert url to date: 3%", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_379dfd79d8264851a510be3541fb4365" | |
| } | |
| }, | |
| "0b7431c9df444f5c92046b5b5b6bcb16": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "FloatProgressModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "ProgressView", | |
| "style": "IPY_MODEL_6946565271664a5f9f07df351966b75e", | |
| "_dom_classes": [], | |
| "description": "", | |
| "_model_name": "FloatProgressModel", | |
| "bar_style": "", | |
| "max": 13799838, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": 367442, | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "orientation": "horizontal", | |
| "min": 0, | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_67d11f086ea54d509ee9fbbaffe04526" | |
| } | |
| }, | |
| "3fcba8cc9da14d49a286cdb7e00185e8": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "HTMLView", | |
| "style": "IPY_MODEL_d78418f706394df18d5b87371d6f8a5c", | |
| "_dom_classes": [], | |
| "description": "", | |
| "_model_name": "HTMLModel", | |
| "placeholder": "", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": " 367102/13799838 [01:57<1:06:10, 3383.54ex/s]", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_c1102206267d489b9fae8df35d5c65d7" | |
| } | |
| }, | |
| "f11fee74c0a843f2ad0e32f2a9c1acca": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "DescriptionStyleModel", | |
| "description_width": "", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "379dfd79d8264851a510be3541fb4365": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "6946565271664a5f9f07df351966b75e": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "ProgressStyleModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "ProgressStyleModel", | |
| "description_width": "", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "bar_color": null, | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "67d11f086ea54d509ee9fbbaffe04526": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "d78418f706394df18d5b87371d6f8a5c": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "DescriptionStyleModel", | |
| "description_width": "", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "c1102206267d489b9fae8df35d5c65d7": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/cccntu/7387724e61d915e8d6fb46b9028fe648/parse-c4-date-from-url.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "DYPXhTLWQmxD", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "outputId": "fd68f138-3954-494b-e5d1-a984ac0659f3" | |
| }, | |
| "source": [ | |
| "!pip install -q datasets" | |
| ], | |
| "id": "DYPXhTLWQmxD", | |
| "execution_count": 1, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "\u001b[K |████████████████████████████████| 270 kB 5.4 MB/s \n", | |
| "\u001b[K |████████████████████████████████| 52 kB 1.4 MB/s \n", | |
| "\u001b[K |████████████████████████████████| 119 kB 48.7 MB/s \n", | |
| "\u001b[K |████████████████████████████████| 1.3 MB 39.7 MB/s \n", | |
| "\u001b[K |████████████████████████████████| 243 kB 52.7 MB/s \n", | |
| "\u001b[K |████████████████████████████████| 294 kB 57.8 MB/s \n", | |
| "\u001b[K |████████████████████████████████| 142 kB 59.2 MB/s \n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "jobi6-WJRprV", | |
| "outputId": "c94778a1-58aa-4629-e37b-a44a584b8882" | |
| }, | |
| "source": [ | |
| "!git clone https://github.com/cccntu/dateutil" | |
| ], | |
| "id": "jobi6-WJRprV", | |
| "execution_count": 2, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Cloning into 'dateutil'...\n", | |
| "remote: Enumerating objects: 6543, done.\u001b[K\n", | |
| "remote: Counting objects: 100% (242/242), done.\u001b[K\n", | |
| "remote: Compressing objects: 100% (157/157), done.\u001b[K\n", | |
| "remote: Total 6543 (delta 113), reused 170 (delta 77), pack-reused 6301\u001b[K\n", | |
| "Receiving objects: 100% (6543/6543), 5.85 MiB | 19.38 MiB/s, done.\n", | |
| "Resolving deltas: 100% (4211/4211), done.\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "nPL4coMgR1Dq", | |
| "outputId": "d008cee4-075b-49f3-84b2-723f170e8ec6" | |
| }, | |
| "source": [ | |
| "!cd dateutil && git checkout date-only-282 && git pull && cd .." | |
| ], | |
| "id": "nPL4coMgR1Dq", | |
| "execution_count": 3, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Branch 'date-only-282' set up to track remote branch 'date-only-282' from 'origin'.\n", | |
| "Switched to a new branch 'date-only-282'\n", | |
| "Already up to date.\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "s3AE3ajFharR", | |
| "outputId": "2a633b91-ba9f-4b4c-939c-bde00738e3db" | |
| }, | |
| "source": [ | |
| "!pip uninstall -y python-dateutil" | |
| ], | |
| "id": "s3AE3ajFharR", | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Found existing installation: python-dateutil 2.8.2\n", | |
| "Uninstalling python-dateutil-2.8.2:\n", | |
| " Successfully uninstalled python-dateutil-2.8.2\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "sMFxpNxfQ-6q", | |
| "outputId": "2f03c183-a099-470a-f9fc-6dd7906bb078" | |
| }, | |
| "source": [ | |
| "!pip install -e dateutil" | |
| ], | |
| "id": "sMFxpNxfQ-6q", | |
| "execution_count": 4, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Obtaining file:///content/dateutil\n", | |
| " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", | |
| " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", | |
| " Preparing wheel metadata ... \u001b[?25l\u001b[?25hdone\n", | |
| "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil==2.8.3.dev1+g697ca9a) (1.15.0)\n", | |
| "Installing collected packages: python-dateutil\n", | |
| " Attempting uninstall: python-dateutil\n", | |
| " Found existing installation: python-dateutil 2.8.2\n", | |
| " Uninstalling python-dateutil-2.8.2:\n", | |
| " Successfully uninstalled python-dateutil-2.8.2\n", | |
| " Running setup.py develop for python-dateutil\n", | |
| "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", | |
| "albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.\u001b[0m\n", | |
| "Successfully installed python-dateutil-2.8.3.dev1+g697ca9a\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "DHsXeGUqTLmV" | |
| }, | |
| "source": [ | |
| "# restart the runtime to use the new dateutil version " | |
| ], | |
| "id": "DHsXeGUqTLmV", | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "5d8vUrryS8l-", | |
| "outputId": "7df4bc21-222a-4901-fdcc-79aa5918a594" | |
| }, | |
| "source": [ | |
| "# make sure this cell runs without error\n", | |
| "from dateutil.parser import parse\n", | |
| "parse('/2021/1/1/some text', fuzzy=True, date_only=True)" | |
| ], | |
| "id": "5d8vUrryS8l-", | |
| "execution_count": 1, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "datetime.datetime(2021, 1, 1, 0, 0)" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "execution_count": 1 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 83, | |
| "referenced_widgets": [ | |
| "c8d31cebbeeb4ddbaa0e7883f5d1fd54", | |
| "58d455318a1d4923863c0c5eb6074218", | |
| "d947e6bff04943528e354ac20aa97d38", | |
| "53643ac0a96341be8dec1d5b69b63d8f", | |
| "edd30876b2ed4e448f4fa76e529e493c", | |
| "0b392d0d3b73412ea44ff542c2da23a3", | |
| "e202bb21626141aba9217c195a0e036c", | |
| "280a0b1a29564f748dd88bfad7bce35b", | |
| "4fced6bc1a0d4bd8bf27ef22172a2e27", | |
| "abe2de1f80474ade996bbeead3004fb4", | |
| "3bd2e03b895044c9932f5d9c775231c4" | |
| ] | |
| }, | |
| "id": "vz8mDExG5eR7", | |
| "outputId": "87a1ba48-0457-440a-b9d2-160fda8e0a9e" | |
| }, | |
| "source": [ | |
| "from datasets import load_dataset\n", | |
| "\n", | |
| "# This is the 'newslike' subset of 'c4'; I pre-processed it to keep only the url field.\n", | |
| "# \n", | |
| "dataset = load_dataset(\"bs-modeling-metadata/c4_newslike_url_only\", keep_in_memory=True)" | |
| ], | |
| "id": "vz8mDExG5eR7", | |
| "execution_count": 13, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stderr", | |
| "text": [ | |
| "Using custom data configuration c4_newslike_url_only-4ac73b9230e356e4\n", | |
| "Reusing dataset csv (/root/.cache/huggingface/datasets/csv/c4_newslike_url_only-4ac73b9230e356e4/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "c8d31cebbeeb4ddbaa0e7883f5d1fd54", | |
| "version_minor": 0, | |
| "version_major": 2 | |
| }, | |
| "text/plain": [ | |
| " 0%| | 0/1 [00:00<?, ?it/s]" | |
| ] | |
| }, | |
| "metadata": {} | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "5eR9vTdj5jsl" | |
| }, | |
| "source": [ | |
| "\n", | |
| "from urllib.parse import urlsplit, unquote_plus\n", | |
| "def get_path_from_url(url):\n", | |
| " parts = urlsplit(url)\n", | |
| " return unquote_plus(parts.path)" | |
| ], | |
| "id": "5eR9vTdj5jsl", | |
| "execution_count": 8, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "778a75315tb5", | |
| "outputId": "85d5b875-4ffa-4634-c911-00776c8b3c55" | |
| }, | |
| "source": [ | |
| "# I modified the dateutil source and installed it locally so that it only parses the date;\n", | |
| "# otherwise parsing can fail when the time cannot be parsed, even if the date can be.\n", | |
| "# This way I can get more dates parsed.\n", | |
| "from dateutil.parser import parser, parse, ParserError\n", | |
| "\n", | |
| "# _parser = parser()\n", | |
| "# _parser.parse(datelike_paths[i], fuzzy=True)\n", | |
| "def parse_date(path):\n", | |
| " try:\n", | |
| " return parse(path, fuzzy=True, date_only=True)\n", | |
| " except ParserError:\n", | |
| " return None\n", | |
| " except OverflowError:\n", | |
| " # this happens, I don't know why, just ignore it\n", | |
| " return None\n", | |
| "\n", | |
| "\n", | |
| "def remove_improbable_date(x):\n", | |
| " if x is not None and (x.year < 1983 or x.year > 2021):\n", | |
| " return None\n", | |
| " return x\n", | |
| "\n", | |
| "\n", | |
| "parse_date(\"/2021/2/1/some name some number 123, 12\")" | |
| ], | |
| "id": "778a75315tb5", | |
| "execution_count": 9, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "datetime.datetime(2021, 2, 1, 0, 0)" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "execution_count": 9 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "gkczdRJS59mk" | |
| }, | |
| "source": [ | |
| "\n", | |
| "def map_dataset_fn(example):\n", | |
| " url = example['url']\n", | |
| " path = get_path_from_url(url)\n", | |
| " date = parse_date(path)\n", | |
| " date = remove_improbable_date(date)\n", | |
| " date = str(date) if date is not None else ''\n", | |
| " return {'date':date}" | |
| ], | |
| "id": "gkczdRJS59mk", | |
| "execution_count": 10, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "GsLljBHIPy_w" | |
| }, | |
| "source": [ | |
| "\n", | |
| "ds = dataset['train']" | |
| ], | |
| "id": "GsLljBHIPy_w", | |
| "execution_count": 11, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "Mo12nLtlRaep" | |
| }, | |
| "source": [ | |
| "# ~3,000 examples/second on colab" | |
| ], | |
| "id": "Mo12nLtlRaep", | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 49, | |
| "referenced_widgets": [ | |
| "b896fc2ef3c1449dab71e73755050a53", | |
| "027ef8ff9cd441dd86159909fdd92a95", | |
| "d03660e7c0364cc28652568b9031cf3a", | |
| "0b7431c9df444f5c92046b5b5b6bcb16", | |
| "3fcba8cc9da14d49a286cdb7e00185e8", | |
| "f11fee74c0a843f2ad0e32f2a9c1acca", | |
| "379dfd79d8264851a510be3541fb4365", | |
| "6946565271664a5f9f07df351966b75e", | |
| "67d11f086ea54d509ee9fbbaffe04526", | |
| "d78418f706394df18d5b87371d6f8a5c", | |
| "c1102206267d489b9fae8df35d5c65d7" | |
| ] | |
| }, | |
| "id": "S-VA02_dJ24U", | |
| "outputId": "ad5861ec-de05-43cf-c034-7449a86b5d23" | |
| }, | |
| "source": [ | |
| "ds = ds.map(map_dataset_fn,desc='convert url to date', keep_in_memory=True)" | |
| ], | |
| "id": "S-VA02_dJ24U", | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "b896fc2ef3c1449dab71e73755050a53", | |
| "version_minor": 0, | |
| "version_major": 2 | |
| }, | |
| "text/plain": [ | |
| "convert url to date: 0%| | 0/13799838 [00:00<?, ?ex/s]" | |
| ] | |
| }, | |
| "metadata": {} | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "UcRRn3maRmNI" | |
| }, | |
| "source": [ | |
| "" | |
| ], | |
| "id": "UcRRn3maRmNI", | |
| "execution_count": null, | |
| "outputs": [] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment