Last active
March 21, 2021 21:28
-
-
Save neoyipeng2018/73b61bec5e7034a92ea0a4e393202f7d to your computer and use it in GitHub Desktop.
Using Embeddings for Financial Sentiment Classification
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "name": "Using Embeddings for Financial Sentiment Classification", | |
| "provenance": [], | |
| "collapsed_sections": [ | |
| "7jk8iwKC57b7", | |
| "IjnvPbcXUmbz", | |
| "5toz2xc6UeNd", | |
| "Jq4Vqx0g6fK-" | |
| ], | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "accelerator": "GPU", | |
| "widgets": { | |
| "application/vnd.jupyter.widget-state+json": { | |
| "b39d76fce7e346dcb785811bc331b440": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HBoxModel", | |
| "state": { | |
| "_view_name": "HBoxView", | |
| "_dom_classes": [], | |
| "_model_name": "HBoxModel", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "box_style": "", | |
| "layout": "IPY_MODEL_0a2bc9b24529487da58592a7b3b581c8", | |
| "_model_module": "@jupyter-widgets/controls", | |
| "children": [ | |
| "IPY_MODEL_e0baf84f88184887b39f9b204e73d033", | |
| "IPY_MODEL_18b4e117c6924028aa2a5eab572a10ad" | |
| ] | |
| } | |
| }, | |
| "0a2bc9b24529487da58592a7b3b581c8": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "e0baf84f88184887b39f9b204e73d033": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "FloatProgressModel", | |
| "state": { | |
| "_view_name": "ProgressView", | |
| "style": "IPY_MODEL_2762aefdcf3644fe8e892090731d220d", | |
| "_dom_classes": [], | |
| "description": "100%", | |
| "_model_name": "FloatProgressModel", | |
| "bar_style": "success", | |
| "max": 1811, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": 1811, | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "orientation": "horizontal", | |
| "min": 0, | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_c9617ce14cd74ec79a76d3a9bfff29b2" | |
| } | |
| }, | |
| "18b4e117c6924028aa2a5eab572a10ad": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "state": { | |
| "_view_name": "HTMLView", | |
| "style": "IPY_MODEL_b6b486660a6347298f9b59601236c073", | |
| "_dom_classes": [], | |
| "description": "", | |
| "_model_name": "HTMLModel", | |
| "placeholder": "", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": " 1811/1811 [02:39<00:00, 11.37it/s]", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_4d766770c27948c4a22920e9fa397ba6" | |
| } | |
| }, | |
| "2762aefdcf3644fe8e892090731d220d": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "ProgressStyleModel", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "ProgressStyleModel", | |
| "description_width": "initial", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "bar_color": null, | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "c9617ce14cd74ec79a76d3a9bfff29b2": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "b6b486660a6347298f9b59601236c073": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "DescriptionStyleModel", | |
| "description_width": "", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "4d766770c27948c4a22920e9fa397ba6": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "bfb1e90fc662469dac770e832e072917": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HBoxModel", | |
| "state": { | |
| "_view_name": "HBoxView", | |
| "_dom_classes": [], | |
| "_model_name": "HBoxModel", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "box_style": "", | |
| "layout": "IPY_MODEL_90241d13a80545788bfc5aec35076748", | |
| "_model_module": "@jupyter-widgets/controls", | |
| "children": [ | |
| "IPY_MODEL_d04a55472f7b43538fce43ec6cbc15b8", | |
| "IPY_MODEL_63ba97555ef941218636097d9f920f31" | |
| ] | |
| } | |
| }, | |
| "90241d13a80545788bfc5aec35076748": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "d04a55472f7b43538fce43ec6cbc15b8": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "FloatProgressModel", | |
| "state": { | |
| "_view_name": "ProgressView", | |
| "style": "IPY_MODEL_bd84a3f1dea84a82ab7da822472cc088", | |
| "_dom_classes": [], | |
| "description": "100%", | |
| "_model_name": "FloatProgressModel", | |
| "bar_style": "success", | |
| "max": 453, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": 453, | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "orientation": "horizontal", | |
| "min": 0, | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_c7c8cf83c7ae4eefb3fce43172bcc59b" | |
| } | |
| }, | |
| "63ba97555ef941218636097d9f920f31": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "state": { | |
| "_view_name": "HTMLView", | |
| "style": "IPY_MODEL_922c377294ed4f4cbd97474fbfa607d0", | |
| "_dom_classes": [], | |
| "description": "", | |
| "_model_name": "HTMLModel", | |
| "placeholder": "", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": " 453/453 [00:38<00:00, 11.72it/s]", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_fa12a4671e7c49ccadf380d966ee4be1" | |
| } | |
| }, | |
| "bd84a3f1dea84a82ab7da822472cc088": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "ProgressStyleModel", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "ProgressStyleModel", | |
| "description_width": "initial", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "bar_color": null, | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "c7c8cf83c7ae4eefb3fce43172bcc59b": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "922c377294ed4f4cbd97474fbfa607d0": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "DescriptionStyleModel", | |
| "description_width": "", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "fa12a4671e7c49ccadf380d966ee4be1": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/neoyipeng2018/73b61bec5e7034a92ea0a4e393202f7d/magnitudephrasebank.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "gbjyDfFc_W_2", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "# Using Embeddings for Financial Sentiment Classification\n", | |
| "Adapted from this example: https://colab.research.google.com/drive/1lOcAhIffLW8XC6QsKzt5T_ZqPP4Y9eS4\n", | |
| "\n", | |
| "Note: the original example is quite old (Python 2!) and its Keras model doesn't generalize well, so instead I averaged the word vectors of each sentence and used a good ol' Random Forest." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "7jk8iwKC57b7", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "## Setup" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "nO-VAs78CnJI", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "First, we'll install some dependencies and download the Magnitude file we wish to use." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "dsIvcCA5_FBI", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "!pip install -q pymagnitude #tensorflow keras\n", | |
| "#glove: !curl -s http://magnitude.plasticity.ai/glove+subword/glove.6B.50d.magnitude --output vectors.magnitude\n", | |
| "#word2vec: !curl -s http://magnitude.plasticity.ai/word2vec+subword/GoogleNews-vectors-negative300.magnitude --output vectors.magnitude\n", | |
| "#fastText: !curl -s http://magnitude.plasticity.ai/fasttext+subword/wiki-news-300d-1M.magnitude --output vectors.magnitude\n", | |
| "#elmo light: !curl -s http://magnitude.plasticity.ai/elmo/light/elmo_2x1024_128_2048cnn_1xhighway_weights.magnitude --output vectors.magnitude\n", | |
| "!curl -s http://magnitude.plasticity.ai/elmo/light/elmo_2x1024_128_2048cnn_1xhighway_weights.magnitude --output vectors.magnitude" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "95Xg9EyU-ZYr", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "Next, we'll import what we need to create our model and define some hyperparameters." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "1ODh9a4szHt6", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "from pymagnitude import *\n", | |
| "\n", | |
| "MAX_WORDS = 30 # The maximum number of words the sequence model will consider\n", | |
| "vectors = Magnitude('./vectors.magnitude', pad_to_length = MAX_WORDS)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "IjnvPbcXUmbz", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "## Getting Data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "AWllqkJDx_z7", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
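| "Download the Financial PhraseBank dataset from https://www.researchgate.net/publication/251231364_FinancialPhraseBank-v10 and make `Sentences_AllAgree.txt` available in the working directory. I'm using my Google Drive for convenience." | |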
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "H1fnkEco1Hyh", | |
| "colab_type": "code", | |
| "outputId": "b25130f7-85bc-401e-ac28-2a2826592604", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 139 | |
| } | |
| }, | |
| "source": [ | |
| "from google.colab import drive\n", | |
| "drive.mount('/content/gdrive', force_remount=True)\n", | |
| "%cd '/content/gdrive/My Drive/Colab Notebooks'" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly\n", | |
| "\n", | |
| "Enter your authorization code:\n", | |
| "··········\n", | |
| "Mounted at /content/gdrive\n", | |
| "/content/gdrive/My Drive/Colab Notebooks\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "aRNHigUG4mI3", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "#download from https://www.researchgate.net/publication/251231364_FinancialPhraseBank-v10\n", | |
| "f = open(\"Sentences_AllAgree.txt\", \"r\",encoding = \"ISO-8859-1\")\n", | |
| "test_file = f.read().split('\\n')" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "BcpdkqEKJbEK", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "from tqdm import notebook\n", | |
| "import pandas as pd" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "E9ksIuT1Jfsh", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "def avg_vec(df):\n", | |
| " vctrLs = []\n", | |
| " for txt in notebook.tqdm(df.text.values): vctrLs.append(np.average(vectors.query(txt.split(' ')), axis = 0))\n", | |
| " return np.array(vctrLs)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "Ak3pY1PLJfor", | |
| "colab_type": "code", | |
| "outputId": "5f22ff96-5fb6-47e3-98f4-eebf10f1140e", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 111 | |
| } | |
| }, | |
| "source": [ | |
| "df=pd.read_csv('Sentences_AllAgree.txt',encoding = \"ISO-8859-1\", names=['text','sentiment'], delimiter= '@')\n", | |
| "df.dropna(inplace=True)\n", | |
| "df.head(2)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>text</th>\n", | |
| " <th>sentiment</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>According to Gran , the company has no plans t...</td>\n", | |
| " <td>neutral</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>For the last quarter of 2010 , Componenta 's n...</td>\n", | |
| " <td>positive</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " text sentiment\n", | |
| "0 According to Gran , the company has no plans t... neutral\n", | |
| "1 For the last quarter of 2010 , Componenta 's n... positive" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 78 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "txX9T_6DJyH3", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "train=df.sample(frac=0.8,random_state=42)\n", | |
| "test=df.drop(train.index)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "R1--_SZCMZuz", | |
| "colab_type": "code", | |
| "outputId": "7ab097a9-c019-4374-e67b-818b8f759b8f", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 149, | |
| "referenced_widgets": [ | |
| "b39d76fce7e346dcb785811bc331b440", | |
| "0a2bc9b24529487da58592a7b3b581c8", | |
| "e0baf84f88184887b39f9b204e73d033", | |
| "18b4e117c6924028aa2a5eab572a10ad", | |
| "2762aefdcf3644fe8e892090731d220d", | |
| "c9617ce14cd74ec79a76d3a9bfff29b2", | |
| "b6b486660a6347298f9b59601236c073", | |
| "4d766770c27948c4a22920e9fa397ba6", | |
| "bfb1e90fc662469dac770e832e072917", | |
| "90241d13a80545788bfc5aec35076748", | |
| "d04a55472f7b43538fce43ec6cbc15b8", | |
| "63ba97555ef941218636097d9f920f31", | |
| "bd84a3f1dea84a82ab7da822472cc088", | |
| "c7c8cf83c7ae4eefb3fce43172bcc59b", | |
| "922c377294ed4f4cbd97474fbfa607d0", | |
| "fa12a4671e7c49ccadf380d966ee4be1" | |
| ] | |
| } | |
| }, | |
| "source": [ | |
| "xTrn,xTest=avg_vec(train),avg_vec(test)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "b39d76fce7e346dcb785811bc331b440", | |
| "version_minor": 0, | |
| "version_major": 2 | |
| }, | |
| "text/plain": [ | |
| "HBox(children=(FloatProgress(value=0.0, max=1811.0), HTML(value='')))" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| } | |
| }, | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "/usr/local/lib/python3.6/dist-packages/pymagnitude/third_party/allennlp/nn/util.py:116: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than tensor.new_tensor(sourceTensor).\n", | |
| " index_range = sequence_lengths.new_tensor(torch.arange(0, len(sequence_lengths)))\n" | |
| ], | |
| "name": "stderr" | |
| }, | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "\n" | |
| ], | |
| "name": "stdout" | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "bfb1e90fc662469dac770e832e072917", | |
| "version_minor": 0, | |
| "version_major": 2 | |
| }, | |
| "text/plain": [ | |
| "HBox(children=(FloatProgress(value=0.0, max=453.0), HTML(value='')))" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| } | |
| }, | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "5toz2xc6UeNd", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "## Modelling - Good Ol Random Forest" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "4yO5BE7bJa5c", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "from sklearn.ensemble import RandomForestClassifier\n", | |
| "from sklearn.metrics import confusion_matrix" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "TvfwUeozJXIT", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "forest = RandomForestClassifier(n_estimators=100, random_state=0, max_features=0.5, \n", | |
| " max_depth=4 ,min_samples_split=5,\n", | |
| " oob_score=True, n_jobs=-1, min_samples_leaf=50)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "JsOZkLFKPCnl", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "def oneHot(row):\n", | |
| " if row=='negative': return -1\n", | |
| " if row=='neutral' : return 0\n", | |
| " if row=='positive' : return +1" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "y6cgq8bePWY4", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "train.sentiment=train.sentiment.apply(oneHot)\n", | |
| "test.sentiment =test.sentiment.apply(oneHot)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "O8W73LKmOsrx", | |
| "colab_type": "code", | |
| "outputId": "b928d677-d679-4beb-ad1d-0ac372ebe785", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 153 | |
| } | |
| }, | |
| "source": [ | |
| "forest.fit(xTrn, train.sentiment)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n", | |
| " criterion='gini', max_depth=4, max_features=0.5,\n", | |
| " max_leaf_nodes=None, max_samples=None,\n", | |
| " min_impurity_decrease=0.0, min_impurity_split=None,\n", | |
| " min_samples_leaf=50, min_samples_split=5,\n", | |
| " min_weight_fraction_leaf=0.0, n_estimators=100,\n", | |
| " n_jobs=-1, oob_score=True, random_state=0, verbose=0,\n", | |
| " warm_start=False)" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 127 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "eg-K2b3LPd3K", | |
| "colab_type": "code", | |
| "outputId": "0a0a0766-30ae-4806-93bf-034be834690e", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 51 | |
| } | |
| }, | |
| "source": [ | |
| "print(\"Accuracy on training set: {:.3f}\".format(forest.score(xTrn, train.sentiment)))\n", | |
| "oldscore = forest.oob_score_\n", | |
| "print(f'OOB score is {oldscore*100:.1f}%')\n", | |
| "#print('Out-of-bag score estimate: {:.3}'.format())" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Accuracy on training set: 0.795\n", | |
| "OOB score is 76.7%\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "Tf8FUNBJRtzK", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "import matplotlib.pyplot as plt" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "AvRNn5IZPjxH", | |
| "colab_type": "code", | |
| "outputId": "2ce3834c-ab94-481b-fac1-6b28dc3676ab", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 331 | |
| } | |
| }, | |
| "source": [ | |
| "y_predict = forest.predict(xTest)\n", | |
| "confusion_matrix(test.sentiment, y_predict)\n", | |
| "\n", | |
| "\n", | |
| "cm = confusion_matrix(test.sentiment, y_predict)\n", | |
| "print(\"Confusion matrix:\\n{}\".format(cm))\n", | |
| "\n", | |
| "\n", | |
| "#Show confusion matrix in a separate window\n", | |
| "plt.matshow(cm)\n", | |
| "plt.title('Confusion matrix')\n", | |
| "plt.colorbar()\n", | |
| "plt.ylabel('True label')\n", | |
| "plt.xlabel('Predicted label')\n", | |
| "\n", | |
| "\n", | |
| "fmt = '.2f' #if normalize else 'd'\n", | |
| "plt.show()" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Confusion matrix:\n", | |
| "[[ 7 13 41]\n", | |
| " [ 0 280 6]\n", | |
| " [ 2 40 64]]\n" | |
| ], | |
| "name": "stdout" | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "image/png": "\n", | |
| "text/plain": [ | |
| "<Figure size 288x288 with 2 Axes>" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [], | |
| "needs_background": "light" | |
| } | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "lA3dXqDuOsom", | |
| "colab_type": "code", | |
| "outputId": "e560d9b7-145b-4ccd-f423-74ede2b43098", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| } | |
| }, | |
| "source": [ | |
| "print(\"Accuracy on test set: {:.3f}\".format(forest.score(xTest, test.sentiment)))" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Accuracy on test set: 0.775\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "Jq4Vqx0g6fK-", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "## Predicting with the Model" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "kjwQm2Sryj5M", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "Since the model has been trained successfully, we can evaluate its performance on some test queries, using Magnitude to convert each test query into a sequence of word vectors that is averaged into a single vector, which can be passed directly into the model for inference (prediction)." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "n4U11gR9TuxF", | |
| "colab_type": "code", | |
| "outputId": "ee747375-61af-48a9-a5d9-8f08e37c3d81", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| } | |
| }, | |
| "source": [ | |
| "test.text.values[0]" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "\"For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .\"" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 159 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "cBJTvJsoT0EP", | |
| "colab_type": "code", | |
| "outputId": "338bac31-c7e5-4d5c-bf8c-88096b6d1067", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| } | |
| }, | |
| "source": [ | |
| "x=[np.average(vectors.query(test.text.values[0].split(' ')), axis = 0)]\n", | |
| "forest.predict(x)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "array([1])" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 160 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "4Rc-5dxWT7S5", | |
| "colab_type": "code", | |
| "outputId": "aedc3643-de21-4563-bf4f-fda935fcca8b", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| } | |
| }, | |
| "source": [ | |
| "test.sentiment.values[0]" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "1" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 161 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "1kF5TpmrUP_a", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment