Last active
March 21, 2021 21:28
-
-
Save neoyipeng2018/73b61bec5e7034a92ea0a4e393202f7d to your computer and use it in GitHub Desktop.
Using Embeddings for Financial Sentiment Classification
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "name": "Using Embeddings for Financial Sentiment Classification", | |
| "provenance": [], | |
| "collapsed_sections": [ | |
| "7jk8iwKC57b7", | |
| "IjnvPbcXUmbz", | |
| "5toz2xc6UeNd", | |
| "Jq4Vqx0g6fK-" | |
| ], | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "accelerator": "GPU", | |
| "widgets": { | |
| "application/vnd.jupyter.widget-state+json": { | |
| "b39d76fce7e346dcb785811bc331b440": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HBoxModel", | |
| "state": { | |
| "_view_name": "HBoxView", | |
| "_dom_classes": [], | |
| "_model_name": "HBoxModel", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "box_style": "", | |
| "layout": "IPY_MODEL_0a2bc9b24529487da58592a7b3b581c8", | |
| "_model_module": "@jupyter-widgets/controls", | |
| "children": [ | |
| "IPY_MODEL_e0baf84f88184887b39f9b204e73d033", | |
| "IPY_MODEL_18b4e117c6924028aa2a5eab572a10ad" | |
| ] | |
| } | |
| }, | |
| "0a2bc9b24529487da58592a7b3b581c8": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "e0baf84f88184887b39f9b204e73d033": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "FloatProgressModel", | |
| "state": { | |
| "_view_name": "ProgressView", | |
| "style": "IPY_MODEL_2762aefdcf3644fe8e892090731d220d", | |
| "_dom_classes": [], | |
| "description": "100%", | |
| "_model_name": "FloatProgressModel", | |
| "bar_style": "success", | |
| "max": 1811, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": 1811, | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "orientation": "horizontal", | |
| "min": 0, | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_c9617ce14cd74ec79a76d3a9bfff29b2" | |
| } | |
| }, | |
| "18b4e117c6924028aa2a5eab572a10ad": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "state": { | |
| "_view_name": "HTMLView", | |
| "style": "IPY_MODEL_b6b486660a6347298f9b59601236c073", | |
| "_dom_classes": [], | |
| "description": "", | |
| "_model_name": "HTMLModel", | |
| "placeholder": "", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": " 1811/1811 [02:39<00:00, 11.37it/s]", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_4d766770c27948c4a22920e9fa397ba6" | |
| } | |
| }, | |
| "2762aefdcf3644fe8e892090731d220d": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "ProgressStyleModel", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "ProgressStyleModel", | |
| "description_width": "initial", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "bar_color": null, | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "c9617ce14cd74ec79a76d3a9bfff29b2": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "b6b486660a6347298f9b59601236c073": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "DescriptionStyleModel", | |
| "description_width": "", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "4d766770c27948c4a22920e9fa397ba6": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "bfb1e90fc662469dac770e832e072917": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HBoxModel", | |
| "state": { | |
| "_view_name": "HBoxView", | |
| "_dom_classes": [], | |
| "_model_name": "HBoxModel", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "box_style": "", | |
| "layout": "IPY_MODEL_90241d13a80545788bfc5aec35076748", | |
| "_model_module": "@jupyter-widgets/controls", | |
| "children": [ | |
| "IPY_MODEL_d04a55472f7b43538fce43ec6cbc15b8", | |
| "IPY_MODEL_63ba97555ef941218636097d9f920f31" | |
| ] | |
| } | |
| }, | |
| "90241d13a80545788bfc5aec35076748": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "d04a55472f7b43538fce43ec6cbc15b8": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "FloatProgressModel", | |
| "state": { | |
| "_view_name": "ProgressView", | |
| "style": "IPY_MODEL_bd84a3f1dea84a82ab7da822472cc088", | |
| "_dom_classes": [], | |
| "description": "100%", | |
| "_model_name": "FloatProgressModel", | |
| "bar_style": "success", | |
| "max": 453, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": 453, | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "orientation": "horizontal", | |
| "min": 0, | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_c7c8cf83c7ae4eefb3fce43172bcc59b" | |
| } | |
| }, | |
| "63ba97555ef941218636097d9f920f31": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "state": { | |
| "_view_name": "HTMLView", | |
| "style": "IPY_MODEL_922c377294ed4f4cbd97474fbfa607d0", | |
| "_dom_classes": [], | |
| "description": "", | |
| "_model_name": "HTMLModel", | |
| "placeholder": "", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": " 453/453 [00:38<00:00, 11.72it/s]", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_fa12a4671e7c49ccadf380d966ee4be1" | |
| } | |
| }, | |
| "bd84a3f1dea84a82ab7da822472cc088": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "ProgressStyleModel", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "ProgressStyleModel", | |
| "description_width": "initial", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "bar_color": null, | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "c7c8cf83c7ae4eefb3fce43172bcc59b": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "922c377294ed4f4cbd97474fbfa607d0": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "DescriptionStyleModel", | |
| "description_width": "", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "fa12a4671e7c49ccadf380d966ee4be1": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/neoyipeng2018/73b61bec5e7034a92ea0a4e393202f7d/magnitudephrasebank.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "gbjyDfFc_W_2", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "# Using Embeddings for Financial Sentiment Classification\n", | |
| "Adapted from this example notebook: https://colab.research.google.com/drive/1lOcAhIffLW8XC6QsKzt5T_ZqPP4Y9eS4\n", | |
| "\n", | |
| "Note: the original example is quite old (it targets Python 2), and its Keras model doesn't generalize well. So instead I average the word vectors of each sentence and classify with a plain Random Forest." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "7jk8iwKC57b7", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "## Setup" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "nO-VAs78CnJI", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "First, we'll install some dependencies and download the Magnitude file we wish to use." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "dsIvcCA5_FBI", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "!pip install -q pymagnitude #tensorflow keras\n", | |
| "#glove: !curl -s http://magnitude.plasticity.ai/glove+subword/glove.6B.50d.magnitude --output vectors.magnitude\n", | |
| "#word2vec: !curl -s http://magnitude.plasticity.ai/word2vec+subword/GoogleNews-vectors-negative300.magnitude --output vectors.magnitude\n", | |
| "#fastText: !curl -s http://magnitude.plasticity.ai/fasttext+subword/wiki-news-300d-1M.magnitude --output vectors.magnitude\n", | |
| "#elmo light: !curl -s http://magnitude.plasticity.ai/elmo/light/elmo_2x1024_128_2048cnn_1xhighway_weights.magnitude --output vectors.magnitude\n", | |
| "!curl -s http://magnitude.plasticity.ai/elmo/light/elmo_2x1024_128_2048cnn_1xhighway_weights.magnitude --output vectors.magnitude" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "95Xg9EyU-ZYr", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "Next, we'll import what we need to create our model and define some hyperparameters." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "1ODh9a4szHt6", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "from pymagnitude import *\n", | |
| "\n", | |
| "MAX_WORDS = 30 # The maximum number of words the sequence model will consider\n", | |
| "vectors = Magnitude('./vectors.magnitude', pad_to_length = MAX_WORDS)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "IjnvPbcXUmbz", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "## Getting Data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "AWllqkJDx_z7", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
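| "Download the Financial PhraseBank dataset from https://www.researchgate.net/publication/251231364_FinancialPhraseBank-v10 and put `Sentences_AllAgree.txt` in the working directory. I keep it in my Google Drive for convenience." | |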
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "H1fnkEco1Hyh", | |
| "colab_type": "code", | |
| "outputId": "b25130f7-85bc-401e-ac28-2a2826592604", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 139 | |
| } | |
| }, | |
| "source": [ | |
| "from google.colab import drive\n", | |
| "drive.mount('/content/gdrive', force_remount=True)\n", | |
| "%cd '/content/gdrive/My Drive/Colab Notebooks'" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly\n", | |
| "\n", | |
| "Enter your authorization code:\n", | |
| "··········\n", | |
| "Mounted at /content/gdrive\n", | |
| "/content/gdrive/My Drive/Colab Notebooks\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "aRNHigUG4mI3", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "#download from https://www.researchgate.net/publication/251231364_FinancialPhraseBank-v10\n", | |
| "f = open(\"Sentences_AllAgree.txt\", \"r\",encoding = \"ISO-8859-1\")\n", | |
| "test_file = f.read().split('\\n')" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "BcpdkqEKJbEK", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "from tqdm import notebook\n", | |
| "import pandas as pd" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "E9ksIuT1Jfsh", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "def avg_vec(df):\n", | |
| " vctrLs = []\n", | |
| " for txt in notebook.tqdm(df.text.values): vctrLs.append(np.average(vectors.query(txt.split(' ')), axis = 0))\n", | |
| " return np.array(vctrLs)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "Ak3pY1PLJfor", | |
| "colab_type": "code", | |
| "outputId": "5f22ff96-5fb6-47e3-98f4-eebf10f1140e", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 111 | |
| } | |
| }, | |
| "source": [ | |
| "df=pd.read_csv('Sentences_AllAgree.txt',encoding = \"ISO-8859-1\", names=['text','sentiment'], delimiter= '@')\n", | |
| "df.dropna(inplace=True)\n", | |
| "df.head(2)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>text</th>\n", | |
| " <th>sentiment</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>According to Gran , the company has no plans t...</td>\n", | |
| " <td>neutral</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>For the last quarter of 2010 , Componenta 's n...</td>\n", | |
| " <td>positive</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " text sentiment\n", | |
| "0 According to Gran , the company has no plans t... neutral\n", | |
| "1 For the last quarter of 2010 , Componenta 's n... positive" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 78 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "txX9T_6DJyH3", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "train=df.sample(frac=0.8,random_state=42)\n", | |
| "test=df.drop(train.index)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "R1--_SZCMZuz", | |
| "colab_type": "code", | |
| "outputId": "7ab097a9-c019-4374-e67b-818b8f759b8f", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 149, | |
| "referenced_widgets": [ | |
| "b39d76fce7e346dcb785811bc331b440", | |
| "0a2bc9b24529487da58592a7b3b581c8", | |
| "e0baf84f88184887b39f9b204e73d033", | |
| "18b4e117c6924028aa2a5eab572a10ad", | |
| "2762aefdcf3644fe8e892090731d220d", | |
| "c9617ce14cd74ec79a76d3a9bfff29b2", | |
| "b6b486660a6347298f9b59601236c073", | |
| "4d766770c27948c4a22920e9fa397ba6", | |
| "bfb1e90fc662469dac770e832e072917", | |
| "90241d13a80545788bfc5aec35076748", | |
| "d04a55472f7b43538fce43ec6cbc15b8", | |
| "63ba97555ef941218636097d9f920f31", | |
| "bd84a3f1dea84a82ab7da822472cc088", | |
| "c7c8cf83c7ae4eefb3fce43172bcc59b", | |
| "922c377294ed4f4cbd97474fbfa607d0", | |
| "fa12a4671e7c49ccadf380d966ee4be1" | |
| ] | |
| } | |
| }, | |
| "source": [ | |
| "xTrn,xTest=avg_vec(train),avg_vec(test)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "b39d76fce7e346dcb785811bc331b440", | |
| "version_minor": 0, | |
| "version_major": 2 | |
| }, | |
| "text/plain": [ | |
| "HBox(children=(FloatProgress(value=0.0, max=1811.0), HTML(value='')))" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| } | |
| }, | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "/usr/local/lib/python3.6/dist-packages/pymagnitude/third_party/allennlp/nn/util.py:116: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than tensor.new_tensor(sourceTensor).\n", | |
| " index_range = sequence_lengths.new_tensor(torch.arange(0, len(sequence_lengths)))\n" | |
| ], | |
| "name": "stderr" | |
| }, | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "\n" | |
| ], | |
| "name": "stdout" | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "bfb1e90fc662469dac770e832e072917", | |
| "version_minor": 0, | |
| "version_major": 2 | |
| }, | |
| "text/plain": [ | |
| "HBox(children=(FloatProgress(value=0.0, max=453.0), HTML(value='')))" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| } | |
| }, | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "5toz2xc6UeNd", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "## Modelling - Good Ol Random Forest" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "4yO5BE7bJa5c", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "from sklearn.ensemble import RandomForestClassifier\n", | |
| "from sklearn.metrics import confusion_matrix" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "TvfwUeozJXIT", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "forest = RandomForestClassifier(n_estimators=100, random_state=0, max_features=0.5, \n", | |
| " max_depth=4 ,min_samples_split=5,\n", | |
| " oob_score=True, n_jobs=-1, min_samples_leaf=50)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "JsOZkLFKPCnl", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "def oneHot(row):\n", | |
| " if row=='negative': return -1\n", | |
| " if row=='neutral' : return 0\n", | |
| " if row=='positive' : return +1" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "y6cgq8bePWY4", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "train.sentiment=train.sentiment.apply(oneHot)\n", | |
| "test.sentiment =test.sentiment.apply(oneHot)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "O8W73LKmOsrx", | |
| "colab_type": "code", | |
| "outputId": "b928d677-d679-4beb-ad1d-0ac372ebe785", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 153 | |
| } | |
| }, | |
| "source": [ | |
| "forest.fit(xTrn, train.sentiment)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n", | |
| " criterion='gini', max_depth=4, max_features=0.5,\n", | |
| " max_leaf_nodes=None, max_samples=None,\n", | |
| " min_impurity_decrease=0.0, min_impurity_split=None,\n", | |
| " min_samples_leaf=50, min_samples_split=5,\n", | |
| " min_weight_fraction_leaf=0.0, n_estimators=100,\n", | |
| " n_jobs=-1, oob_score=True, random_state=0, verbose=0,\n", | |
| " warm_start=False)" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 127 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "eg-K2b3LPd3K", | |
| "colab_type": "code", | |
| "outputId": "0a0a0766-30ae-4806-93bf-034be834690e", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 51 | |
| } | |
| }, | |
| "source": [ | |
| "print(\"Accuracy on training set: {:.3f}\".format(forest.score(xTrn, train.sentiment)))\n", | |
| "oldscore = forest.oob_score_\n", | |
| "print(f'OOB score is {oldscore*100:.1f}%')\n", | |
| "#print('Out-of-bag score estimate: {:.3}'.format())" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Accuracy on training set: 0.795\n", | |
| "OOB score is 76.7%\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "Tf8FUNBJRtzK", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "import matplotlib.pyplot as plt" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "AvRNn5IZPjxH", | |
| "colab_type": "code", | |
| "outputId": "2ce3834c-ab94-481b-fac1-6b28dc3676ab", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 331 | |
| } | |
| }, | |
| "source": [ | |
| "y_predict = forest.predict(xTest)\n", | |
| "confusion_matrix(test.sentiment, y_predict)\n", | |
| "\n", | |
| "\n", | |
| "cm = confusion_matrix(test.sentiment, y_predict)\n", | |
| "print(\"Confusion matrix:\\n{}\".format(cm))\n", | |
| "\n", | |
| "\n", | |
| "#Show confusion matrix in a separate window\n", | |
| "plt.matshow(cm)\n", | |
| "plt.title('Confusion matrix')\n", | |
| "plt.colorbar()\n", | |
| "plt.ylabel('True label')\n", | |
| "plt.xlabel('Predicted label')\n", | |
| "\n", | |
| "\n", | |
| "fmt = '.2f' #if normalize else 'd'\n", | |
| "plt.show()" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Confusion matrix:\n", | |
| "[[ 7 13 41]\n", | |
| " [ 0 280 6]\n", | |
| " [ 2 40 64]]\n" | |
| ], | |
| "name": "stdout" | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "image/png": "iVBORw0KGgoAAAANSUhEUgAAAQwAAAD2CAYAAAAj8rlYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAYvUlEQVR4nO3de7RcZX3G8e+TAAkEiMaEGAMI1qhNaUGacm0xQlWgl2BXRYQqtXShFbyBq4K1QrW0rrZeqgI2CDWIBqGAREECUllIF2gSjEgSkJRLSQiEAELkmnPOr3/sd2QI58y8c5iZPXvm+ay1V2ZfZr/vHJjfvPetiMDMLMeEsjNgZtXhgGFm2RwwzCybA4aZZXPAMLNsDhhmls0Bo2SStpf0XUmPS7r0JdznOEnXtjNvZZH0B5LuLDsf9mLyOIw8ko4FTgHeAGwGVgJnRcRNL/G+7wY+CBwUEUMvOaM9TlIAcyJibdl5sda5hJFB0inAF4F/AmYCuwPnAAvacPtXA78YhGCRQ9I2ZefBGogIbw02YCrwK+AdDa6ZRBFQHkjbF4FJ6dx8YB1wKrAR2AC8N537B+A5YEtK4wTgTOCiunvvAQSwTdr/S+BuilLOPcBxdcdvqnvfQcAy4PH070F1524APgP8T7rPtcD0MT5bLf9/W5f/o4AjgV8AjwKfqLt+P+Bm4Jfp2q8A26VzN6bP8mT6vO+su//HgQeBb9SOpff8Rkpj37T/KuBhYH7Z/2+0Y3vr/B3id39nUtYGXFN2fh3NmzsQmAxc0eCavwMOAPah+EJcCXwS+Pt0/pUUgWc28BbgvyR9JyLOSEX010bEXwBIOnOsRCRNAb4E/F5E3ClpFjBtlOumAVcBHwIWA+8ArpL02oh4JF12LHAEcD/wfeBjwGljJP3K9DeYTRGYzgOuA36XorS1XNLiiLgHGAY+CiwHdk33/gDwxYg4JH3evSNVSSTNT/efRlHamgDsX0s4Iv5X0seBiyTNA/4TWBQRN4z1d6qSTY8O8+Olu2Zdu+2s/53e4ew05SpJc68ANkXjKsNxwKcjYmNEPExRcnh33fkt6fyWiLia4tf19ePMzwiwl6TtI2JDRKwa5Zo/Au6KiG9ExFBELAbuAP6k7pr/jIhfRMTTwCUUwW4sWyjaa7YAFwPTgX+PiM0p/dXA3gARsSIibknp3gv8B/CmjM90RkQ8m/LzAhFxHrAW+DEwiyJA94lgOEaytl7ggNHcI8D0JnXrVwH31e3fl479+h5bBZyngB1bzUhEPElRjH8/sEHSVZLekJGfWp5m1+0/2EJ+HomI4fS69oV+qO7807X3S3qdpO9JelDSExTtPs1+GR+OiGeaXHMesBfw5Yh4tsm1lRHACJG19QIHjOZuBp6lqLeP5QGK4nTN7unYeDwJ7FC3/8r6kxGxNCLeQvFLewfFF6lZfmp5Wj/OPLXiXIp8zYmInYFPAGrynobfBkk7UrQLnQ+cmapcfSEItsRw1tYLBjpgSDpc0p2S1koatf4eEY8DnwLOlnSUpB0kbSvpCEn/ki5bDHxS0gxJ09P1F40zWyuBQyTtLmkqcHpdfmdKWpDaMp6lqNqMVla9GnidpGMlbSPpnRTtDf8q6fZx5ivXTsATwK9S6edvtjr/EPCaFu/578DyiPhriraZrza6WNJukn4oabWkVZI+3GJ6XeUSRgVImgicTdHwNxd4l6S5o10bEZ+jGIPxSYoW+vuBk4HvpEv+kaKR7zbg58Ct6VjLIuI64NvpXiuA79WdnpDy8QBFz8GbePEXktSw+ccUPTOPUPRwnAK8bTx5atHHKBpUN1OUfr691fkzgUWSfinp6GY3k7QAOJznP+cpwL6SjmvwtiHg1IiYS9EYfdJY/23LFsAwkbX1goEduCXpQODMiHhb2j8dICL+udSMdZCkPYDvRcReJWelqyRdCXwlBeOess/e28V135+Rde0usx9Y
ERHzOpylhga5W3U2RUmhZh113XnWH1KQfCNFD0vPCWC4Qj/agxwwrM+lxtLLgI9ExBNl52csvdFhmmeQA8Z6YLe6/V3pTi+CdYGkbSmCxTcj4vKy8zOW6KH2iRyDHDCWAXMk7UkRKI6haKyzipMkii7YNRHx+bLz00gEbKlOvBjcXpI0kOpkYCmwBrhkjFGTfUHSYooxJa+XtE7SCWXnqYMOphhpe6iklWk7suxMjU4MZ269YJBLGKRh2leXnY9uiIh3lZ2HboliyYHe+IY1EcBIhUoYAx0wzHpBr5QecjhgmJWoGLjlgGFmmUbCAcPMMriEYWbZArElJpadjWwD261aI+nEsvPQTYP0eavwWWsljKp0qw58wAB6/n+qNhukz1uBzyqGY0LW1vROY0zrl3SmpPWjjUmRdHpa3uFOSU1nM7tKYlaiYsWttv1u16b13yppJ2CFpNoM3S9ExL/VX5ym/B8D/BbFKm0/kPS6utXVXqSnAsZ2mhzba0pX05ysKUyd8Ipyhs5M6H4Bb/KEKUzdZkbXP29M2rbbSTJ5u6nsPOVVXf+szzz7S54beiq7DtGu6kZEbKBYqZ2I2CxpDS9clnFrC4CL05KH90hay/Orvo+qpwLG9prCAZN7dARvB2jypLKz0DUje+atjN0PbrljYfa1EcqqbrRqq2n9BwMnS3oPxUJPp0bEYxTB5Ja6t62jcYBxG4ZZ2UZQ1kaxGPXyum3UNppRpvWfS/F8l30oSiCfG29ee6qEYTZoAvFcZH8NNzVbcWu0af0R8VDd+fN4ftnHlpd4cAnDrES1Rs+crZmxpvWnB17VvB2oLQS9BDhG0qS0zMMc4CeN0nAJw6xkw+0bGl6b1v9zSSvTsU9QLHBdeyrfvcD7ACJilaRLKB5ENQSc1KiHBBwwzEoViOE2FfQbTOsfcwmHiDgLOCs3DQcMs5KNdKCXpFMcMMxKVAwNd8AwswxVm3zmgGFWogg6MnCrUxwwzEr160FZleCAYVai4slnLmGYWSY3eppZlkBe09PM8rmEYWZZ3K1qZtmKJ5+5hGFmmXplgd8cDhhmJYqQSxhmls/jMMwsS7GAjqskZpalM4sAd4oDhlmJAtytamZ5PNLTzFrSxiefdVxHcyrp8PTMxrWSTutkWmZVVKyHoaytF3SshCFpInA28BaKJyotk7QkIlZ3Kk2zKnKVpLAfsDYi7gaQdDHFsxwdMMySog2jOlWSTgaM2cD9dfvrgP07mJ5ZJXloeAvS8yFPhOJJ6maDJBBDI+5WhcznNkbEQmAhwNQJr4gO5sesJ3mkZ2EZMCc9s3E9cAxwbAfTM6ucWi9JVXQsYETEkKSTgaXAROCCiFjVqfTMqsqNnklEXE2D5zqaDTqP9DSzlrgNw8yyFEv0OWCYWY6oVrdqdVpbzPpQbQGdnK0ZSbtJ+qGk1ZJWSfpwOj5N0nWS7kr/vjwdl6Qvpblet0nat1kaDhhmJRsJZW0ZhoBTI2IucABwkqS5wGnA9RExB7g+7QMcAcxJ24nAuc0ScMAwK1GtDaMdASMiNkTEren1ZmANxRSNBcCidNki4Kj0egFwYRRuAV4maVajNNyGYVayFho9p0taXre/MI2UfhFJewBvBH4MzIyIDenUg8DM9Hq0+V6zgQ2MwQHDrEQtjsPYFBHzml0kaUfgMuAjEfGE9Pz9IyIkjXsKhgOGWZkChto40lPSthTB4psRcXk6/JCkWRGxIVU5NqbjWfO96rkNw6xE7WzDUFGUOB9YExGfrzu1BDg+vT4euLLu+HtSb8kBwON1VZdRuYRhVrI2Dtw6GHg38HNJK9OxTwCfBS6RdAJwH3B0Onc1cCSwFngKeG+zBBwwzErUzrkkEXETjDlg47BRrg/gpFbScMAwK1l4aLiZ5fLkMzPLEuHJZ2aWTQyPVKez0gHDrGRuwzCzLF4Pw8zyRdGOURUOGGYlcy+JmWUJ3IZhZtm8ariZtWBkxAHDzDJEuEoybhHByDPPlJ2N
rll69y1lZ6FrDt9z/7Kz0D3PPtvS5a6SmFk2d6uaWTZXScwsSyAHDDPLV6EaiQOGWakCwt2qZpbLVRIzy9YXvSSSvkyD6lVEfKgjOTIbIP00l2R5g3Nm1g4B9EPAiIhF9fuSdoiIpzqfJbPBUqUqSdPFBCUdKGk1cEfa31vSOR3PmdmgiMytB+SsPvpF4G3AIwAR8TPgkE5mymxwiBjJ23pBVi9JRNxf/wRoYLgz2TEbMH04W/V+SQcBkZ4M/WFgTWezZTZAeqS6kSOnSvJ+iucvzgYeAPahxecxmlkjytzK17SEERGbgOO6kBezwdRPJQxJr5H0XUkPS9oo6UpJr+lG5swGQp/1knwLuASYBbwKuBRY3MlMmQ2MNPmsXb0kki5IP+y31x07U9J6SSvTdmTdudMlrZV0p6S3Nbt/TsDYISK+ERFDabsImJyVezNrrr0ljK8Dh49y/AsRsU/argaQNBc4Bvit9J5zJE1sdPMxA4akaZKmAd+XdJqkPSS9WtLfAldnZ9/MGgvlbTm3irgReDQz5QXAxRHxbETcA6wF9mv0hkaNniso4lotp++rzxdwemamzKwBdad94mRJ76GYI3ZqRDxG0fNZvxL1unRsTI3mkuzZjlyaWQOtVTemS6qfFLowIhZmvO9c4DMppc8AnwP+qoVc/lrWSE9JewFzqWu7iIgLx5OgmdXLr24AmyJiXqspRMRDv05NOg/4XtpdD+xWd+mu6diYcrpVzwC+nLY3A/8C/GlrWTazMXW4W1XSrLrdtwO1HpQlwDGSJknaE5gD/KTRvXJKGH8O7A38NCLeK2kmcFHr2TazUY2071aSFgPzKaov64AzgPmS9qEIO/eS2iMjYpWkS4DVwBBwUkQ0nCeWEzCejogRSUOSdgY28sJizFgZvwD4Y2BjROyVkY7Z4GnzAjoR8a5RDp/f4PqzgLNy758zDmO5pJcB51H0nNwK3Jzxvq8zen+wmdVR5G29IGcuyQfSy69KugbYOSJuy3jfjZL2eGnZMxsAPRIMcjRaBHjfRuci4tbOZMnMelWjEsbnGpwL4NB2ZEDSicCJAJPZoR23NKuUXqlu5Gg0cOvN3chAGniyEGBnTavQn86sTfpsxS0z65Sgrd2qnZbTSzIuqT/4ZuD1ktZJOqFTaZlVWV/1kozXGP3BZra1HgkGOXKGhkvSX0j6VNrfXVLDKbBm1oI+W3HrHOBAoFZi2Ayc3bEcmQ2Q3OpIlaok+0fEvpJ+ChARj0narsP5MhscfdZLsiUt2xUAkmZQqXZdsx7XI6WHHDkB40vAFcAuks6imL36yY7mymyAqEI/vzlzSb4paQVwGMVyfUdFhJ98ZtYOPdQ+kaNpwJC0O/AU8N36YxHxf53MmNnA6KeAAVzF84sBTwb2BO6kWJrczF6qfgoYEfHb9ftpFusHxrjczFpUpSpJy0PD07T2/TuQFzPrcTltGKfU7U4A9qV4iruZtUOFShg5bRg71b0eomjTuKwz2TEbMNFH3appwNZOEfGxLuXHbPD0QwlD0jYRMSTp4G5myGyQiGo1ejYqYfyEor1ipaQlwKXAk7WTEXF5h/NmNhj6JGDUTAYeoVjDszYeIwAHDLOXqo9Geu6Sekhu54VPcYdKxUSzHlehb1OjgDER2JEXBoqaCn1Es97WL70kGyLi013LidmgqtDPb6OAUZ1VPcyqqoeW38vRKGAc1rVcmA2wvmj0jIhHu5kRs4HVDwHDzLqjSiWMjj3IyMwytfExA5IukLRR0u11x6ZJuk7SXenfl6fjkvQlSWsl3dboAew1vVfC0OC0tR5x5LFlZ6FrnjlsStlZ6Jq46YbsazvwCIGvA18BLqw7dhpwfUR8VtJpaf/jwBHAnLTtD5xLk6UrXMIwK1sbSxgRcSOwdfvjAmBRer0IOKru+IVRuAV4maRZje7vgGFWsi48yGhmRGxIrx8EZqbXs4H7665bl46NqfeqJGaDJj8YTJe0vG5/YUQsbCmpiJDG
H34cMMzKlv/13RQR88aRwkOSZkXEhlTl2JiOrwd2q7tu13RsTK6SmJWpO89WXQIcn14fD1xZd/w9qbfkAODxuqrLqFzCMCtbG3tJJC0G5lNUX9YBZwCfBS6RdAJwH3B0uvxq4EhgLcWzh97b7P4OGGYla+ds1Yh41xinXjTVIyICOKmV+ztgmJWsSiM9HTDMytRHs1XNrBscMMwsRz+tGm5m3eCAYWa5FNWJGA4YZmXqp0clmlkXVKeA4YBhVjY3eppZPgcMM8vSR49KNLNucMAwsxweuGVmLdFIdSKGA4ZZmTz5zMxa4YFbZpbPJQwzy+VGTzPLE0CFJp91bNVwSbtJ+qGk1ZJWSfpwp9IyqzKN5G29oJMljCHg1Ii4VdJOwApJ10XE6g6maVYpHoeRpOcbbEivN0taQ/EYNgcMs5qISlVJutKGIWkP4I3Aj7uRnlmVuIRRR9KOwGXARyLiiVHOnwicCDCZHTqdHbPe44BRkLQtRbD4ZkRcPto16WGyCwF21rQK/enM2sMlDECSgPOBNRHx+U6lY1ZpAVRoLkknH8Z8MPBu4FBJK9N2ZAfTM6skd6sCEXETRa+RmTXiXhIzy+U2DDPL4+ntZparGOlZnYjhgGFWtjY2aEq6F9gMDANDETFP0jTg28AewL3A0RHx2Hju38leEjPLoIisrQVvjoh9ImJe2j8NuD4i5gDXp/1xccAwK1NEMQ4jZxu/BcCi9HoRcNR4b+SAYVYyRd6WKYBrJa1I0y4AZqbJoAAPAjPHm1e3YZiVLb+6MV3S8rr9hWlqRb3fj4j1knYBrpN0xwuTipDG35HrgGFWptae3r6prl1i9NtFrE//bpR0BbAf8JCkWRGxQdIsYON4s+sqiVnZamtiNNuakDQlLVaFpCnAW4HbgSXA8emy44Erx5tVlzDMyta+YRgzgSuKeZ9sA3wrIq6RtAy4RNIJwH3A0eNNwAHDrGTtGrgVEXcDe49y/BHgsHak4YBhVqYAhj3S08wyiJYHZZXKAcOsbA4YZpbNAcPMsgRtnXzWaQ4YZiVzG4aZ5XPAMLMsETBSnTqJA4ZZ2aoTLxwwzMrmNgwzy+eAYWZZKvbks54KGJt5bNMPRi69r8vJTgc2dTnNwk9LSbWczztInxVenX9p3tT1XtFTASMiZnQ7TUnLmy1K0k8G6fNW5rM6YJhZlgCGq9NN4oBhVqqAcMCokq0XUe13g/R5q/FZK1QlGfg1PUdZdbmtJA1LWinpdkmXStrhJdzr65L+PL3+mqS5Da6dL+mgrY83+7yS7pU0Pff4Vtf8qtH5Ua4/U9LHWnlPKzr937Ytar0knX0uSdsMfMDogqfTU6j2Ap4D3l9/UtK4SnkR8dcRsbrBJfOBFwUM60FtWgS4GxwwuutHwGvTr/+PJC0BVkuaKOlfJS2TdJuk9wGo8BVJd0r6AbBL7UaSbpA0L70+XNKtkn4m6XpJe1AEpo+m0s0fSJoh6bKUxjJJB6f3vkLStZJWSfoaxfOBG5L0nfSgnFV1D8upnftCOn69pBnp2G9Iuia950eS3tCOP2bfqFDAcBtGl6SSxBHANenQvsBeEXFP+tI9HhG/J2kS8D+SrgXeCLwemEuxIvRq4IKt7jsDOA84JN1rWkQ8KumrwK8i4t/Sdd8CvhARN0naHVgK/CZwBnBTRHxa0h8BJ2R8nL9KaWwPLJN0WVpodgqwPCI+KulT6d4nU7QlvD8i7pK0P3AOcOg4/oz9JwKGh8vORTYHjM7bXtLK9PpHwPkUVYWfRMQ96fhbgd+ptU8AU4E5wCHA4ogYBh6Q9N+j3P8A4MbavSLi0THy8YfA3LQEPcDOknZMafxZeu9VknKe6v0hSW9Pr3dLeX2EYhrVt9Pxi4DLUxoHAZfWpT0pI43B0SOlhxwOGJ33dETsU38gfXGerD8EfDAilm513ZFtzMcE4ICIeGaUvGSTNJ8i+BwYEU9JugGYPMblkdL95dZ/A6tToYDhNozesBT4G0nb
Akh6XXpy1Y3AO1MbxyzgzaO89xbgEEl7pvdOS8c3AzvVXXct8MHajqTaF/hG4Nh07Ajg5U3yOhV4LAWLN1CUcGomALVS0rEUVZ0ngHskvSOlIUkvenbG4OrK09vbxgGjN3yNon3iVkm3A/9BUfq7ArgrnbsQuHnrN0bEw8CJFMX/n/F8leC7wNtrjZ7Ah4B5qVF1Nc/31vwDRcBZRVE1+b8meb0G2EbSGuCzFAGr5klgv/QZDgU+nY4fB5yQ8rcKWJDxNxkMAREjWVsvUFSoOGTWb6ZuMyMO3PmorGuXPva1FWXPjXEbhlnZKvSj7YBhViZ3q5pZK8KLAJtZnt4ZxZnDAcOsTBVbos/dqmZli5G8LUOaV3SnpLWSTmt3Vl3CMCtRANGmEoakicDZwFuAdRTzfJY0mdXcEpcwzMoU0c4Sxn7A2oi4OyKeAy6mzYPkXMIwK1m0r1t1NnB/3f46YP923RwcMMxKtZnHlv4g/qvhSmZ1JktaXre/sNurijlgmJUoIg5v4+3WUyw3ULNrOtY2bsMw6x/LgDmS9pS0HXAMsKSdCbiEYdYnImJI0skUyyVMBC6IiFXtTMOzVc0sm6skZpbNAcPMsjlgmFk2Bwwzy+aAYWbZHDDMLJsDhpllc8Aws2z/D3pim9MC48yhAAAAAElFTkSuQmCC\n", | |
| "text/plain": [ | |
| "<Figure size 288x288 with 2 Axes>" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [], | |
| "needs_background": "light" | |
| } | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "lA3dXqDuOsom", | |
| "colab_type": "code", | |
| "outputId": "e560d9b7-145b-4ccd-f423-74ede2b43098", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| } | |
| }, | |
| "source": [ | |
| "print(\"Accuracy on test set: {:.3f}\".format(forest.score(xTest, test.sentiment)))" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Accuracy on test set: 0.775\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "Jq4Vqx0g6fK-", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "## Predicting with the Model" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "kjwQm2Sryj5M", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "Since the model has been trained successfully, we can evaluate its performance on some test queries, using Magnitude to convert the test queries into a sequence of vectors that can be passed directly into the model for inference (prediction)." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "n4U11gR9TuxF", | |
| "colab_type": "code", | |
| "outputId": "ee747375-61af-48a9-a5d9-8f08e37c3d81", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| } | |
| }, | |
| "source": [ | |
| "test.text.values[0]" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "\"For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .\"" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 159 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "cBJTvJsoT0EP", | |
| "colab_type": "code", | |
| "outputId": "338bac31-c7e5-4d5c-bf8c-88096b6d1067", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| } | |
| }, | |
| "source": [ | |
| "x=[np.average(vectors.query(test.text.values[0].split(' ')), axis = 0)]\n", | |
| "forest.predict(x)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "array([1])" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 160 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "4Rc-5dxWT7S5", | |
| "colab_type": "code", | |
| "outputId": "aedc3643-de21-4563-bf4f-fda935fcca8b", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| } | |
| }, | |
| "source": [ | |
| "test.sentiment.values[0]" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "1" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 161 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "1kF5TpmrUP_a", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment