Skip to content

Instantly share code, notes, and snippets.

@neoyipeng2018
Last active March 21, 2021 21:28
Show Gist options
  • Select an option

  • Save neoyipeng2018/73b61bec5e7034a92ea0a4e393202f7d to your computer and use it in GitHub Desktop.

Select an option

Save neoyipeng2018/73b61bec5e7034a92ea0a4e393202f7d to your computer and use it in GitHub Desktop.
Using Embeddings for Financial Sentiment Classification
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Using Embeddings for Financial Sentiment Classification",
"provenance": [],
"collapsed_sections": [
"7jk8iwKC57b7",
"IjnvPbcXUmbz",
"5toz2xc6UeNd",
"Jq4Vqx0g6fK-"
],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU",
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"b39d76fce7e346dcb785811bc331b440": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_0a2bc9b24529487da58592a7b3b581c8",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_e0baf84f88184887b39f9b204e73d033",
"IPY_MODEL_18b4e117c6924028aa2a5eab572a10ad"
]
}
},
"0a2bc9b24529487da58592a7b3b581c8": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"e0baf84f88184887b39f9b204e73d033": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_2762aefdcf3644fe8e892090731d220d",
"_dom_classes": [],
"description": "100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 1811,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 1811,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_c9617ce14cd74ec79a76d3a9bfff29b2"
}
},
"18b4e117c6924028aa2a5eab572a10ad": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_b6b486660a6347298f9b59601236c073",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 1811/1811 [02:39<00:00, 11.37it/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_4d766770c27948c4a22920e9fa397ba6"
}
},
"2762aefdcf3644fe8e892090731d220d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"c9617ce14cd74ec79a76d3a9bfff29b2": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"b6b486660a6347298f9b59601236c073": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"4d766770c27948c4a22920e9fa397ba6": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"bfb1e90fc662469dac770e832e072917": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_90241d13a80545788bfc5aec35076748",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_d04a55472f7b43538fce43ec6cbc15b8",
"IPY_MODEL_63ba97555ef941218636097d9f920f31"
]
}
},
"90241d13a80545788bfc5aec35076748": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"d04a55472f7b43538fce43ec6cbc15b8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_bd84a3f1dea84a82ab7da822472cc088",
"_dom_classes": [],
"description": "100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 453,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 453,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_c7c8cf83c7ae4eefb3fce43172bcc59b"
}
},
"63ba97555ef941218636097d9f920f31": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_922c377294ed4f4cbd97474fbfa607d0",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 453/453 [00:38<00:00, 11.72it/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_fa12a4671e7c49ccadf380d966ee4be1"
}
},
"bd84a3f1dea84a82ab7da822472cc088": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"c7c8cf83c7ae4eefb3fce43172bcc59b": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"922c377294ed4f4cbd97474fbfa607d0": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"fa12a4671e7c49ccadf380d966ee4be1": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/neoyipeng2018/73b61bec5e7034a92ea0a4e393202f7d/magnitudephrasebank.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "gbjyDfFc_W_2",
"colab_type": "text"
},
"source": [
"# Using Embeddings for Financial Sentiment Classification\n",
"Adapted from this example: https://colab.research.google.com/drive/1lOcAhIffLW8XC6QsKzt5T_ZqPP4Y9eS4.\n",
"\n",
"Note: the original example is quite old (Python 2!) and its Keras model doesn't generalize well, so I averaged the word vectors of each sentence and used a good ol' Random Forest instead."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7jk8iwKC57b7",
"colab_type": "text"
},
"source": [
"## Setup"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "nO-VAs78CnJI",
"colab_type": "text"
},
"source": [
"First, we'll install some dependencies and download the Magnitude file we wish to use."
]
},
{
"cell_type": "code",
"metadata": {
"id": "dsIvcCA5_FBI",
"colab_type": "code",
"colab": {}
},
"source": [
"!pip install -q pymagnitude #tensorflow keras\n",
"#glove: !curl -s http://magnitude.plasticity.ai/glove+subword/glove.6B.50d.magnitude --output vectors.magnitude\n",
"#word2vec: !curl -s http://magnitude.plasticity.ai/word2vec+subword/GoogleNews-vectors-negative300.magnitude --output vectors.magnitude\n",
"#fastText: !curl -s http://magnitude.plasticity.ai/fasttext+subword/wiki-news-300d-1M.magnitude --output vectors.magnitude\n",
"#elmo light: !curl -s http://magnitude.plasticity.ai/elmo/light/elmo_2x1024_128_2048cnn_1xhighway_weights.magnitude --output vectors.magnitude\n",
"!curl -s http://magnitude.plasticity.ai/elmo/light/elmo_2x1024_128_2048cnn_1xhighway_weights.magnitude --output vectors.magnitude"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "95Xg9EyU-ZYr",
"colab_type": "text"
},
"source": [
"Next, we'll import what we need to create our model and define some hyperparameters."
]
},
{
"cell_type": "code",
"metadata": {
"id": "1ODh9a4szHt6",
"colab_type": "code",
"colab": {}
},
"source": [
"from pymagnitude import *\n",
"\n",
"MAX_WORDS = 30 # The maximum number of words the sequence model will consider\n",
"vectors = Magnitude('./vectors.magnitude', pad_to_length = MAX_WORDS)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "IjnvPbcXUmbz",
"colab_type": "text"
},
"source": [
"## Getting Data"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "AWllqkJDx_z7",
"colab_type": "text"
},
"source": [
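"Download the Financial PhraseBank dataset from https://www.researchgate.net/publication/251231364_FinancialPhraseBank-v10. I keep it in my Google Drive for convenience, so the next cell mounts the drive and changes into the notebook folder."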
]
},
{
"cell_type": "code",
"metadata": {
"id": "H1fnkEco1Hyh",
"colab_type": "code",
"outputId": "b25130f7-85bc-401e-ac28-2a2826592604",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 139
}
},
"source": [
"from google.colab import drive\n",
"drive.mount('/content/gdrive', force_remount=True)\n",
"%cd '/content/gdrive/My Drive/Colab Notebooks'"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly\n",
"\n",
"Enter your authorization code:\n",
"··········\n",
"Mounted at /content/gdrive\n",
"/content/gdrive/My Drive/Colab Notebooks\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "aRNHigUG4mI3",
"colab_type": "code",
"colab": {}
},
"source": [
"#download from https://www.researchgate.net/publication/251231364_FinancialPhraseBank-v10\n",
"f = open(\"Sentences_AllAgree.txt\", \"r\",encoding = \"ISO-8859-1\")\n",
"test_file = f.read().split('\\n')"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "BcpdkqEKJbEK",
"colab_type": "code",
"colab": {}
},
"source": [
"from tqdm import notebook\n",
"import pandas as pd"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "E9ksIuT1Jfsh",
"colab_type": "code",
"colab": {}
},
"source": [
"def avg_vec(df):\n",
" vctrLs = []\n",
" for txt in notebook.tqdm(df.text.values): vctrLs.append(np.average(vectors.query(txt.split(' ')), axis = 0))\n",
" return np.array(vctrLs)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Ak3pY1PLJfor",
"colab_type": "code",
"outputId": "5f22ff96-5fb6-47e3-98f4-eebf10f1140e",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 111
}
},
"source": [
"df=pd.read_csv('Sentences_AllAgree.txt',encoding = \"ISO-8859-1\", names=['text','sentiment'], delimiter= '@')\n",
"df.dropna(inplace=True)\n",
"df.head(2)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>sentiment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>According to Gran , the company has no plans t...</td>\n",
" <td>neutral</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>For the last quarter of 2010 , Componenta 's n...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text sentiment\n",
"0 According to Gran , the company has no plans t... neutral\n",
"1 For the last quarter of 2010 , Componenta 's n... positive"
]
},
"metadata": {
"tags": []
},
"execution_count": 78
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "txX9T_6DJyH3",
"colab_type": "code",
"colab": {}
},
"source": [
"train=df.sample(frac=0.8,random_state=42)\n",
"test=df.drop(train.index)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "R1--_SZCMZuz",
"colab_type": "code",
"outputId": "7ab097a9-c019-4374-e67b-818b8f759b8f",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 149,
"referenced_widgets": [
"b39d76fce7e346dcb785811bc331b440",
"0a2bc9b24529487da58592a7b3b581c8",
"e0baf84f88184887b39f9b204e73d033",
"18b4e117c6924028aa2a5eab572a10ad",
"2762aefdcf3644fe8e892090731d220d",
"c9617ce14cd74ec79a76d3a9bfff29b2",
"b6b486660a6347298f9b59601236c073",
"4d766770c27948c4a22920e9fa397ba6",
"bfb1e90fc662469dac770e832e072917",
"90241d13a80545788bfc5aec35076748",
"d04a55472f7b43538fce43ec6cbc15b8",
"63ba97555ef941218636097d9f920f31",
"bd84a3f1dea84a82ab7da822472cc088",
"c7c8cf83c7ae4eefb3fce43172bcc59b",
"922c377294ed4f4cbd97474fbfa607d0",
"fa12a4671e7c49ccadf380d966ee4be1"
]
}
},
"source": [
"xTrn,xTest=avg_vec(train),avg_vec(test)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b39d76fce7e346dcb785811bc331b440",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, max=1811.0), HTML(value='')))"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/pymagnitude/third_party/allennlp/nn/util.py:116: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than tensor.new_tensor(sourceTensor).\n",
" index_range = sequence_lengths.new_tensor(torch.arange(0, len(sequence_lengths)))\n"
],
"name": "stderr"
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "bfb1e90fc662469dac770e832e072917",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, max=453.0), HTML(value='')))"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "5toz2xc6UeNd",
"colab_type": "text"
},
"source": [
"## Modelling - Good Ol Random Forest"
]
},
{
"cell_type": "code",
"metadata": {
"id": "4yO5BE7bJa5c",
"colab_type": "code",
"colab": {}
},
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import confusion_matrix"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "TvfwUeozJXIT",
"colab_type": "code",
"colab": {}
},
"source": [
"forest = RandomForestClassifier(n_estimators=100, random_state=0, max_features=0.5, \n",
" max_depth=4 ,min_samples_split=5,\n",
" oob_score=True, n_jobs=-1, min_samples_leaf=50)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "JsOZkLFKPCnl",
"colab_type": "code",
"colab": {}
},
"source": [
"def oneHot(row):\n",
" if row=='negative': return -1\n",
" if row=='neutral' : return 0\n",
" if row=='positive' : return +1"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "y6cgq8bePWY4",
"colab_type": "code",
"colab": {}
},
"source": [
"train.sentiment=train.sentiment.apply(oneHot)\n",
"test.sentiment =test.sentiment.apply(oneHot)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "O8W73LKmOsrx",
"colab_type": "code",
"outputId": "b928d677-d679-4beb-ad1d-0ac372ebe785",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 153
}
},
"source": [
"forest.fit(xTrn, train.sentiment)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n",
" criterion='gini', max_depth=4, max_features=0.5,\n",
" max_leaf_nodes=None, max_samples=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=50, min_samples_split=5,\n",
" min_weight_fraction_leaf=0.0, n_estimators=100,\n",
" n_jobs=-1, oob_score=True, random_state=0, verbose=0,\n",
" warm_start=False)"
]
},
"metadata": {
"tags": []
},
"execution_count": 127
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "eg-K2b3LPd3K",
"colab_type": "code",
"outputId": "0a0a0766-30ae-4806-93bf-034be834690e",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
}
},
"source": [
"print(\"Accuracy on training set: {:.3f}\".format(forest.score(xTrn, train.sentiment)))\n",
"oldscore = forest.oob_score_\n",
"print(f'OOB score is {oldscore*100:.1f}%')\n",
"#print('Out-of-bag score estimate: {:.3}'.format())"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"Accuracy on training set: 0.795\n",
"OOB score is 76.7%\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Tf8FUNBJRtzK",
"colab_type": "code",
"colab": {}
},
"source": [
"import matplotlib.pyplot as plt"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "AvRNn5IZPjxH",
"colab_type": "code",
"outputId": "2ce3834c-ab94-481b-fac1-6b28dc3676ab",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 331
}
},
"source": [
"y_predict = forest.predict(xTest)\n",
"confusion_matrix(test.sentiment, y_predict)\n",
"\n",
"\n",
"cm = confusion_matrix(test.sentiment, y_predict)\n",
"print(\"Confusion matrix:\\n{}\".format(cm))\n",
"\n",
"\n",
"#Show confusion matrix in a separate window\n",
"plt.matshow(cm)\n",
"plt.title('Confusion matrix')\n",
"plt.colorbar()\n",
"plt.ylabel('True label')\n",
"plt.xlabel('Predicted label')\n",
"\n",
"\n",
"fmt = '.2f' #if normalize else 'd'\n",
"plt.show()"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"Confusion matrix:\n",
"[[ 7 13 41]\n",
" [ 0 280 6]\n",
" [ 2 40 64]]\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 288x288 with 2 Axes>"
]
},
"metadata": {
"tags": [],
"needs_background": "light"
}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "lA3dXqDuOsom",
"colab_type": "code",
"outputId": "e560d9b7-145b-4ccd-f423-74ede2b43098",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"source": [
"print(\"Accuracy on test set: {:.3f}\".format(forest.score(xTest, test.sentiment)))"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"Accuracy on test set: 0.775\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Jq4Vqx0g6fK-",
"colab_type": "text"
},
"source": [
"## Predicting with the Model"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "kjwQm2Sryj5M",
"colab_type": "text"
},
"source": [
"Since the model has been trained successfully, we can try it on a test sentence: Magnitude converts the sentence's words into vectors, which we average into a single vector that can be passed directly into the model for inference (prediction)."
]
},
{
"cell_type": "code",
"metadata": {
"id": "n4U11gR9TuxF",
"colab_type": "code",
"outputId": "ee747375-61af-48a9-a5d9-8f08e37c3d81",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"source": [
"test.text.values[0]"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"\"For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .\""
]
},
"metadata": {
"tags": []
},
"execution_count": 159
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "cBJTvJsoT0EP",
"colab_type": "code",
"outputId": "338bac31-c7e5-4d5c-bf8c-88096b6d1067",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"source": [
"x=[np.average(vectors.query(test.text.values[0].split(' ')), axis = 0)]\n",
"forest.predict(x)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([1])"
]
},
"metadata": {
"tags": []
},
"execution_count": 160
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "4Rc-5dxWT7S5",
"colab_type": "code",
"outputId": "aedc3643-de21-4563-bf4f-fda935fcca8b",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"source": [
"test.sentiment.values[0]"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"1"
]
},
"metadata": {
"tags": []
},
"execution_count": 161
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "1kF5TpmrUP_a",
"colab_type": "code",
"colab": {}
},
"source": [
""
],
"execution_count": 0,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment