Last active
December 26, 2019 19:33
-
-
Save akhileshravi/ec8e861903072643ca40a4ddd5358f4c to your computer and use it in GitHub Desktop.
16110007 Assignment 3 NLP
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Name: Akhilesh Ravi | |
| Roll No.: 16110007 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "name": "NLP_Assignment3_16110007", | |
| "provenance": [], | |
| "collapsed_sections": [] | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "9dTzSCUXYQyj", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "c335ef41-cb62-450e-a2d8-4bbfbb599a7d" | |
| }, | |
| "source": [ | |
| "from google.colab import drive\n", | |
| "drive.mount('/content/gdrive')" | |
| ], | |
| "execution_count": 1, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount(\"/content/gdrive\", force_remount=True).\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "YJ_e3TKg9U_N", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "# Location of the assignment data on the mounted Drive.\n", | |
| "path = \"/content/gdrive/My Drive/Semester 7/NLP/Assignment3/\"\n", | |
| "# The corpora contain emoji and other non-ASCII text, so decode explicitly as\n", | |
| "# UTF-8 instead of relying on the platform's default encoding.\n", | |
| "with open(path + \"train.txt\", 'r', encoding='utf-8') as ftrain:\n", | |
| "    train_text = ftrain.read()\n", | |
| "with open(path + \"test.txt\", 'r', encoding='utf-8') as ftest:\n", | |
| "    test_text = ftest.read()" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "IIFVs3aW78NQ", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "import nltk\n", | |
| "# nltk.download('stopwords')\n", | |
| "from nltk.corpus import stopwords\n", | |
| "import re" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "ac5LT9gO8b44", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "help(stopwords)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "zUoUl3RN86fc", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "a6ac02d2-1312-4aff-cfe9-d8ad82ae4f3a" | |
| }, | |
| "source": [ | |
| "# NLTK's English stop-word list, plus the words we want to check against it.\n", | |
| "stopwords_en = stopwords.words('english')\n", | |
| "exclude = ['very', 'not', 'never', 'no', 'ever', 'nothing', 'really', 'extremely']\n", | |
| "# Print the entries of `exclude` that are not stop words, each followed by a space.\n", | |
| "not_stopwords = [word for word in exclude if word not in stopwords_en]\n", | |
| "print(''.join(word + ' ' for word in not_stopwords), end='')" | |
| ], | |
| "execution_count": 5, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "never ever nothing really extremely " | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "2im06gLjauis", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "emojis = ['😂', '❤', '♥', '😍', '😭', '😘', '😊', '👌', '💕', '👏', '😁', '☺', '♡', '👍', '😩', '🙏', '✌', '😏', '😉', '🙌',\n", | |
| " '🙈', '💪', '😄', '😒', '💃', '💖', '😃', '😔', '😱', '🎉', '😜', '☯', '🌸', '💜', '💙', '✨', '😳', '💗', '★',\n", | |
| " '☀', '😡', '😎', '😢', '💋', '😋', '🙊', '😴', '🎶', '💞', '😌']\n", | |
| "# Map each emoji to its position in `emojis`.\n", | |
| "emoji_dict = {symbol: position for position, symbol in enumerate(emojis)}\n", | |
| "# 50 most frequently used emojis from https://www.kaggle.com/thomasseleck/emoji-sentiment-data" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "kb4JWfw9-dvK", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "# Parse the training corpus. Samples are separated by blank lines; the first\n", | |
| "# line of a sample is a header whose 2nd and 3rd whitespace-separated fields\n", | |
| "# are the tweet id and the sentiment label, and every following line is\n", | |
| "# '<token> TAB <language tag>'.\n", | |
| "train_id = []\n", | |
| "train_data = []\n", | |
| "train_hin, train_eng, train_o, train_labels = [], [], [], []\n", | |
| "# emoji_train = []\n", | |
| "for sample in train_text.split('\\n\\n'):\n", | |
| "    lines_sample = sample.split('\\n')\n", | |
| "    header = lines_sample[0].split()\n", | |
| "    if len(header) < 3:\n", | |
| "        # Header without id/label (e.g. an empty trailing chunk). Nothing has\n", | |
| "        # been recorded for this sample yet, so there is nothing to undo.\n", | |
| "        continue\n", | |
| "\n", | |
| "    temp, temp_eng, temp_hin, temp_o = [], [], [], []\n", | |
| "    for line in lines_sample[1:]:\n", | |
| "        t = line.split('\\t')\n", | |
| "        if len(t) < 2:\n", | |
| "            # No language tag on this line, so the token cannot be classified.\n", | |
| "            continue\n", | |
| "        # Lower-case before the stop-word test, exactly as the test-set cell does,\n", | |
| "        # so both splits are filtered identically.\n", | |
| "        t[0] = t[0].lower()\n", | |
| "        if t[1] != 'O':\n", | |
| "            # Strip non-word characters and underscores from language-tagged tokens.\n", | |
| "            t[0] = re.sub(r'[\\W_]+', '', t[0])\n", | |
| "        if t[1] == 'Eng' and t[0] in stopwords_en and t[0] not in exclude:\n", | |
| "            continue\n", | |
| "        if 'http' in t[0]:\n", | |
| "            continue\n", | |
| "        temp.append(t[0])\n", | |
| "        if t[1] == 'Eng':\n", | |
| "            temp_eng.append(t[0])\n", | |
| "        elif t[1] == 'Hin':\n", | |
| "            temp_hin.append(t[0])\n", | |
| "        elif t[1] == 'O':\n", | |
| "            temp_o.append(t[0])\n", | |
| "    if not temp:\n", | |
| "        continue\n", | |
| "\n", | |
| "    # Record id and label only for samples that are kept, so every list stays\n", | |
| "    # index-aligned with train_data.\n", | |
| "    train_id.append(header[1])\n", | |
| "    train_labels.append(header[2])\n", | |
| "    train_data.append(temp)\n", | |
| "    train_eng.append(temp_eng)\n", | |
| "    train_hin.append(temp_hin)\n", | |
| "    train_o.append(temp_o)\n" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "Vgrdip_g_Fej", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "bf6b5e72-10b6-4075-9f93-49a79dace711" | |
| }, | |
| "source": [ | |
| "print(len(train_text.split('\\n\\n')))" | |
| ], | |
| "execution_count": 8, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "15132\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "fZu39o6-0L2i", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "a2488ce9-89d1-4b49-c9d9-df756a66352c" | |
| }, | |
| "source": [ | |
| "len(train_data)" | |
| ], | |
| "execution_count": 9, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "15131" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 9 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "X3elTZuw0gQb", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "# Parse the test corpus. Samples are separated by blank lines; the first line\n", | |
| "# of a sample is a header whose 2nd and 3rd whitespace-separated fields are the\n", | |
| "# tweet id and the sentiment label, and every following line is\n", | |
| "# '<token> TAB <language tag>'.\n", | |
| "test_id = []\n", | |
| "test_data = []\n", | |
| "test_hin, test_eng, test_o, test_labels = [], [], [], []\n", | |
| "\n", | |
| "for sample in test_text.split('\\n\\n'):\n", | |
| "    lines_sample = sample.split('\\n')\n", | |
| "    header = lines_sample[0].split()\n", | |
| "    if len(header) < 3:\n", | |
| "        # Header without id/label (e.g. an empty trailing chunk). Nothing has\n", | |
| "        # been recorded for this sample yet, so there is nothing to undo.\n", | |
| "        continue\n", | |
| "\n", | |
| "    temp, temp_eng, temp_hin, temp_o = [], [], [], []\n", | |
| "    for line in lines_sample[1:]:\n", | |
| "        t = line.split('\\t')\n", | |
| "        if len(t) < 2:\n", | |
| "            # No language tag on this line, so the token cannot be classified.\n", | |
| "            continue\n", | |
| "        t[0] = t[0].lower()\n", | |
| "        if t[1] != 'O':\n", | |
| "            # Strip non-word characters and underscores from language-tagged tokens.\n", | |
| "            t[0] = re.sub(r'[\\W_]+', '', t[0])\n", | |
| "        if t[1] == 'Eng' and t[0] in stopwords_en and t[0] not in exclude:\n", | |
| "            continue\n", | |
| "        if 'http' in t[0]:\n", | |
| "            continue\n", | |
| "        temp.append(t[0])\n", | |
| "        if t[1] == 'Eng':\n", | |
| "            temp_eng.append(t[0])\n", | |
| "        elif t[1] == 'Hin':\n", | |
| "            temp_hin.append(t[0])\n", | |
| "        elif t[1] == 'O':\n", | |
| "            temp_o.append(t[0])\n", | |
| "    if not temp:\n", | |
| "        continue\n", | |
| "\n", | |
| "    # Record id and label only for samples that are kept, so every list stays\n", | |
| "    # index-aligned with test_data.\n", | |
| "    test_id.append(header[1])\n", | |
| "    test_labels.append(header[2])\n", | |
| "    test_data.append(temp)\n", | |
| "    test_eng.append(temp_eng)\n", | |
| "    test_hin.append(temp_hin)\n", | |
| "    test_o.append(temp_o)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "KoLNJ4VA0wCF", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "train_tweets = [' '.join(i) for i in train_data]\n", | |
| "test_tweets = [' '.join(i) for i in test_data]" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "DA0RtaGc36nz", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "train_tweets_dict = {}\n", | |
| "test_tweets_dict = {}" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "3wseReG-4oib", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "train_tweets_dict['eng'] = [' '.join(i) for i in train_eng]\n", | |
| "test_tweets_dict['eng'] = [' '.join(i) for i in test_eng]" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "uC29oWZv36k9", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "train_tweets_dict['hin'] = [' '.join(i) for i in train_hin]\n", | |
| "test_tweets_dict['hin'] = [' '.join(i) for i in test_hin]" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "5K_BLtFW38SK", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "train_tweets_dict['o'] = [' '.join(i) for i in train_o]\n", | |
| "test_tweets_dict['o'] = [' '.join(i) for i in test_o]" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "sNh20-xu2Yq7", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "7fc16ef6-f5ce-4b31-afca-1858dc7d81b2" | |
| }, | |
| "source": [ | |
| "# numpy is imported here because this cell comes before the notebook's main\n", | |
| "# import cell; without it a fresh top-to-bottom run fails with a NameError.\n", | |
| "import numpy as np\n", | |
| "\n", | |
| "# Distinct sentiment labels present in the training set\n", | |
| "np.unique(train_labels)" | |
| ], | |
| "execution_count": 122, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "array(['negative', 'neutral', 'positive'], dtype='<U8')" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 122 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "n3u6_cLo07yx", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 81 | |
| }, | |
| "outputId": "13537e4a-5ddc-461a-a1db-ed66b752f647" | |
| }, | |
| "source": [ | |
| "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n", | |
| "from keras.preprocessing.text import Tokenizer\n", | |
| "from sklearn.model_selection import train_test_split\n", | |
| "import pandas as pd\n", | |
| "import random\n", | |
| "import numpy as np\n", | |
| "from keras.preprocessing import sequence\n", | |
| "from keras.utils import np_utils\n", | |
| "\n", | |
| "from keras.models import Sequential\n", | |
| "from keras.layers.core import Dense, Dropout, Activation, Lambda\n", | |
| "from keras.layers.embeddings import Embedding\n", | |
| "from keras.layers.recurrent import LSTM, SimpleRNN, GRU\n", | |
| "from keras.preprocessing.text import Tokenizer\n", | |
| "from keras import optimizers" | |
| ], | |
| "execution_count": 22, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Using TensorFlow backend.\n" | |
| ], | |
| "name": "stderr" | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<p style=\"color: red;\">\n", | |
| "The default version of TensorFlow in Colab will soon switch to TensorFlow 2.x.<br>\n", | |
| "We recommend you <a href=\"https://www.tensorflow.org/guide/migrate\" target=\"_blank\">upgrade</a> now \n", | |
| "or ensure your notebook will continue to use TensorFlow 1.x via the <code>%tensorflow_version 1.x</code> magic:\n", | |
| "<a href=\"https://colab.research.google.com/notebooks/tensorflow_version.ipynb\" target=\"_blank\">more info</a>.</p>\n" | |
| ], | |
| "text/plain": [ | |
| "<IPython.core.display.HTML object>" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| } | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "ZJGGWOMY2cBR", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "label_values = {'negative':0, 'neutral':1, 'positive':2}" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "OKygtMNS2KVn", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "y_train = np.array([label_values[i] for i in train_labels])\n", | |
| "y_test = np.array([label_values[i] for i in test_labels])" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "CDe4kCvp76Ez", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "WbKqX45L1K9p", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "max_features = 20000\n", | |
| "tokenizer1 = Tokenizer(num_words=max_features)\n", | |
| "tokenizer1.fit_on_texts(train_tweets)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "kqUXn83O1i0V", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 52 | |
| }, | |
| "outputId": "5c7ed4a4-f86f-491b-dc28-794d11738843" | |
| }, | |
| "source": [ | |
| "# Fixed sequence length used for padding, and the number of sentiment classes\n", | |
| "max_len = 250\n", | |
| "num_classes = 3\n", | |
| "\n", | |
| "# Tweets -> token-id sequences -> matrices padded to max_len\n", | |
| "sequences_train = tokenizer1.texts_to_sequences(train_tweets)\n", | |
| "X_train = sequence.pad_sequences(sequences_train, maxlen=max_len)\n", | |
| "\n", | |
| "sequences_test = tokenizer1.texts_to_sequences(test_tweets)\n", | |
| "X_test = sequence.pad_sequences(sequences_test, maxlen=max_len)\n", | |
| "\n", | |
| "# One-hot encode the integer class labels\n", | |
| "Y_train, Y_test = (\n", | |
| "    np_utils.to_categorical(labels, num_classes) for labels in (y_train, y_test)\n", | |
| ")\n", | |
| "\n", | |
| "print('X_train shape:', X_train.shape)\n", | |
| "print('X_test shape:', X_test.shape)" | |
| ], | |
| "execution_count": 28, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "X_train shape: (15131, 250)\n", | |
| "X_test shape: (1869, 250)\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "AA6THqVE3Elf", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "input_dim = X_train.shape[1]" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "nQJDXF6T22KB", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "# Feed-forward baseline on the padded token-id matrix: three hidden Dense\n", | |
| "# layers (input_dim, 200, 100 units), each followed by dropout 0.2 and an\n", | |
| "# activation, then a 3-way softmax.\n", | |
| "input_dim = X_train.shape[1]\n", | |
| "model1 = Sequential()\n", | |
| "\n", | |
| "model1.add(Dense(input_dim))\n", | |
| "model1.add(Dropout(0.2))\n", | |
| "model1.add(Activation('relu'))\n", | |
| "model1.add(Dense(200))\n", | |
| "model1.add(Dropout(0.2))\n", | |
| "model1.add(Activation('tanh'))\n", | |
| "model1.add(Dense(100))\n", | |
| "model1.add(Dropout(0.2))\n", | |
| "model1.add(Activation('sigmoid'))\n", | |
| "model1.add(Dense(3))\n", | |
| "model1.add(Activation('softmax'))\n", | |
| "\n", | |
| "adam = optimizers.Adam(lr=0.01, decay=1e-6)\n", | |
| "\n", | |
| "# The targets are one-hot vectors over three mutually exclusive classes and the\n", | |
| "# output layer is a softmax, so the matching loss is categorical cross-entropy.\n", | |
| "# With 'binary_crossentropy' Keras also resolves the 'accuracy' metric to\n", | |
| "# per-output binary accuracy, which overstates the multi-class accuracy.\n", | |
| "model1.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "Cz5Qg-hI2rk_", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 69 | |
| }, | |
| "outputId": "f121d048-6d4b-419d-82de-a50d24ea7cce" | |
| }, | |
| "source": [ | |
| "model1.fit(X_train, Y_train, batch_size = 256, epochs=1)" | |
| ], | |
| "execution_count": 34, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Epoch 1/1\n", | |
| "15131/15131 [==============================] - 1s 70us/step - loss: 0.6535 - acc: 0.6507\n" | |
| ], | |
| "name": "stdout" | |
| }, | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "<keras.callbacks.History at 0x7f0f220a2c18>" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 34 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "IUh9VqF13a8q", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "81e6d0c1-ab8f-46ea-ae70-cc547f187fa5" | |
| }, | |
| "source": [ | |
| "# Predicted class = index of the largest softmax output for each test row\n", | |
| "class_probs = model1.predict(X_test, verbose=0)\n", | |
| "preds = class_probs.argmax(axis=-1)\n", | |
| "# Fraction of test samples whose predicted class matches the label\n", | |
| "n_correct = np.sum(preds == y_test)\n", | |
| "n_correct / len(y_test)" | |
| ], | |
| "execution_count": 35, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "0.4002140181915463" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 35 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "ILc7YMLjJsaF", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 538 | |
| }, | |
| "outputId": "8d5c648b-16cf-4850-b3fd-875fa4a5c1e4" | |
| }, | |
| "source": [ | |
| "model1.summary()" | |
| ], | |
| "execution_count": 36, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Model: \"sequential_2\"\n", | |
| "_________________________________________________________________\n", | |
| "Layer (type) Output Shape Param # \n", | |
| "=================================================================\n", | |
| "dense_5 (Dense) (None, 250) 62750 \n", | |
| "_________________________________________________________________\n", | |
| "dropout_4 (Dropout) (None, 250) 0 \n", | |
| "_________________________________________________________________\n", | |
| "activation_5 (Activation) (None, 250) 0 \n", | |
| "_________________________________________________________________\n", | |
| "dense_6 (Dense) (None, 200) 50200 \n", | |
| "_________________________________________________________________\n", | |
| "dropout_5 (Dropout) (None, 200) 0 \n", | |
| "_________________________________________________________________\n", | |
| "activation_6 (Activation) (None, 200) 0 \n", | |
| "_________________________________________________________________\n", | |
| "dense_7 (Dense) (None, 100) 20100 \n", | |
| "_________________________________________________________________\n", | |
| "dropout_6 (Dropout) (None, 100) 0 \n", | |
| "_________________________________________________________________\n", | |
| "activation_7 (Activation) (None, 100) 0 \n", | |
| "_________________________________________________________________\n", | |
| "dense_8 (Dense) (None, 3) 303 \n", | |
| "_________________________________________________________________\n", | |
| "activation_8 (Activation) (None, 3) 0 \n", | |
| "=================================================================\n", | |
| "Total params: 133,353\n", | |
| "Trainable params: 133,353\n", | |
| "Non-trainable params: 0\n", | |
| "_________________________________________________________________\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "8fDdsocU4zd4", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "# One tokenizer per language tag, each fitted on that tag's training text only\n", | |
| "tokenizers = {}\n", | |
| "for v in ['eng', 'hin', 'o']:\n", | |
| "    tag_tokenizer = Tokenizer(num_words=max_features)\n", | |
| "    tag_tokenizer.fit_on_texts(train_tweets_dict[v])\n", | |
| "    tokenizers[v] = tag_tokenizer" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "gp4HxqeK5NHm", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 52 | |
| }, | |
| "outputId": "abb433b3-1990-439f-b712-331f6a101cd5" | |
| }, | |
| "source": [ | |
| "lang_tags = ['eng', 'hin', 'o']\n", | |
| "sequences_train_dict, sequences_test_dict = {}, {}\n", | |
| "X_train_dict, X_test_dict = {}, {}\n", | |
| "\n", | |
| "# NOTE(review): each per-language tokenizer is applied to the full tweets\n", | |
| "# (train_tweets / test_tweets), not to train_tweets_dict[v] / test_tweets_dict[v]\n", | |
| "# -- confirm this is intended.\n", | |
| "for v in lang_tags:\n", | |
| "    tag_tokenizer = tokenizers[v]\n", | |
| "    train_seqs = tag_tokenizer.texts_to_sequences(train_tweets)\n", | |
| "    test_seqs = tag_tokenizer.texts_to_sequences(test_tweets)\n", | |
| "    sequences_train_dict[v] = train_seqs\n", | |
| "    sequences_test_dict[v] = test_seqs\n", | |
| "\n", | |
| "    X_train_dict[v] = sequence.pad_sequences(train_seqs, maxlen=max_len)\n", | |
| "    X_test_dict[v] = sequence.pad_sequences(test_seqs, maxlen=max_len)\n", | |
| "\n", | |
| "# Concatenate column-wise: all-token features first, then one block per language tag\n", | |
| "X_train2 = np.hstack([X_train] + [X_train_dict[v] for v in lang_tags])\n", | |
| "X_test2 = np.hstack([X_test] + [X_test_dict[v] for v in lang_tags])\n", | |
| "\n", | |
| "print('X_train2 shape:', X_train2.shape)\n", | |
| "print('X_test2 shape:', X_test2.shape)" | |
| ], | |
| "execution_count": 38, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "X_train2 shape: (15131, 1000)\n", | |
| "X_test2 shape: (1869, 1000)\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "q01Pe0BFXwXl", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "# sum(np.sum(emoji_train, axis=0) > 0)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "w66AmyT36fNY", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "# Same feed-forward architecture as model1, trained on the concatenated\n", | |
| "# all-token + per-language feature matrix X_train2.\n", | |
| "input_dim2 = X_train2.shape[1]\n", | |
| "model2 = Sequential()\n", | |
| "\n", | |
| "# NOTE(review): the first layer's width is input_dim (set in an earlier cell),\n", | |
| "# not input_dim2 -- confirm this is intended.\n", | |
| "model2.add(Dense(input_dim))\n", | |
| "model2.add(Dropout(0.2))\n", | |
| "model2.add(Activation('relu'))\n", | |
| "model2.add(Dense(200))\n", | |
| "model2.add(Dropout(0.2))\n", | |
| "model2.add(Activation('tanh'))\n", | |
| "model2.add(Dense(100))\n", | |
| "model2.add(Dropout(0.2))\n", | |
| "model2.add(Activation('sigmoid'))\n", | |
| "model2.add(Dense(3))\n", | |
| "model2.add(Activation('softmax'))\n", | |
| "\n", | |
| "adam = optimizers.Adam(lr=0.001, decay=1e-6)\n", | |
| "\n", | |
| "# The targets are one-hot vectors over three mutually exclusive classes and the\n", | |
| "# output layer is a softmax, so the matching loss is categorical cross-entropy.\n", | |
| "# With 'binary_crossentropy' Keras also resolves the 'accuracy' metric to\n", | |
| "# per-output binary accuracy, which overstates the multi-class accuracy.\n", | |
| "model2.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "k__XRSio68V4", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 69 | |
| }, | |
| "outputId": "ebcb5fc3-d773-4e93-f679-f244b84c78c4" | |
| }, | |
| "source": [ | |
| "model2.fit(X_train2, Y_train, batch_size = 256, epochs=1)" | |
| ], | |
| "execution_count": 42, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Epoch 1/1\n", | |
| "15131/15131 [==============================] - 2s 113us/step - loss: 0.6664 - acc: 0.6436\n" | |
| ], | |
| "name": "stdout" | |
| }, | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "<keras.callbacks.History at 0x7f0f1a1f4cf8>" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 42 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "sOXvqSjh7DO1", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "199fef9a-949b-421c-bce9-45a689ee336d" | |
| }, | |
| "source": [ | |
| "# Predicted class = index of the largest softmax output for each test row\n", | |
| "class_probs2 = model2.predict(X_test2, verbose=0)\n", | |
| "preds2 = class_probs2.argmax(axis=-1)\n", | |
| "# Fraction of test samples whose predicted class matches the label\n", | |
| "n_correct2 = np.sum(preds2 == y_test)\n", | |
| "n_correct2 / len(y_test)" | |
| ], | |
| "execution_count": 43, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "0.4071696094168004" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 43 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "6pAujGiH7T0N", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "82579038-25f0-459f-d89e-3ef10e10a1d0" | |
| }, | |
| "source": [ | |
| "preds2[:10]" | |
| ], | |
| "execution_count": 44, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 44 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "SmkqnBdkLMr6", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 52 | |
| }, | |
| "outputId": "6077f179-a37a-4102-c20d-c72029482ebe" | |
| }, | |
| "source": [ | |
| "def _emoji_counts(text, n_rows):\n", | |
| "    \"\"\"Count how often each tracked emoji occurs in every sample of a corpus.\n", | |
| "\n", | |
| "    text:   raw corpus text; samples are separated by blank lines and the first\n", | |
| "            line of each sample is its header.\n", | |
| "    n_rows: number of rows to allocate (one per sample).\n", | |
| "\n", | |
| "    Returns a float array of shape (n_rows, len(emojis)) in which column j\n", | |
| "    holds the number of occurrences of emojis[j].\n", | |
| "    \"\"\"\n", | |
| "    counts = np.zeros((n_rows, len(emojis)))\n", | |
| "    row = 0\n", | |
| "    for sample in text.split('\\n\\n'):\n", | |
| "        lines_sample = sample.split('\\n')\n", | |
| "        # Skip samples whose header lacks the id/label fields.\n", | |
| "        if len(lines_sample[0].split()) < 3:\n", | |
| "            continue\n", | |
| "        for line in lines_sample[1:]:\n", | |
| "            token = line.split('\\t')[0]\n", | |
| "            for ch in token:\n", | |
| "                # Dict lookup instead of scanning the emoji list per character.\n", | |
| "                column = emoji_dict.get(ch)\n", | |
| "                if column is not None:\n", | |
| "                    counts[row][column] += 1\n", | |
| "        row += 1\n", | |
| "    return counts\n", | |
| "\n", | |
| "\n", | |
| "# NOTE(review): rows are numbered by counting samples with a valid header. This\n", | |
| "# assumes the parsing cells kept every such sample, so that rows line up with\n", | |
| "# X_train / X_test -- confirm.\n", | |
| "emoji_train = _emoji_counts(train_text, X_train.shape[0])\n", | |
| "emoji_test = _emoji_counts(test_text, X_test.shape[0])\n", | |
| "\n", | |
| "# Append the emoji-count columns to the padded token-id features\n", | |
| "X_train3 = np.hstack((X_train, emoji_train))\n", | |
| "X_test3 = np.hstack((X_test, emoji_test))\n", | |
| "\n", | |
| "print('X_train3 shape:', X_train3.shape)\n", | |
| "print('X_test3 shape:', X_test3.shape)" | |
| ], | |
| "execution_count": 45, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "X_train3 shape: (15131, 300)\n", | |
| "X_test3 shape: (1869, 300)\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "vH7e1na5aZT4", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "7dfc1f1e-31aa-4644-8640-ada6a9a61b1d" | |
| }, | |
| "source": [ | |
| "sum(np.sum(emoji_train, axis=1)>0)" | |
| ], | |
| "execution_count": 46, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "2112" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 46 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "K7X1G-3VOwnT", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "# Same feed-forward architecture as model1, trained on the token-id features\n", | |
| "# plus the emoji-count columns (X_train3).\n", | |
| "input_dim3 = X_train3.shape[1]\n", | |
| "model3 = Sequential()\n", | |
| "\n", | |
| "# NOTE(review): the first layer's width is input_dim (set in an earlier cell),\n", | |
| "# not input_dim3 -- confirm this is intended.\n", | |
| "model3.add(Dense(input_dim))\n", | |
| "model3.add(Dropout(0.2))\n", | |
| "model3.add(Activation('relu'))\n", | |
| "model3.add(Dense(200))\n", | |
| "model3.add(Dropout(0.2))\n", | |
| "model3.add(Activation('tanh'))\n", | |
| "model3.add(Dense(100))\n", | |
| "model3.add(Dropout(0.2))\n", | |
| "model3.add(Activation('sigmoid'))\n", | |
| "model3.add(Dense(3))\n", | |
| "model3.add(Activation('softmax'))\n", | |
| "\n", | |
| "adam = optimizers.Adam(lr=0.001, decay=1e-6)\n", | |
| "\n", | |
| "# The targets are one-hot vectors over three mutually exclusive classes and the\n", | |
| "# output layer is a softmax, so the matching loss is categorical cross-entropy.\n", | |
| "# With 'binary_crossentropy' Keras also resolves the 'accuracy' metric to\n", | |
| "# per-output binary accuracy, which overstates the multi-class accuracy.\n", | |
| "model3.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "sXYYCFNDPCRI", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 173 | |
| }, | |
| "outputId": "fe744d46-436c-4319-ae93-90adffb4572a" | |
| }, | |
| "source": [ | |
| "model3.fit(X_train3, Y_train, batch_size = 512, epochs=4)" | |
| ], | |
| "execution_count": 60, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Epoch 1/4\n", | |
| "15131/15131 [==============================] - 2s 115us/step - loss: 0.7066 - acc: 0.6207\n", | |
| "Epoch 2/4\n", | |
| "15131/15131 [==============================] - 1s 38us/step - loss: 0.6396 - acc: 0.6603\n", | |
| "Epoch 3/4\n", | |
| "15131/15131 [==============================] - 1s 38us/step - loss: 0.6359 - acc: 0.6626\n", | |
| "Epoch 4/4\n", | |
| "15131/15131 [==============================] - 1s 38us/step - loss: 0.6343 - acc: 0.6631\n" | |
| ], | |
| "name": "stdout" | |
| }, | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "<keras.callbacks.History at 0x7f0f19749be0>" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 60 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "ETjxCbb8PHMl", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "bc8196c9-d391-4208-ac33-1e8aaf325677" | |
| }, | |
| "source": [ | |
| "# Predicted class = index of the largest softmax output for each test row\n", | |
| "class_probs3 = model3.predict(X_test3, verbose=0)\n", | |
| "preds3 = class_probs3.argmax(axis=-1)\n", | |
| "# Fraction of test samples whose predicted class matches the label\n", | |
| "n_correct3 = np.sum(preds3 == y_test)\n", | |
| "n_correct3 / len(y_test)" | |
| ], | |
| "execution_count": 61, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "0.32691278758694486" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 61 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "FGYJbkjcYtSn", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "ede4745b-1786-47fd-b332-b33b39553ca7" | |
| }, | |
| "source": [ | |
| "sum(sum(emoji_train))" | |
| ], | |
| "execution_count": 62, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "5102.0" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 62 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "8lGOcNgnPNKM", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "b799dff6-1721-418d-b43a-017a0c0e269c" | |
| }, | |
| "source": [ | |
| "X_train.shape" | |
| ], | |
| "execution_count": 149, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "(15131, 250)" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 149 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "avVAaVEX3l1I", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "from sklearn.tree import DecisionTreeClassifier" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "oBhhwjWO3pvX", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 121 | |
| }, | |
| "outputId": "1f996a56-f2af-4c3f-8f73-ff032c845a49" | |
| }, | |
| "source": [ | |
| "dtree1 = DecisionTreeClassifier()\n", | |
| "dtree1.fit(X_train, Y_train)" | |
| ], | |
| "execution_count": 65, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n", | |
| " max_features=None, max_leaf_nodes=None,\n", | |
| " min_impurity_decrease=0.0, min_impurity_split=None,\n", | |
| " min_samples_leaf=1, min_samples_split=2,\n", | |
| " min_weight_fraction_leaf=0.0, presort=False,\n", | |
| " random_state=None, splitter='best')" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 65 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "l5rwDIWGKi7T", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "3a3a69e5-afea-456d-f21e-4d02da6c0a9f" | |
| }, | |
| "source": [ | |
| "predsd1 = dtree1.predict(X_test)\n", | |
| "predsd1 = np.argmax(predsd1, axis=1)\n", | |
| "np.sum(predsd1==y_test)/len(y_test)" | |
| ], | |
| "execution_count": 66, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "0.3911182450508293" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 66 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "T2eJDbzBK9fq", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 121 | |
| }, | |
| "outputId": "3e561970-f213-4824-87a6-a23fab886df6" | |
| }, | |
| "source": [ | |
| "dtree2 = DecisionTreeClassifier()\n", | |
| "dtree2.fit(X_train2, Y_train)" | |
| ], | |
| "execution_count": 67, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n", | |
| " max_features=None, max_leaf_nodes=None,\n", | |
| " min_impurity_decrease=0.0, min_impurity_split=None,\n", | |
| " min_samples_leaf=1, min_samples_split=2,\n", | |
| " min_weight_fraction_leaf=0.0, presort=False,\n", | |
| " random_state=None, splitter='best')" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 67 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "2IPi6lqNK8Ct", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "c63065ce-d635-4d59-a87e-605dd94c86c8" | |
| }, | |
| "source": [ | |
| "predsd2 = dtree2.predict(X_test2)\n", | |
| "predsd2 = np.argmax(predsd2, axis=1)\n", | |
| "np.sum(predsd2==y_test)/len(y_test)" | |
| ], | |
| "execution_count": 68, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "0.38095238095238093" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 68 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "CKjk91n8P0Ju", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 121 | |
| }, | |
| "outputId": "963755c1-6807-4845-9db5-b43eb9f1a1d1" | |
| }, | |
| "source": [ | |
| "dtree3 = DecisionTreeClassifier()\n", | |
| "dtree3.fit(X_train3, Y_train)" | |
| ], | |
| "execution_count": 69, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n", | |
| " max_features=None, max_leaf_nodes=None,\n", | |
| " min_impurity_decrease=0.0, min_impurity_split=None,\n", | |
| " min_samples_leaf=1, min_samples_split=2,\n", | |
| " min_weight_fraction_leaf=0.0, presort=False,\n", | |
| " random_state=None, splitter='best')" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 69 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "Q7R4-ZYoP2_L", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "c7aa9058-d994-4adb-c4f2-1303c84effda" | |
| }, | |
| "source": [ | |
| "predsd3 = dtree3.predict(X_test3)\n", | |
| "predsd3 = np.argmax(predsd3, axis=1)\n", | |
| "np.sum(predsd3==y_test)/len(y_test)" | |
| ], | |
| "execution_count": 70, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "0.39058319957196364" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 70 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "f8grLdKPQRGN", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "876bd815-1b44-443d-c8fb-00440ae0cc5d" | |
| }, | |
| "source": [ | |
| "a = X_test3[(np.sum(emoji_test, axis=1)>0)]\n", | |
| "p = y_test[(np.sum(emoji_test, axis=1)>0)]\n", | |
| "predsd4 = dtree3.predict(a)\n", | |
| "predsd4 = np.argmax(predsd4, axis=1)\n", | |
| "np.sum(predsd4==p)/len(p)" | |
| ], | |
| "execution_count": 78, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "0.4262295081967213" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 78 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "4A1riiirhQ5p", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "from keras.layers.normalization import BatchNormalization\n", | |
| "from keras.layers import SpatialDropout1D\n", | |
| "from keras.models import Model\n", | |
| "from keras.layers import Input,Flatten, Dense, Embedding, RNN, Conv1D, BatchNormalization, MaxPooling1D, Activation, Dropout, concatenate, Lambda\n", | |
| "from keras import optimizers\n", | |
| "from keras.layers.convolutional import Convolution1D\n", | |
| "from keras import backend as K" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "gqCJcTt9hEoD", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 72 | |
| }, | |
| "outputId": "8b529ddd-651f-4f8e-d8c6-f3e96cee1811" | |
| }, | |
| "source": [ | |
| "nb_filter = 300\n", | |
| "filter_length = 3\n", | |
| "hidden_dims = 300 # 250\n", | |
| "nb_epoch = 2\n", | |
| "\n", | |
| "\n", | |
| "cmodel1 = Sequential()\n", | |
| "cmodel1.add(Embedding(max_features, 300))\n", | |
| "cmodel1.add(SpatialDropout1D(0.2))\n", | |
| "# we add a Convolution1D, which will learn nb_filter\n", | |
| "# word group filters of size filter_length:\n", | |
| "cmodel1.add(Convolution1D(nb_filter=nb_filter,\n", | |
| " filter_length=filter_length,\n", | |
| " border_mode='valid',\n", | |
| " activation='tanh',\n", | |
| " subsample_length=1))\n", | |
| "\n", | |
| "#cmodel1.add(BatchNormalization())\n", | |
| "from keras import optimizers\n", | |
| "def max_1d(X):\n", | |
| " return K.max(X, axis=1)\n", | |
| "\n", | |
| "cmodel1.add(Lambda(max_1d, output_shape=(nb_filter,)))\n", | |
| "cmodel1.add(Dense(hidden_dims))\n", | |
| "cmodel1.add(Dropout(0.2))\n", | |
| "cmodel1.add(Activation('relu'))\n", | |
| "cmodel1.add(Dense(num_classes))\n", | |
| "cmodel1.add(Activation('sigmoid'))\n", | |
| "adam = optimizers.Adam(lr=0.001, decay=1e-6)\n", | |
| "cmodel1.compile(loss='binary_crossentropy',\n", | |
| " optimizer=adam,\n", | |
| " metrics=['accuracy'])" | |
| ], | |
| "execution_count": 106, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:16: UserWarning: Update your `Conv1D` call to the Keras 2 API: `Conv1D(activation=\"tanh\", filters=300, kernel_size=3, strides=1, padding=\"valid\")`\n", | |
| " app.launch_new_instance()\n" | |
| ], | |
| "name": "stderr" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "5SIgsZezhsJv", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 69 | |
| }, | |
| "outputId": "d31955e9-b6dd-459f-a3ec-51f9fbd2f776" | |
| }, | |
| "source": [ | |
| "cmodel1.fit(X_train3, Y_train, epochs = 1)" | |
| ], | |
| "execution_count": 107, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Epoch 1/1\n", | |
| "15131/15131 [==============================] - 237s 16ms/step - loss: 0.5342 - acc: 0.7211\n" | |
| ], | |
| "name": "stdout" | |
| }, | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "<keras.callbacks.History at 0x7f0f16bf6748>" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 107 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "bPB2nA7wixq8", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "ae705a5a-4024-42d4-90b1-1460caaa1ea2" | |
| }, | |
| "source": [ | |
| "predsc1 = cmodel1.predict_classes(X_test3, verbose=0)\n", | |
| "np.sum(predsc1==y_test)/len(y_test)" | |
| ], | |
| "execution_count": 109, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "0.565008025682183" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 109 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "cFj06lng5S12", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "f9b8a66d-bb98-47a2-dd14-e02571cd691b" | |
| }, | |
| "source": [ | |
| "prf(y_test, predsc1, average='micro')" | |
| ], | |
| "execution_count": 167, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "(0.565008025682183, 0.565008025682183, 0.565008025682183, None)" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 167 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "OV8yft9S5n2p", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "8463406e-a582-4ac1-d737-62a6a45dee8f" | |
| }, | |
| "source": [ | |
| "prf(y_test, predsc1, average='macro')" | |
| ], | |
| "execution_count": 168, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "(0.5659138903963613, 0.5840501910447199, 0.5662953882918141, None)" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 168 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "LFglzGiw5rVT", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "d8262ee5-fb84-4959-a23d-3245894fa599" | |
| }, | |
| "source": [ | |
| "prf(y_test, predsc1, average='weighted')" | |
| ], | |
| "execution_count": 169, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "(0.5675574871168725, 0.565008025682183, 0.5569762553083624, None)" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 169 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "iKXK20SYjOZj", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 469 | |
| }, | |
| "outputId": "df05a7c1-d2ac-45a9-bec8-44a0b7673c43" | |
| }, | |
| "source": [ | |
| "cmodel1.summary()" | |
| ], | |
| "execution_count": 108, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Model: \"sequential_20\"\n", | |
| "_________________________________________________________________\n", | |
| "Layer (type) Output Shape Param # \n", | |
| "=================================================================\n", | |
| "embedding_12 (Embedding) (None, None, 300) 6000000 \n", | |
| "_________________________________________________________________\n", | |
| "spatial_dropout1d_11 (Spatia (None, None, 300) 0 \n", | |
| "_________________________________________________________________\n", | |
| "conv1d_9 (Conv1D) (None, None, 300) 270300 \n", | |
| "_________________________________________________________________\n", | |
| "lambda_9 (Lambda) (None, 300) 0 \n", | |
| "_________________________________________________________________\n", | |
| "dense_46 (Dense) (None, 300) 90300 \n", | |
| "_________________________________________________________________\n", | |
| "dropout_32 (Dropout) (None, 300) 0 \n", | |
| "_________________________________________________________________\n", | |
| "activation_46 (Activation) (None, 300) 0 \n", | |
| "_________________________________________________________________\n", | |
| "dense_47 (Dense) (None, 3) 903 \n", | |
| "_________________________________________________________________\n", | |
| "activation_47 (Activation) (None, 3) 0 \n", | |
| "=================================================================\n", | |
| "Total params: 6,361,503\n", | |
| "Trainable params: 6,361,503\n", | |
| "Non-trainable params: 0\n", | |
| "_________________________________________________________________\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "NWkJByaVs_ow", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 72 | |
| }, | |
| "outputId": "351f49b0-1849-47cc-ba7f-c09ab01aef60" | |
| }, | |
| "source": [ | |
| "nb_filter = 300\n", | |
| "filter_length = 3\n", | |
| "hidden_dims = 300 # 250\n", | |
| "nb_epoch = 2\n", | |
| "\n", | |
| "\n", | |
| "cmodel1a = Sequential()\n", | |
| "cmodel1a.add(Embedding(max_features, 300))\n", | |
| "cmodel1a.add(SpatialDropout1D(0.2))\n", | |
| "# we add a Convolution1D, which will learn nb_filter\n", | |
| "# word group filters of size filter_length:\n", | |
| "cmodel1a.add(Convolution1D(nb_filter=nb_filter,\n", | |
| " filter_length=filter_length,\n", | |
| " border_mode='valid',\n", | |
| " activation='tanh',\n", | |
| " subsample_length=1))\n", | |
| "\n", | |
| "cmodel1a.add(Lambda(max_1d, output_shape=(nb_filter,)))\n", | |
| "cmodel1a.add(Dense(hidden_dims))\n", | |
| "cmodel1a.add(Dropout(0.2))\n", | |
| "cmodel1a.add(Activation('relu'))\n", | |
| "cmodel1a.add(Dense(num_classes))\n", | |
| "cmodel1a.add(Activation('sigmoid'))\n", | |
| "adam = optimizers.Adam(lr=0.001, decay=1e-6)\n", | |
| "cmodel1a.compile(loss='binary_crossentropy',\n", | |
| " optimizer=adam,\n", | |
| " metrics=['accuracy'])" | |
| ], | |
| "execution_count": 145, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:16: UserWarning: Update your `Conv1D` call to the Keras 2 API: `Conv1D(activation=\"tanh\", filters=300, kernel_size=3, strides=1, padding=\"valid\")`\n", | |
| " app.launch_new_instance()\n" | |
| ], | |
| "name": "stderr" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "OgoK_J2cjtic", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 104 | |
| }, | |
| "outputId": "7c41bb80-bfb6-4548-f280-7254855740a8" | |
| }, | |
| "source": [ | |
| "cmodel1a.fit(X_train3, Y_train, epochs = 2, batch_size=256)" | |
| ], | |
| "execution_count": 146, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Epoch 1/2\n", | |
| "15131/15131 [==============================] - 173s 11ms/step - loss: 0.5879 - acc: 0.6856\n", | |
| "Epoch 2/2\n", | |
| "15131/15131 [==============================] - 168s 11ms/step - loss: 0.4583 - acc: 0.7792\n" | |
| ], | |
| "name": "stdout" | |
| }, | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "<keras.callbacks.History at 0x7f0f1368c940>" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 146 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "SgnkbkcPtJHB", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "d4ae2a11-387e-4178-fe2a-a2df44faa7c5" | |
| }, | |
| "source": [ | |
| "predsc1a = cmodel1a.predict_classes(X_test3, verbose=0)\n", | |
| "np.sum(predsc1a==y_test)/len(y_test)" | |
| ], | |
| "execution_count": 147, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "0.5644729802033173" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 147 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "VcAy5qNE207S", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "from sklearn.metrics import precision_recall_fscore_support as prf" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "mvjiLm4z3CDa", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "2e407171-f2e3-436c-9f5f-c873a290962e" | |
| }, | |
| "source": [ | |
| "prf(y_test, predsc1a, average='micro')" | |
| ], | |
| "execution_count": 163, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "(0.5644729802033173, 0.5644729802033173, 0.5644729802033173, None)" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 163 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "RRe5rJKX4IPQ", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "a4045c61-c53d-41f5-a146-1f37c97a9743" | |
| }, | |
| "source": [ | |
| "prf(y_test, predsc1a, average='macro')" | |
| ], | |
| "execution_count": 164, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "(0.5708475748562786, 0.563563693418115, 0.5666094135651139, None)" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 164 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "KUOgZQBe4NQK", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "e9451770-bfdb-45e0-bf94-8b7973afbf0f" | |
| }, | |
| "source": [ | |
| "prf(y_test, predsc1a, average='weighted')" | |
| ], | |
| "execution_count": 166, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "(0.5663970904202026, 0.5644729802033173, 0.5648397339912556, None)" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 166 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab_type": "code", | |
| "outputId": "b8acaefe-008c-4714-9919-28d2f304d49a", | |
| "id": "anNM5TySjt77", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 72 | |
| } | |
| }, | |
| "source": [ | |
| "cmodel2 = Sequential()\n", | |
| "cmodel2.add(Embedding(max_features, 500))\n", | |
| "cmodel2.add(SpatialDropout1D(0.2))\n", | |
| "# we add a Convolution1D, which will learn nb_filter\n", | |
| "# word group filters of size filter_length:\n", | |
| "cmodel2.add(Convolution1D(nb_filter=nb_filter,\n", | |
| " filter_length=filter_length,\n", | |
| " border_mode='valid',\n", | |
| " activation='tanh',\n", | |
| " subsample_length=1))\n", | |
| "\n", | |
| "cmodel2.add(Lambda(max_1d, output_shape=(nb_filter,)))\n", | |
| "cmodel2.add(Dense(hidden_dims))\n", | |
| "cmodel2.add(Dropout(0.2))\n", | |
| "cmodel2.add(Activation('relu'))\n", | |
| "cmodel2.add(Dense(num_classes))\n", | |
| "cmodel2.add(Activation('sigmoid'))\n", | |
| "adam = optimizers.Adam(lr=0.001, decay=1e-6)\n", | |
| "cmodel2.compile(loss='binary_crossentropy',\n", | |
| " optimizer=adam,\n", | |
| " metrics=['accuracy'])" | |
| ], | |
| "execution_count": 132, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:10: UserWarning: Update your `Conv1D` call to the Keras 2 API: `Conv1D(activation=\"tanh\", filters=300, kernel_size=3, strides=1, padding=\"valid\")`\n", | |
| " # Remove the CWD from sys.path while we load stuff.\n" | |
| ], | |
| "name": "stderr" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "L3s7q3rqj50A", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 69 | |
| }, | |
| "outputId": "0ff1d71c-e282-4cc7-b846-bacfae25dbb1" | |
| }, | |
| "source": [ | |
| "cmodel2.fit(X_train2, Y_train, epochs = 1)" | |
| ], | |
| "execution_count": 133, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Epoch 1/1\n", | |
| "15131/15131 [==============================] - 986s 65ms/step - loss: 0.5525 - acc: 0.7080\n" | |
| ], | |
| "name": "stdout" | |
| }, | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "<keras.callbacks.History at 0x7f0f13d670b8>" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 133 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "oB7-3ZwTkGBE", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "5d6905c3-5f8b-4d1d-e381-632aaee0069e" | |
| }, | |
| "source": [ | |
| "predsc2 = cmodel2.predict_classes(X_test3, verbose=0)\n", | |
| "np.sum(predsc2==y_test)/len(y_test)" | |
| ], | |
| "execution_count": 134, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "0.46441947565543074" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 134 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "ch95QsenkLAY", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 469 | |
| }, | |
| "outputId": "f235abbe-3114-4524-8194-cb9cdd49259b" | |
| }, | |
| "source": [ | |
| "cmodel2.summary()" | |
| ], | |
| "execution_count": 117, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Model: \"sequential_22\"\n", | |
| "_________________________________________________________________\n", | |
| "Layer (type) Output Shape Param # \n", | |
| "=================================================================\n", | |
| "embedding_14 (Embedding) (None, None, 500) 10000000 \n", | |
| "_________________________________________________________________\n", | |
| "spatial_dropout1d_13 (Spatia (None, None, 500) 0 \n", | |
| "_________________________________________________________________\n", | |
| "conv1d_11 (Conv1D) (None, None, 300) 450300 \n", | |
| "_________________________________________________________________\n", | |
| "lambda_11 (Lambda) (None, 300) 0 \n", | |
| "_________________________________________________________________\n", | |
| "dense_50 (Dense) (None, 300) 90300 \n", | |
| "_________________________________________________________________\n", | |
| "dropout_34 (Dropout) (None, 300) 0 \n", | |
| "_________________________________________________________________\n", | |
| "activation_50 (Activation) (None, 300) 0 \n", | |
| "_________________________________________________________________\n", | |
| "dense_51 (Dense) (None, 3) 903 \n", | |
| "_________________________________________________________________\n", | |
| "activation_51 (Activation) (None, 3) 0 \n", | |
| "=================================================================\n", | |
| "Total params: 10,541,503\n", | |
| "Trainable params: 10,541,503\n", | |
| "Non-trainable params: 0\n", | |
| "_________________________________________________________________\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "cItt6ON9TKMF", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "!pip install emoji\n", | |
| "import emoji" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "F8jVacTFkbu2", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "new_train_data = []\n", | |
| "\n", | |
| "for sample in train_text.split('\\n\\n'):\n", | |
| " \n", | |
| " lines_sample = sample.split('\\n')\n", | |
| " try:\n", | |
| " tmp = (lines_sample[0].split()[2])\n", | |
| " tmp = (lines_sample[0].split()[1])\n", | |
| " except IndexError:\n", | |
| " continue\n", | |
| " temp = []\n", | |
| " \n", | |
| " for line in lines_sample[1:]:\n", | |
| " t = line.split('\\t')\n", | |
| "\n", | |
| " if t[1] != 'O':\n", | |
| " t[0]=re.sub('[\\W_]+', '', t[0])\n", | |
| " new = ''\n", | |
| " for ch in t[0]:\n", | |
| " if ch in emojis:\n", | |
| " new += ' ' + emoji.demojize(ch) + ' '\n", | |
| " if t[1] == 'Eng' and t[0] in stopwords_en and t[0] not in exclude:\n", | |
| " continue\n", | |
| " if 'http' in t[0]:\n", | |
| " continue\n", | |
| " temp.append(t[0])\n", | |
| " if temp == []:\n", | |
| " continue\n", | |
| " new_train_data.append(temp)\n" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "fZOiuRo1k2M_", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "new_test_data = []\n", | |
| "# emoji_test = []\n", | |
| "for sample in test_text.split('\\n\\n'):\n", | |
| " \n", | |
| " lines_sample = sample.split('\\n')\n", | |
| " try:\n", | |
| " tmp = (lines_sample[0].split()[2])\n", | |
| " tmp = (lines_sample[0].split()[1])\n", | |
| " except IndexError:\n", | |
| " continue\n", | |
| " temp = []\n", | |
| " \n", | |
| " for line in lines_sample[1:]:\n", | |
| " t = line.split('\\t')\n", | |
| "\n", | |
| " if t[1] != 'O':\n", | |
| " t[0]=re.sub('[\\W_]+', '', t[0])\n", | |
| " new = ''\n", | |
| " for ch in t[0]:\n", | |
| " if ch in emojis:\n", | |
| " new += ' ' + emoji.demojize(ch) + ' '\n", | |
| " if t[1] == 'Eng' and t[0] in stopwords_en and t[0] not in exclude:\n", | |
| " continue\n", | |
| " if 'http' in t[0]:\n", | |
| " continue\n", | |
| " temp.append(t[0])\n", | |
| " if temp == []:\n", | |
| " continue\n", | |
| " new_test_data.append(temp)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "gAVZFFL5kSyE", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "new_train_tweets = [' '.join(i) for i in new_train_data]\n", | |
| "new_test_tweets = [' '.join(i) for i in new_test_data]" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "siNMAS_0mUfV", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "max_features = 20000\n", | |
| "tokenizer2 = Tokenizer(num_words=max_features)\n", | |
| "tokenizer2.fit_on_texts(new_train_tweets)" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "m5U5FfAKkTh_", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 52 | |
| }, | |
| "outputId": "e4529ffe-5fe1-4879-ea03-5896cb3ed70c" | |
| }, | |
| "source": [ | |
| "max_len = 250\n", | |
| "num_classes = 3\n", | |
| "\n", | |
| "new_sequences_train = tokenizer2.texts_to_sequences(new_train_tweets)\n", | |
| "new_sequences_test = tokenizer2.texts_to_sequences(new_test_tweets)\n", | |
| "\n", | |
| "X_train4 = sequence.pad_sequences(new_sequences_train, maxlen=max_len)\n", | |
| "X_test4 = sequence.pad_sequences(new_sequences_test, maxlen=max_len)\n", | |
| "\n", | |
| "# Y_train = np_utils.to_categorical(y_train, num_classes)\n", | |
| "# Y_test = np_utils.to_categorical(y_test, num_classes)\n", | |
| "\n", | |
| "print('X_train4 shape:', X_train4.shape)\n", | |
| "print('X_test4 shape:', X_test4.shape)" | |
| ], | |
| "execution_count": 126, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "X_train4 shape: (15131, 250)\n", | |
| "X_test4 shape: (1869, 250)\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "fNAfKJRYmyaR", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab_type": "code", | |
| "outputId": "2d4f0ab4-1e04-431e-e3c2-20e0f26ca7cc", | |
| "id": "mZODp4rXm0G3", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 72 | |
| } | |
| }, | |
| "source": [ | |
| "cmodel3 = Sequential()\n", | |
| "cmodel3.add(Embedding(max_features, 300))\n", | |
| "cmodel3.add(SpatialDropout1D(0.2))\n", | |
| "# we add a Convolution1D, which will learn nb_filter\n", | |
| "# word group filters of size filter_length:\n", | |
| "cmodel3.add(Convolution1D(nb_filter=nb_filter,\n", | |
| " filter_length=filter_length,\n", | |
| " border_mode='valid',\n", | |
| " activation='tanh',\n", | |
| " subsample_length=1))\n", | |
| "\n", | |
| "cmodel3.add(Lambda(max_1d, output_shape=(nb_filter,)))\n", | |
| "cmodel3.add(Dense(hidden_dims))\n", | |
| "cmodel3.add(Dropout(0.2))\n", | |
| "cmodel3.add(Activation('relu'))\n", | |
| "cmodel3.add(Dense(num_classes))\n", | |
| "cmodel3.add(Activation('sigmoid'))\n", | |
| "adam = optimizers.Adam(lr=0.001, decay=1e-6)\n", | |
| "cmodel3.compile(loss='binary_crossentropy',\n", | |
| " optimizer=adam,\n", | |
| " metrics=['accuracy'])" | |
| ], | |
| "execution_count": 127, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:10: UserWarning: Update your `Conv1D` call to the Keras 2 API: `Conv1D(activation=\"tanh\", filters=300, kernel_size=3, strides=1, padding=\"valid\")`\n", | |
| " # Remove the CWD from sys.path while we load stuff.\n" | |
| ], | |
| "name": "stderr" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "69J1sfpOm9ug", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 69 | |
| }, | |
| "outputId": "ad67b870-dbc5-4ad4-d162-509a99a116f5" | |
| }, | |
| "source": [ | |
| "cmodel3.fit(X_train4, Y_train, epochs = 1)" | |
| ], | |
| "execution_count": 129, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Epoch 1/1\n", | |
| "15131/15131 [==============================] - 216s 14ms/step - loss: 0.5303 - acc: 0.7264\n" | |
| ], | |
| "name": "stdout" | |
| }, | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "<keras.callbacks.History at 0x7f0f143d9898>" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 129 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "HWM5IozSnrQb", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "5c630355-d5e8-42b8-826a-01d5621d4cbb" | |
| }, | |
| "source": [ | |
| "predsc3 = cmodel3.predict_classes(X_test4, verbose=0)\n", | |
| "np.sum(predsc3==y_test)/len(y_test)" | |
| ], | |
| "execution_count": 130, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "0.5521669341894061" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 130 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "OSVeBC5uoD_d", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 469 | |
| }, | |
| "outputId": "ae0b085c-9980-42ba-f598-8307bd3210a3" | |
| }, | |
| "source": [ | |
| "# Print the layer-by-layer output shapes and parameter counts of cmodel3.\n", | |
| "cmodel3.summary()" | |
| ], | |
| "execution_count": 131, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Model: \"sequential_23\"\n", | |
| "_________________________________________________________________\n", | |
| "Layer (type) Output Shape Param # \n", | |
| "=================================================================\n", | |
| "embedding_15 (Embedding) (None, None, 300) 6000000 \n", | |
| "_________________________________________________________________\n", | |
| "spatial_dropout1d_14 (Spatia (None, None, 300) 0 \n", | |
| "_________________________________________________________________\n", | |
| "conv1d_12 (Conv1D) (None, None, 300) 270300 \n", | |
| "_________________________________________________________________\n", | |
| "lambda_12 (Lambda) (None, 300) 0 \n", | |
| "_________________________________________________________________\n", | |
| "dense_52 (Dense) (None, 300) 90300 \n", | |
| "_________________________________________________________________\n", | |
| "dropout_35 (Dropout) (None, 300) 0 \n", | |
| "_________________________________________________________________\n", | |
| "activation_52 (Activation) (None, 300) 0 \n", | |
| "_________________________________________________________________\n", | |
| "dense_53 (Dense) (None, 3) 903 \n", | |
| "_________________________________________________________________\n", | |
| "activation_53 (Activation) (None, 3) 0 \n", | |
| "=================================================================\n", | |
| "Total params: 6,361,503\n", | |
| "Trainable params: 6,361,503\n", | |
| "Non-trainable params: 0\n", | |
| "_________________________________________________________________\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "5-B6qEc6BJJJ", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 191 | |
| }, | |
| "outputId": "6e4a8b2b-0fbc-429c-a907-2caa8eac5d8e" | |
| }, | |
| "source": [ | |
| "# Inspection only: show the first 100 characters of train_text to check the\n", | |
| "# raw format (a 'meta' header line followed by tab-separated token/tag rows).\n", | |
| "print(train_text[:100])" | |
| ], | |
| "execution_count": 126, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "meta\t3\tnegative\n", | |
| "@\tO\n", | |
| "AdilNisarButt\tHin\n", | |
| "pakistan\tHin\n", | |
| "ka\tHin\n", | |
| "ghra\tHin\n", | |
| "tauq\tHin\n", | |
| "he\tEng\n", | |
| "Pakistan\tEng\n", | |
| "Isra\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "XHN9MSnd5wcC", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 121 | |
| }, | |
| "outputId": "282da6de-3570-453a-9c8b-31166287f94e" | |
| }, | |
| "source": [ | |
| "a = np.array([[1,1,1,1], [2,2,2,2]])\n", | |
| "b = np.array([[3,3],[4,4]])\n", | |
| "print(a)\n", | |
| "print(b)\n", | |
| "print(np.hstack((a,b)))" | |
| ], | |
| "execution_count": 57, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "[[1 1 1 1]\n", | |
| " [2 2 2 2]]\n", | |
| "[[3 3]\n", | |
| " [4 4]]\n", | |
| "[[1 1 1 1 3 3]\n", | |
| " [2 2 2 2 4 4]]\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment