jgc128/keras_sequence_labeling.ipynb

## keras_sequence_labeling.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using Theano backend.\n",
      "/data1/aromanov/virt_env2/keras_latest/lib/python3.4/site-packages/theano/tensor/signal/downsample.py:5: UserWarning: downsample module has been moved to the pool module.\n",
      "  warnings.warn(\"downsample module has been moved to the pool module.\")\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import pickle\n",
    "from itertools import chain\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from sklearn.cross_validation import train_test_split\n",
    "from sklearn.metrics import f1_score, accuracy_score\n",
    "\n",
    "from keras.utils.np_utils import to_categorical\n",
    "from keras.models import Sequential\n",
    "from keras.layers.core import Dense, RepeatVector, TimeDistributedDense, Masking, Activation, RepeatVector\n",
    "from keras.layers.embeddings import Embedding\n",
    "from keras.layers.recurrent import LSTM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from nltk.corpus import brown"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create training data from the brown corpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "nb_brown_samples = 5000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "brown_corpus = brown.tagged_sents(tagset='universal')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "brown_corpus_selected = list(brown_corpus[0:nb_brown_samples])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sentences: 5000\n"
     ]
    }
   ],
   "source": [
    "print('Sentences:', len(brown_corpus_selected))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('The', 'DET'),\n",
       " ('Fulton', 'NOUN'),\n",
       " ('County', 'NOUN'),\n",
       " ('Grand', 'ADJ'),\n",
       " ('Jury', 'NOUN'),\n",
       " ('said', 'VERB'),\n",
       " ('Friday', 'NOUN'),\n",
       " ('an', 'DET'),\n",
       " ('investigation', 'NOUN'),\n",
       " ('of', 'ADP'),\n",
       " (\"Atlanta's\", 'NOUN'),\n",
       " ('recent', 'ADJ'),\n",
       " ('primary', 'NOUN'),\n",
       " ('election', 'NOUN'),\n",
       " ('produced', 'VERB'),\n",
       " ('``', '.'),\n",
       " ('no', 'DET'),\n",
       " ('evidence', 'NOUN'),\n",
       " (\"''\", '.'),\n",
       " ('that', 'ADP'),\n",
       " ('any', 'DET'),\n",
       " ('irregularities', 'NOUN'),\n",
       " ('took', 'VERB'),\n",
       " ('place', 'NOUN'),\n",
       " ('.', '.')]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "brown_corpus_selected[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('His', 'DET'),\n",
       " ('petition', 'NOUN'),\n",
       " ('charged', 'VERB'),\n",
       " ('mental', 'ADJ'),\n",
       " ('cruelty', 'NOUN'),\n",
       " ('.', '.')]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "brown_corpus_selected[31]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Fix the seed so the train/test split is identical on every run.\n",
    "brown_train, brown_test = train_test_split(brown_corpus_selected, test_size=0.25, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train: 3750\n",
      "Test: 1250\n"
     ]
    }
   ],
   "source": [
    "print('Train:', len(brown_train))\n",
    "print('Test:', len(brown_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Construct *tok2id* and *label2id*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Sort before numbering: iterating a set of strings follows hash order, which can\n",
    "# differ between interpreter runs, so unsorted ids would not be reproducible.\n",
    "# Token ids start at 1 because 0 is reserved for the masking (padding) value.\n",
    "vocabulary = sorted(set(pair[0].lower() for seq in brown_corpus_selected for pair in seq))\n",
    "tok2id = {token: idx for idx, token in enumerate(vocabulary, start=1)}\n",
    "\n",
    "# Label ids start at 0: they index the columns of the one-hot targets.\n",
    "tagset = sorted(set(pair[1] for seq in brown_corpus_selected for pair in seq))\n",
    "label2id = {tag: idx for idx, tag in enumerate(tagset)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tokens: 13735\n",
      "Labels: 12\n"
     ]
    }
   ],
   "source": [
    "print('Tokens:', len(tok2id))\n",
    "print('Labels:', len(label2id))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Convert brown corpus to input matrices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def create_XY_seq(data, tok2id, label2id):\n",
    "    \"\"\"Encode tagged sentences as token-id lists and one-hot label matrices.\n",
    "\n",
    "    data     -- list of sentences, each a list of (token, tag) pairs\n",
    "    tok2id   -- mapping from lower-cased token to integer id\n",
    "    label2id -- mapping from tag to integer class id\n",
    "\n",
    "    Returns (token_ids, onehot): per sentence, the list of token ids and the\n",
    "    to_categorical() encoding of its label ids (len(label2id) classes).\n",
    "    \"\"\"\n",
    "    def _lookup(table, sequences):\n",
    "        # swap every item of every sequence for its id in `table`\n",
    "        return [[table[item] for item in sequence] for sequence in sequences]\n",
    "\n",
    "    n_labels = len(label2id)\n",
    "\n",
    "    # inputs: lower-cased tokens -> token ids\n",
    "    tokens = [[pair[0].lower() for pair in sentence] for sentence in data]\n",
    "    token_ids = _lookup(tok2id, tokens)\n",
    "\n",
    "    # targets: tags -> label ids -> one one-hot matrix per sentence\n",
    "    tags = [[pair[1] for pair in sentence] for sentence in data]\n",
    "    onehot = []\n",
    "    for ids in _lookup(label2id, tags):\n",
    "        onehot.append(to_categorical(ids, nb_classes=n_labels))\n",
    "\n",
    "    return token_ids, onehot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_seq_train, Y_seq_train = create_XY_seq(brown_train, tok2id, label2id)\n",
    "X_seq_test, Y_seq_test = create_XY_seq(brown_test, tok2id, label2id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train: 3750 3750\n",
      "Test: 1250 1250\n"
     ]
    }
   ],
   "source": [
    "print('Train:', len(X_seq_train), len(Y_seq_train))\n",
    "print('Test:', len(X_seq_test), len(Y_seq_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[253,\n",
       " 11462,\n",
       " 7707,\n",
       " 4952,\n",
       " 11759,\n",
       " 3983,\n",
       " 12227,\n",
       " 2286,\n",
       " 3993,\n",
       " 8386,\n",
       " 12227,\n",
       " 9631,\n",
       " 1959,\n",
       " 2286,\n",
       " 6550,\n",
       " 12230,\n",
       " 13251,\n",
       " 730,\n",
       " 10213,\n",
       " 10598,\n",
       " 3185]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_seq_train[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],\n",
       "       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],\n",
       "       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],\n",
       "       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.]])"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Y_seq_train[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create data matrix"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Input to the network - 2d tensor with shape `(nb_samples, sequence_length)`\n",
    "\n",
    "Output of the network - 3d tensor with shape `(nb_samples, timesteps, nb_classes)`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# dataset sizes\n",
    "nb_samples_train, nb_samples_test = len(X_seq_train), len(X_seq_test)\n",
    "\n",
    "# one output class per tag\n",
    "nb_classes = len(label2id)\n",
    "\n",
    "# length of the longest sentence across train and test\n",
    "maxlen = max(map(len, chain(X_seq_train, X_seq_test)))\n",
    "\n",
    "# highest token id plus one\n",
    "input_dim = 1 + max(tok2id.values())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nb_samples_train: 3750\n",
      "nb_samples_test: 1250\n",
      "nb_classes: 12\n",
      "maxlen: 102\n"
     ]
    }
   ],
   "source": [
    "print('nb_samples_train:', nb_samples_train)\n",
    "print('nb_samples_test:', nb_samples_test)\n",
    "print('nb_classes:', nb_classes)\n",
    "print('maxlen:', maxlen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def create_data_matrix(X_seq, Y_seq, nb_samples, maxlen, nb_classes):\n",
    "    \"\"\"Pack variable-length sequences into fixed-size, left-padded arrays.\n",
    "\n",
    "    X_seq      -- list of token-id sequences\n",
    "    Y_seq      -- list of per-token one-hot label matrices, aligned with X_seq\n",
    "    nb_samples -- number of sequences to copy (the first nb_samples entries)\n",
    "    maxlen     -- number of timesteps in the output arrays\n",
    "    nb_classes -- width of each one-hot label row\n",
    "\n",
    "    Returns (X, Y) with shapes (nb_samples, maxlen) and\n",
    "    (nb_samples, maxlen, nb_classes). Short sequences are left-padded with\n",
    "    zeros, sequences longer than maxlen keep only their last maxlen steps,\n",
    "    and empty sequences leave their row all zeros.\n",
    "    \"\"\"\n",
    "    X = np.zeros((nb_samples, maxlen))\n",
    "    Y = np.zeros((nb_samples, maxlen, nb_classes))\n",
    "\n",
    "    for i in range(nb_samples):\n",
    "        # Drop the leading steps of an over-long sequence so the slice assignment below always fits.\n",
    "        start = max(0, len(X_seq[i]) - maxlen)\n",
    "        tokens = X_seq[i][start:]\n",
    "        labels = Y_seq[i][start:]\n",
    "\n",
    "        cur_len = len(tokens)\n",
    "        if cur_len == 0:\n",
    "            # Nothing to copy: the row stays all padding.\n",
    "            continue\n",
    "\n",
    "        # We pad on the left with zeros, so for short sentences the first elements in the matrix are zeros\n",
    "        X[i, maxlen - cur_len:] = tokens\n",
    "        Y[i, maxlen - cur_len:, :] = labels\n",
    "\n",
    "    return X, Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X_train, Y_train = create_data_matrix(X_seq_train, Y_seq_train, nb_samples_train, maxlen, nb_classes)\n",
    "X_test, Y_test = create_data_matrix(X_seq_test, Y_seq_test, nb_samples_test, maxlen, nb_classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train: (3750, 102) (3750, 102, 12)\n",
      "Test: (1250, 102) (1250, 102, 12)\n"
     ]
    }
   ],
   "source": [
    "print('Train:', X_train.shape, Y_train.shape)\n",
    "print('Test:', X_test.shape, Y_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[     0.,      0.,      0., ...,  10213.,  10598.,   3185.],\n",
       "       [     0.,      0.,      0., ...,  11759.,  12732.,   3185.],\n",
       "       [     0.,      0.,      0., ...,   5503.,   8139.,   3185.],\n",
       "       ..., \n",
       "       [     0.,      0.,      0., ...,  11759.,   5088.,   3185.],\n",
       "       [     0.,      0.,      0., ...,   9025.,   7550.,   3185.],\n",
       "       [     0.,      0.,      0., ...,   6795.,   2419.,   3185.]])"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[[ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        ..., \n",
       "        [ 0.,  0.,  1., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.]],\n",
       "\n",
       "       [[ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        ..., \n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.]],\n",
       "\n",
       "       [[ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        ..., \n",
       "        [ 0.,  0.,  0., ...,  1.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.]],\n",
       "\n",
       "       ..., \n",
       "       [[ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        ..., \n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.]],\n",
       "\n",
       "       [[ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        ..., \n",
       "        [ 0.,  0.,  1., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.]],\n",
       "\n",
       "       [[ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        ..., \n",
       "        [ 0.,  0.,  0., ...,  1.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]])"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Y_train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build a Keras model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "batch_size = 128\n",
    "nb_epoch = 15"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model = Sequential()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Embedding layer converts the input sequences into a sequence of vectors with dim 200\n",
    "model.add(Embedding(output_dim=200, input_dim=input_dim, input_length=maxlen, mask_zero=True))\n",
    "\n",
    "# LSTM input: (nb_samples, timesteps, input_dim)\n",
    "model.add(LSTM(output_dim=128, return_sequences=True))\n",
    "\n",
    "# TimeDistributedDense is a fully-connected layer that applies the same weight matrix at each timestep\n",
    "model.add(TimeDistributedDense(output_dim=50, activation='sigmoid'))\n",
    "\n",
    "model.add(TimeDistributedDense(output_dim=nb_classes, activation='softmax'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model.compile(loss='categorical_crossentropy', optimizer='adam')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/15\n",
      "3750/3750 [==============================] - 20s - loss: 2.2656    \n",
      "Epoch 2/15\n",
      "3750/3750 [==============================] - 20s - loss: 2.0316    \n",
      "Epoch 3/15\n",
      "3750/3750 [==============================] - 20s - loss: 1.7507    \n",
      "Epoch 4/15\n",
      "3750/3750 [==============================] - 20s - loss: 1.1898    \n",
      "Epoch 5/15\n",
      "3750/3750 [==============================] - 21s - loss: 0.7586    \n",
      "Epoch 6/15\n",
      "3750/3750 [==============================] - 23s - loss: 0.5090    \n",
      "Epoch 7/15\n",
      "3750/3750 [==============================] - 22s - loss: 0.3740    \n",
      "Epoch 8/15\n",
      "3750/3750 [==============================] - 23s - loss: 0.2862    \n",
      "Epoch 9/15\n",
      "3750/3750 [==============================] - 23s - loss: 0.2262    \n",
      "Epoch 10/15\n",
      "3750/3750 [==============================] - 20s - loss: 0.1852    \n",
      "Epoch 11/15\n",
      "3750/3750 [==============================] - 20s - loss: 0.1559    \n",
      "Epoch 12/15\n",
      "3750/3750 [==============================] - 20s - loss: 0.1339    \n",
      "Epoch 13/15\n",
      "3750/3750 [==============================] - 20s - loss: 0.1168    \n",
      "Epoch 14/15\n",
      "3750/3750 [==============================] - 20s - loss: 0.1029    \n",
      "Epoch 15/15\n",
      "3750/3750 [==============================] - 20s - loss: 0.0921    \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7f7a336b1278>"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1250/1250 [==============================] - 1s     \n"
     ]
    }
   ],
   "source": [
    "pred_classes = model.predict_classes(X_test, batch_size=batch_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pred_classes: (1250, 102)\n"
     ]
    }
   ],
   "source": [
    "print('pred_classes:', pred_classes.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[4, 4, 4, ..., 9, 4, 7],\n",
       "       [4, 4, 4, ..., 3, 5, 7],\n",
       "       [4, 4, 4, ..., 3, 4, 7],\n",
       "       ..., \n",
       "       [4, 4, 4, ..., 4, 4, 7],\n",
       "       [4, 4, 4, ..., 2, 4, 7],\n",
       "       [4, 4, 4, ..., 4, 4, 7]])"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pred_classes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calc accuracy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_seq_test_len = [len(seq) for seq in X_seq_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Flatten gold and predicted labels into two parallel lists, one entry per real token.\n",
    "# Sentences are left-padded, so the last `length` columns of a row hold the actual predictions.\n",
    "y_true = []\n",
    "y_pred = []\n",
    "for i in range(nb_samples_test):\n",
    "    length = X_seq_test_len[i]\n",
    "    offset = maxlen - length\n",
    "    for j in range(length):\n",
    "        y_true.append(Y_seq_test[i][j].argmax())\n",
    "        y_pred.append(pred_classes[i, offset + j])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "y true: 26614\n",
      "y pred: 26614\n"
     ]
    }
   ],
   "source": [
    "print('y true:', len(y_true))\n",
    "print('y pred:', len(y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "acc = accuracy_score(y_true, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.941121214398\n"
     ]
    }
   ],
   "source": [
    "print('Accuracy:', acc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"Using Theano backend.\n",
	"/data1/aromanov/virt_env2/keras_latest/lib/python3.4/site-packages/theano/tensor/signal/downsample.py:5: UserWarning: downsample module has been moved to the pool module.\n",
	" warnings.warn(\"downsample module has been moved to the pool module.\")\n"
	]
	}
	],
	"source": [
	"import os\n",
	"import pickle\n",
	"from itertools import chain\n",
	"\n",
	"import numpy as np\n",
	"import pandas as pd\n",
	"\n",
	"from sklearn.cross_validation import train_test_split\n",
	"from sklearn.metrics import f1_score, accuracy_score\n",
	"\n",
	"from keras.utils.np_utils import to_categorical\n",
	"from keras.models import Sequential\n",
	"from keras.layers.core import Dense, RepeatVector, TimeDistributedDense, Masking, Activation, RepeatVector\n",
	"from keras.layers.embeddings import Embedding\n",
	"from keras.layers.recurrent import LSTM"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from nltk.corpus import brown"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Create training data from the brown corpus"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"nb_brown_samples = 5000"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"brown_corpus = brown.tagged_sents(tagset='universal')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"brown_corpus_selected = list(brown_corpus[0:nb_brown_samples])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Sentences: 5000\n"
	]
	}
	],
	"source": [
	"print('Sentences:', len(brown_corpus_selected))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[('The', 'DET'),\n",
	" ('Fulton', 'NOUN'),\n",
	" ('County', 'NOUN'),\n",
	" ('Grand', 'ADJ'),\n",
	" ('Jury', 'NOUN'),\n",
	" ('said', 'VERB'),\n",
	" ('Friday', 'NOUN'),\n",
	" ('an', 'DET'),\n",
	" ('investigation', 'NOUN'),\n",
	" ('of', 'ADP'),\n",
	" (\"Atlanta's\", 'NOUN'),\n",
	" ('recent', 'ADJ'),\n",
	" ('primary', 'NOUN'),\n",
	" ('election', 'NOUN'),\n",
	" ('produced', 'VERB'),\n",
	" ('``', '.'),\n",
	" ('no', 'DET'),\n",
	" ('evidence', 'NOUN'),\n",
	" (\"''\", '.'),\n",
	" ('that', 'ADP'),\n",
	" ('any', 'DET'),\n",
	" ('irregularities', 'NOUN'),\n",
	" ('took', 'VERB'),\n",
	" ('place', 'NOUN'),\n",
	" ('.', '.')]"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"brown_corpus_selected[0]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[('His', 'DET'),\n",
	" ('petition', 'NOUN'),\n",
	" ('charged', 'VERB'),\n",
	" ('mental', 'ADJ'),\n",
	" ('cruelty', 'NOUN'),\n",
	" ('.', '.')]"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"brown_corpus_selected[31]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Fix the seed so the train/test split is identical on every run.\n",
	"brown_train, brown_test = train_test_split(brown_corpus_selected, test_size=0.25, random_state=42)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Train: 3750\n",
	"Test: 1250\n"
	]
	}
	],
	"source": [
	"print('Train:', len(brown_train))\n",
	"print('Test:', len(brown_test))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Construct tok2id and label2id"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Sort before numbering: iterating a set of strings follows hash order, which can\n",
	"# differ between interpreter runs, so unsorted ids would not be reproducible.\n",
	"# Token ids start at 1 because 0 is reserved for the masking (padding) value.\n",
	"vocabulary = sorted(set(pair[0].lower() for seq in brown_corpus_selected for pair in seq))\n",
	"tok2id = {token: idx for idx, token in enumerate(vocabulary, start=1)}\n",
	"\n",
	"# Label ids start at 0: they index the columns of the one-hot targets.\n",
	"tagset = sorted(set(pair[1] for seq in brown_corpus_selected for pair in seq))\n",
	"label2id = {tag: idx for idx, tag in enumerate(tagset)}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Tokens: 13735\n",
	"Labels: 12\n"
	]
	}
	],
	"source": [
	"print('Tokens:', len(tok2id))\n",
	"print('Labels:', len(label2id))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Convert brown corpus to input matrices"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def create_XY_seq(data, tok2id, label2id):\n",
	"    \"\"\"Encode tagged sentences as token-id lists and one-hot label matrices.\n",
	"\n",
	"    data     -- list of sentences, each a list of (token, tag) pairs\n",
	"    tok2id   -- mapping from lower-cased token to integer id\n",
	"    label2id -- mapping from tag to integer class id\n",
	"\n",
	"    Returns (token_ids, onehot): per sentence, the list of token ids and the\n",
	"    to_categorical() encoding of its label ids (len(label2id) classes).\n",
	"    \"\"\"\n",
	"    def _lookup(table, sequences):\n",
	"        # swap every item of every sequence for its id in `table`\n",
	"        return [[table[item] for item in sequence] for sequence in sequences]\n",
	"\n",
	"    n_labels = len(label2id)\n",
	"\n",
	"    # inputs: lower-cased tokens -> token ids\n",
	"    tokens = [[pair[0].lower() for pair in sentence] for sentence in data]\n",
	"    token_ids = _lookup(tok2id, tokens)\n",
	"\n",
	"    # targets: tags -> label ids -> one one-hot matrix per sentence\n",
	"    tags = [[pair[1] for pair in sentence] for sentence in data]\n",
	"    onehot = []\n",
	"    for ids in _lookup(label2id, tags):\n",
	"        onehot.append(to_categorical(ids, nb_classes=n_labels))\n",
	"\n",
	"    return token_ids, onehot"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"X_seq_train, Y_seq_train = create_XY_seq(brown_train, tok2id, label2id)\n",
	"X_seq_test, Y_seq_test = create_XY_seq(brown_test, tok2id, label2id)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Train: 3750 3750\n",
	"Test: 1250 1250\n"
	]
	}
	],
	"source": [
	"print('Train:', len(X_seq_train), len(Y_seq_train))\n",
	"print('Test:', len(X_seq_test), len(Y_seq_test))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[253,\n",
	" 11462,\n",
	" 7707,\n",
	" 4952,\n",
	" 11759,\n",
	" 3983,\n",
	" 12227,\n",
	" 2286,\n",
	" 3993,\n",
	" 8386,\n",
	" 12227,\n",
	" 9631,\n",
	" 1959,\n",
	" 2286,\n",
	" 6550,\n",
	" 12230,\n",
	" 13251,\n",
	" 730,\n",
	" 10213,\n",
	" 10598,\n",
	" 3185]"
	]
	},
	"execution_count": 16,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"X_seq_train[0]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([[ 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],\n",
	" [ 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],\n",
	" [ 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],\n",
	" [ 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],\n",
	" [ 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],\n",
	" [ 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],\n",
	" [ 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n",
	" [ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],\n",
	" [ 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],\n",
	" [ 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],\n",
	" [ 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n",
	" [ 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],\n",
	" [ 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],\n",
	" [ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],\n",
	" [ 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],\n",
	" [ 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],\n",
	" [ 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],\n",
	" [ 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],\n",
	" [ 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n",
	" [ 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],\n",
	" [ 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]])"
	]
	},
	"execution_count": 17,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"Y_seq_train[0]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Create data matrix"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Input to the network - 2d tensor with shape `(nb_samples, sequence_length)`\n",
	"\n",
	"Output of the network - 3d tensor with shape `(nb_samples, timesteps, nb_classes)`"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"nb_samples_train = len(X_seq_train)\n",
	"nb_samples_test = len(X_seq_test)\n",
	"\n",
	"nb_classes = len(label2id)\n",
	"maxlen = max(chain((len(seq) for seq in X_seq_train), (len(seq) for seq in X_seq_test)))\n",
	"input_dim = max(tok2id.values()) + 1"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"nb_samples_train: 3750\n",
	"nb_samples_test: 1250\n",
	"nb_classes: 12\n",
	"maxlen: 102\n"
	]
	}
	],
	"source": [
	"print('nb_samples_train:', nb_samples_train)\n",
	"print('nb_samples_test:', nb_samples_test)\n",
	"print('nb_classes:', nb_classes)\n",
	"print('maxlen:', maxlen)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def create_data_matrix(X_seq, Y_seq, nb_samples, maxlen, nb_classes):\n",
	" X = np.zeros((nb_samples, maxlen))\n",
	" Y = np.zeros((nb_samples, maxlen, nb_classes))\n",
	" \n",
	" for i in range(nb_samples):\n",
	" cur_len = len(X_seq[i])\n",
	" \n",
	" # We pad on the left with zeros, so for short sentences the first elemnts in the matrix are zeros \n",
	" X[i, maxlen - cur_len:] = X_seq[i]\n",
	" Y[i, maxlen - cur_len:, :] = Y_seq[i]\n",
	" \n",
	" return X, Y"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"X_train, Y_train = create_data_matrix(X_seq_train, Y_seq_train, nb_samples_train, maxlen, nb_classes)\n",
	"X_test, Y_test = create_data_matrix(X_seq_test, Y_seq_test, nb_samples_test, maxlen, nb_classes)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Train: (3750, 102) (3750, 102, 12)\n",
	"Test: (1250, 102) (1250, 102, 12)\n"
	]
	}
	],
	"source": [
	"print('Train:', X_train.shape, Y_train.shape)\n",
	"print('Test:', X_test.shape, Y_test.shape)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([[ 0., 0., 0., ..., 10213., 10598., 3185.],\n",
	" [ 0., 0., 0., ..., 11759., 12732., 3185.],\n",
	" [ 0., 0., 0., ..., 5503., 8139., 3185.],\n",
	" ..., \n",
	" [ 0., 0., 0., ..., 11759., 5088., 3185.],\n",
	" [ 0., 0., 0., ..., 9025., 7550., 3185.],\n",
	" [ 0., 0., 0., ..., 6795., 2419., 3185.]])"
	]
	},
	"execution_count": 23,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"X_train"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([[[ 0., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" ..., \n",
	" [ 0., 0., 1., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.]],\n",
	"\n",
	" [[ 0., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" ..., \n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.]],\n",
	"\n",
	" [[ 0., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" ..., \n",
	" [ 0., 0., 0., ..., 1., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.]],\n",
	"\n",
	" ..., \n",
	" [[ 0., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" ..., \n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.]],\n",
	"\n",
	" [[ 0., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" ..., \n",
	" [ 0., 0., 1., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.]],\n",
	"\n",
	" [[ 0., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" ..., \n",
	" [ 0., 0., 0., ..., 1., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.]]])"
	]
	},
	"execution_count": 24,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"Y_train"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Build a Keras model"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"batch_size = 128\n",
	"nb_epoch = 15"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"model = Sequential()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Embedding layer converts the input sequences into a sequence of vectors with dim 300\n",
	"model.add(Embedding(output_dim=200, input_dim=input_dim, input_length=maxlen, mask_zero=True))\n",
	"\n",
	"# LSTM input: (nb_samples, timesteps, input_dim)\n",
	"model.add(LSTM(output_dim=128, return_sequences=True))\n",
	"\n",
	"# TimeDistributedDense is a fully-connected layer that apply the same weight matrix at each timestep\n",
	"model.add(TimeDistributedDense(output_dim=50, activation='sigmoid'))\n",
	"\n",
	"model.add(TimeDistributedDense(output_dim=nb_classes, activation='softmax'))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 28,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"model.compile(loss='categorical_crossentropy', optimizer='adam')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Epoch 1/15\n",
	"3750/3750 [==============================] - 20s - loss: 2.2656 \n",
	"Epoch 2/15\n",
	"3750/3750 [==============================] - 20s - loss: 2.0316 \n",
	"Epoch 3/15\n",
	"3750/3750 [==============================] - 20s - loss: 1.7507 \n",
	"Epoch 4/15\n",
	"3750/3750 [==============================] - 20s - loss: 1.1898 \n",
	"Epoch 5/15\n",
	"3750/3750 [==============================] - 21s - loss: 0.7586 \n",
	"Epoch 6/15\n",
	"3750/3750 [==============================] - 23s - loss: 0.5090 \n",
	"Epoch 7/15\n",
	"3750/3750 [==============================] - 22s - loss: 0.3740 \n",
	"Epoch 8/15\n",
	"3750/3750 [==============================] - 23s - loss: 0.2862 \n",
	"Epoch 9/15\n",
	"3750/3750 [==============================] - 23s - loss: 0.2262 \n",
	"Epoch 10/15\n",
	"3750/3750 [==============================] - 20s - loss: 0.1852 \n",
	"Epoch 11/15\n",
	"3750/3750 [==============================] - 20s - loss: 0.1559 \n",
	"Epoch 12/15\n",
	"3750/3750 [==============================] - 20s - loss: 0.1339 \n",
	"Epoch 13/15\n",
	"3750/3750 [==============================] - 20s - loss: 0.1168 \n",
	"Epoch 14/15\n",
	"3750/3750 [==============================] - 20s - loss: 0.1029 \n",
	"Epoch 15/15\n",
	"3750/3750 [==============================] - 20s - loss: 0.0921 \n"
	]
	},
	{
	"data": {
	"text/plain": [
	"<keras.callbacks.History at 0x7f7a336b1278>"
	]
	},
	"execution_count": 29,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 30,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1250/1250 [==============================] - 1s \n"
	]
	}
	],
	"source": [
	"pred_classes = model.predict_classes(X_test, batch_size=batch_size)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 31,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"pred_classes: (1250, 102)\n"
	]
	}
	],
	"source": [
	"print('pred_classes:', pred_classes.shape)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 32,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([[4, 4, 4, ..., 9, 4, 7],\n",
	" [4, 4, 4, ..., 3, 5, 7],\n",
	" [4, 4, 4, ..., 3, 4, 7],\n",
	" ..., \n",
	" [4, 4, 4, ..., 4, 4, 7],\n",
	" [4, 4, 4, ..., 2, 4, 7],\n",
	" [4, 4, 4, ..., 4, 4, 7]])"
	]
	},
	"execution_count": 32,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pred_classes"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Calc accuracy"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 33,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"X_seq_test_len = [len(seq) for seq in X_seq_test]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 34,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# concatenate all sequence together to get a long list of labels\n",
	"# we do not want to consider the starting padding, so we need to get only the actual sentences\n",
	"y_true = [Y_seq_test[i][j].argmax() for i in range(nb_samples_test) for j in range(X_seq_test_len[i])]\n",
	"y_pred = [pred_classes[i,maxlen - X_seq_test_len[i] + j] for i in range(nb_samples_test) for j in range(X_seq_test_len[i])]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 35,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"y true: 26614\n",
	"y pred: 26614\n"
	]
	}
	],
	"source": [
	"print('y true:', len(y_true))\n",
	"print('y pred:', len(y_pred))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 36,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"acc = accuracy_score(y_true, y_pred)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 37,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Accuracy: 0.941121214398\n"
	]
	}
	],
	"source": [
	"print('Accuracy:', acc)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.4.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}
No results found