Created
January 27, 2014 17:38
-
-
Save mickaellegal/8653451 to your computer and use it in GitHub Desktop.
iPython Notebook: 50onRed test
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "metadata": { | |
| "name": "" | |
| }, | |
| "nbformat": 3, | |
| "nbformat_minor": 0, | |
| "worksheets": [ | |
| { | |
| "cells": [ | |
| { | |
| "cell_type": "heading", | |
| "level": 1, | |
| "metadata": {}, | |
| "source": [ | |
| "Importing the libraries" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# Importing the libraries\n", | |
| "import pandas as pd\n", | |
| "import numpy as np" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 55 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "import warnings\n", | |
| "warnings.filterwarnings(\"ignore\")" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 39 | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 1, | |
| "metadata": {}, | |
| "source": [ | |
| "Loading and formatting the data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# Loading the training set \n", | |
| "train_data = pd.read_csv(\"training-data-set.csv\", sep=\" \")\n", | |
| "\n", | |
| "train_data.head(10)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "html": [ | |
| "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>cat1</th>\n", | |
| " <th>cat2</th>\n", | |
| " <th>cat3</th>\n", | |
| " <th>cat4</th>\n", | |
| " <th>cat5</th>\n", | |
| " <th>cat6</th>\n", | |
| " <th>cat7</th>\n", | |
| " <th>num1</th>\n", | |
| " <th>num2</th>\n", | |
| " <th>num3</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td> n</td>\n", | |
| " <td> n</td>\n", | |
| " <td> a</td>\n", | |
| " <td> n</td>\n", | |
| " <td> n</td>\n", | |
| " <td> a</td>\n", | |
| " <td> b</td>\n", | |
| " <td> 1.053900</td>\n", | |
| " <td>-0.062460</td>\n", | |
| " <td> 0.508648</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td> n</td>\n", | |
| " <td> n</td>\n", | |
| " <td> a</td>\n", | |
| " <td> y</td>\n", | |
| " <td> y</td>\n", | |
| " <td> a</td>\n", | |
| " <td> d</td>\n", | |
| " <td>-0.575898</td>\n", | |
| " <td> 1.053315</td>\n", | |
| " <td> 2.100263</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td> y</td>\n", | |
| " <td> n</td>\n", | |
| " <td> a</td>\n", | |
| " <td> n</td>\n", | |
| " <td> n</td>\n", | |
| " <td> a</td>\n", | |
| " <td> a</td>\n", | |
| " <td> 0.392731</td>\n", | |
| " <td>-0.395918</td>\n", | |
| " <td> 1.813869</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td> n</td>\n", | |
| " <td> y</td>\n", | |
| " <td> c</td>\n", | |
| " <td> n</td>\n", | |
| " <td> n</td>\n", | |
| " <td> a</td>\n", | |
| " <td> d</td>\n", | |
| " <td> 1.255048</td>\n", | |
| " <td> 0.812365</td>\n", | |
| " <td> 0.115558</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td> n</td>\n", | |
| " <td> y</td>\n", | |
| " <td> a</td>\n", | |
| " <td> n</td>\n", | |
| " <td> n</td>\n", | |
| " <td> a</td>\n", | |
| " <td> a</td>\n", | |
| " <td>-0.848028</td>\n", | |
| " <td> 1.575932</td>\n", | |
| " <td> 0.407990</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td> n</td>\n", | |
| " <td> n</td>\n", | |
| " <td> a</td>\n", | |
| " <td> n</td>\n", | |
| " <td> y</td>\n", | |
| " <td> a</td>\n", | |
| " <td> c</td>\n", | |
| " <td>-2.000425</td>\n", | |
| " <td> 0.168658</td>\n", | |
| " <td> 1.089865</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td> n</td>\n", | |
| " <td> y</td>\n", | |
| " <td> c</td>\n", | |
| " <td> n</td>\n", | |
| " <td> n</td>\n", | |
| " <td> a</td>\n", | |
| " <td> d</td>\n", | |
| " <td> 1.986990</td>\n", | |
| " <td> 0.100123</td>\n", | |
| " <td>-0.156572</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7</th>\n", | |
| " <td> n</td>\n", | |
| " <td> n</td>\n", | |
| " <td> a</td>\n", | |
| " <td> n</td>\n", | |
| " <td> n</td>\n", | |
| " <td> c</td>\n", | |
| " <td> a</td>\n", | |
| " <td> 0.179694</td>\n", | |
| " <td>-0.207595</td>\n", | |
| " <td> 0.150446</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>8</th>\n", | |
| " <td> n</td>\n", | |
| " <td> y</td>\n", | |
| " <td> b</td>\n", | |
| " <td> n</td>\n", | |
| " <td> y</td>\n", | |
| " <td> a</td>\n", | |
| " <td> a</td>\n", | |
| " <td>-0.287543</td>\n", | |
| " <td> 1.227005</td>\n", | |
| " <td> 1.037588</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>9</th>\n", | |
| " <td> n</td>\n", | |
| " <td> y</td>\n", | |
| " <td> a</td>\n", | |
| " <td> n</td>\n", | |
| " <td> y</td>\n", | |
| " <td> b</td>\n", | |
| " <td> a</td>\n", | |
| " <td> 0.018208</td>\n", | |
| " <td>-0.942384</td>\n", | |
| " <td>-0.494788</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 12, | |
| "text": [ | |
| " cat1 cat2 cat3 cat4 cat5 cat6 cat7 num1 num2 num3\n", | |
| "0 n n a n n a b 1.053900 -0.062460 0.508648\n", | |
| "1 n n a y y a d -0.575898 1.053315 2.100263\n", | |
| "2 y n a n n a a 0.392731 -0.395918 1.813869\n", | |
| "3 n y c n n a d 1.255048 0.812365 0.115558\n", | |
| "4 n y a n n a a -0.848028 1.575932 0.407990\n", | |
| "5 n n a n y a c -2.000425 0.168658 1.089865\n", | |
| "6 n y c n n a d 1.986990 0.100123 -0.156572\n", | |
| "7 n n a n n c a 0.179694 -0.207595 0.150446\n", | |
| "8 n y b n y a a -0.287543 1.227005 1.037588\n", | |
| "9 n y a n y b a 0.018208 -0.942384 -0.494788" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 12 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# Converting the categorical variables into numerical variables\n", | |
| "categorical_values = set()\n", | |
| "for i in train_data['cat1']:\n", | |
| " categorical_values.add(i)\n", | |
| "for j in train_data['cat7']:\n", | |
| " categorical_values.add(j)\n", | |
| " \n", | |
| "print categorical_values " | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "set(['a', 'c', 'b', 'd', 'n', 'y'])\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 30 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# We assign a numerical value to each of the categorical values\n", | |
| "train_data = train_data.replace(['a','b','c','d','n','y'], [1,2,3,4,5,6])\n", | |
| "\n", | |
| "train_data.head(10)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "html": [ | |
| "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>cat1</th>\n", | |
| " <th>cat2</th>\n", | |
| " <th>cat3</th>\n", | |
| " <th>cat4</th>\n", | |
| " <th>cat5</th>\n", | |
| " <th>cat6</th>\n", | |
| " <th>cat7</th>\n", | |
| " <th>num1</th>\n", | |
| " <th>num2</th>\n", | |
| " <th>num3</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 2</td>\n", | |
| " <td> 1.053900</td>\n", | |
| " <td>-0.062460</td>\n", | |
| " <td> 0.508648</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 4</td>\n", | |
| " <td>-0.575898</td>\n", | |
| " <td> 1.053315</td>\n", | |
| " <td> 2.100263</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td> 6</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0.392731</td>\n", | |
| " <td>-0.395918</td>\n", | |
| " <td> 1.813869</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td> 5</td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 3</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 4</td>\n", | |
| " <td> 1.255048</td>\n", | |
| " <td> 0.812365</td>\n", | |
| " <td> 0.115558</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td> 5</td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 1</td>\n", | |
| " <td>-0.848028</td>\n", | |
| " <td> 1.575932</td>\n", | |
| " <td> 0.407990</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 3</td>\n", | |
| " <td>-2.000425</td>\n", | |
| " <td> 0.168658</td>\n", | |
| " <td> 1.089865</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td> 5</td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 3</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 4</td>\n", | |
| " <td> 1.986990</td>\n", | |
| " <td> 0.100123</td>\n", | |
| " <td>-0.156572</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7</th>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 3</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0.179694</td>\n", | |
| " <td>-0.207595</td>\n", | |
| " <td> 0.150446</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>8</th>\n", | |
| " <td> 5</td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 2</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 1</td>\n", | |
| " <td>-0.287543</td>\n", | |
| " <td> 1.227005</td>\n", | |
| " <td> 1.037588</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>9</th>\n", | |
| " <td> 5</td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 2</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0.018208</td>\n", | |
| " <td>-0.942384</td>\n", | |
| " <td>-0.494788</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 31, | |
| "text": [ | |
| " cat1 cat2 cat3 cat4 cat5 cat6 cat7 num1 num2 num3\n", | |
| "0 5 5 1 5 5 1 2 1.053900 -0.062460 0.508648\n", | |
| "1 5 5 1 6 6 1 4 -0.575898 1.053315 2.100263\n", | |
| "2 6 5 1 5 5 1 1 0.392731 -0.395918 1.813869\n", | |
| "3 5 6 3 5 5 1 4 1.255048 0.812365 0.115558\n", | |
| "4 5 6 1 5 5 1 1 -0.848028 1.575932 0.407990\n", | |
| "5 5 5 1 5 6 1 3 -2.000425 0.168658 1.089865\n", | |
| "6 5 6 3 5 5 1 4 1.986990 0.100123 -0.156572\n", | |
| "7 5 5 1 5 5 3 1 0.179694 -0.207595 0.150446\n", | |
| "8 5 6 2 5 6 1 1 -0.287543 1.227005 1.037588\n", | |
| "9 5 6 1 5 6 2 1 0.018208 -0.942384 -0.494788" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 31 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# Loading the training labels\n", | |
| "\n", | |
| "train_labels = pd.read_csv(\"training-data-labels.csv\")\n", | |
| "\n", | |
| "train_labels.head(5)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "html": [ | |
| "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>label</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td> 0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td> 1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td> 1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td> 0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td> 1</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 7, | |
| "text": [ | |
| " label\n", | |
| "0 0\n", | |
| "1 1\n", | |
| "2 1\n", | |
| "3 0\n", | |
| "4 1" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 7 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# Loading the test set\n", | |
| "\n", | |
| "test_data = pd.read_csv(\"test-data-set.csv\", sep=\" \")\n", | |
| "\n", | |
| "test_data.head(5)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "html": [ | |
| "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>cat1</th>\n", | |
| " <th>cat2</th>\n", | |
| " <th>cat3</th>\n", | |
| " <th>cat4</th>\n", | |
| " <th>cat5</th>\n", | |
| " <th>cat6</th>\n", | |
| " <th>cat7</th>\n", | |
| " <th>num1</th>\n", | |
| " <th>num2</th>\n", | |
| " <th>num3</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td> n</td>\n", | |
| " <td> n</td>\n", | |
| " <td> a</td>\n", | |
| " <td> n</td>\n", | |
| " <td> n</td>\n", | |
| " <td> a</td>\n", | |
| " <td> a</td>\n", | |
| " <td> 0.171982</td>\n", | |
| " <td>-0.022455</td>\n", | |
| " <td> 0.668533</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td> n</td>\n", | |
| " <td> y</td>\n", | |
| " <td> a</td>\n", | |
| " <td> n</td>\n", | |
| " <td> n</td>\n", | |
| " <td> a</td>\n", | |
| " <td> a</td>\n", | |
| " <td> 0.301511</td>\n", | |
| " <td> 0.119037</td>\n", | |
| " <td>-0.292068</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td> n</td>\n", | |
| " <td> n</td>\n", | |
| " <td> a</td>\n", | |
| " <td> y</td>\n", | |
| " <td> y</td>\n", | |
| " <td> a</td>\n", | |
| " <td> a</td>\n", | |
| " <td>-0.441025</td>\n", | |
| " <td> 1.052455</td>\n", | |
| " <td> 0.820292</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td> n</td>\n", | |
| " <td> n</td>\n", | |
| " <td> a</td>\n", | |
| " <td> n</td>\n", | |
| " <td> n</td>\n", | |
| " <td> c</td>\n", | |
| " <td> a</td>\n", | |
| " <td> 0.421350</td>\n", | |
| " <td> 0.223962</td>\n", | |
| " <td>-0.187951</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td> n</td>\n", | |
| " <td> n</td>\n", | |
| " <td> a</td>\n", | |
| " <td> y</td>\n", | |
| " <td> y</td>\n", | |
| " <td> a</td>\n", | |
| " <td> a</td>\n", | |
| " <td>-0.390083</td>\n", | |
| " <td> 0.556335</td>\n", | |
| " <td>-1.434217</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 8, | |
| "text": [ | |
| " cat1 cat2 cat3 cat4 cat5 cat6 cat7 num1 num2 num3\n", | |
| "0 n n a n n a a 0.171982 -0.022455 0.668533\n", | |
| "1 n y a n n a a 0.301511 0.119037 -0.292068\n", | |
| "2 n n a y y a a -0.441025 1.052455 0.820292\n", | |
| "3 n n a n n c a 0.421350 0.223962 -0.187951\n", | |
| "4 n n a y y a a -0.390083 0.556335 -1.434217" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 8 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# We assign a numerical value to each of the categorical values\n", | |
| "test_data = test_data.replace(['a','b','c','d','n','y'], [1,2,3,4,5,6])\n", | |
| "\n", | |
| "test_data.head(10)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "html": [ | |
| "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>cat1</th>\n", | |
| " <th>cat2</th>\n", | |
| " <th>cat3</th>\n", | |
| " <th>cat4</th>\n", | |
| " <th>cat5</th>\n", | |
| " <th>cat6</th>\n", | |
| " <th>cat7</th>\n", | |
| " <th>num1</th>\n", | |
| " <th>num2</th>\n", | |
| " <th>num3</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0.171982</td>\n", | |
| " <td>-0.022455</td>\n", | |
| " <td> 0.668533</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td> 5</td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0.301511</td>\n", | |
| " <td> 0.119037</td>\n", | |
| " <td>-0.292068</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 1</td>\n", | |
| " <td>-0.441025</td>\n", | |
| " <td> 1.052455</td>\n", | |
| " <td> 0.820292</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 3</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0.421350</td>\n", | |
| " <td> 0.223962</td>\n", | |
| " <td>-0.187951</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 1</td>\n", | |
| " <td>-0.390083</td>\n", | |
| " <td> 0.556335</td>\n", | |
| " <td>-1.434217</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 2</td>\n", | |
| " <td> 1</td>\n", | |
| " <td>-0.207254</td>\n", | |
| " <td> 0.405312</td>\n", | |
| " <td> 0.185214</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td> 5</td>\n", | |
| " <td> 6</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0.320936</td>\n", | |
| " <td> 1.232641</td>\n", | |
| " <td>-0.661283</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7</th>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 1</td>\n", | |
| " <td>-0.718697</td>\n", | |
| " <td> 0.905296</td>\n", | |
| " <td> 0.838255</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>8</th>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 2</td>\n", | |
| " <td> 0.391449</td>\n", | |
| " <td> 0.013134</td>\n", | |
| " <td> 0.559273</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>9</th>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 3</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 5</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 2</td>\n", | |
| " <td> 1.173640</td>\n", | |
| " <td> 0.860782</td>\n", | |
| " <td>-1.237148</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 74, | |
| "text": [ | |
| " cat1 cat2 cat3 cat4 cat5 cat6 cat7 num1 num2 num3\n", | |
| "0 5 5 1 5 5 1 1 0.171982 -0.022455 0.668533\n", | |
| "1 5 6 1 5 5 1 1 0.301511 0.119037 -0.292068\n", | |
| "2 5 5 1 6 6 1 1 -0.441025 1.052455 0.820292\n", | |
| "3 5 5 1 5 5 3 1 0.421350 0.223962 -0.187951\n", | |
| "4 5 5 1 6 6 1 1 -0.390083 0.556335 -1.434217\n", | |
| "5 5 5 1 6 6 2 1 -0.207254 0.405312 0.185214\n", | |
| "6 5 6 1 5 5 1 1 0.320936 1.232641 -0.661283\n", | |
| "7 5 5 1 5 5 1 1 -0.718697 0.905296 0.838255\n", | |
| "8 5 5 1 5 5 1 2 0.391449 0.013134 0.559273\n", | |
| "9 5 5 3 5 5 1 2 1.173640 0.860782 -1.237148" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 74 | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 1, | |
| "metadata": {}, | |
| "source": [ | |
| "Testing different classification models" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# Testing different classifiers from the scikit-learn libraries\n", | |
| "# Importing the different libraries\n", | |
| "\n", | |
| "from sklearn.ensemble import RandomForestClassifier\n", | |
| "from sklearn.neighbors import KNeighborsClassifier\n", | |
| "from sklearn.svm import SVC\n", | |
| "from sklearn.metrics import roc_auc_score" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 57 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# I split the training set into a sub-training set (75%) and a test set (25%)\n", | |
| "\n", | |
| "from sklearn.cross_validation import train_test_split\n", | |
| "x_train, x_test, y_train, y_test = train_test_split(train_data, train_labels, test_size=0.25)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 33 | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 2, | |
| "metadata": {}, | |
| "source": [ | |
| "1 - SVM Model" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# create and train a classifier\n", | |
| "SVM = SVC(gamma=0.001)\n", | |
| "\n", | |
| "# Fit the model\n", | |
| "SVM.fit(x_train, y_train)\n", | |
| "\n", | |
| "# Return the accuracy of the model \n", | |
| "accuracy = SVM.score(x_test, y_test)\n", | |
| "print \"The accuracy score for the SVM model is:\" \n", | |
| "print accuracy \n", | |
| "\n", | |
| "# Get the prediction\n", | |
| "preds = SVM.predict(x_test)\n", | |
| "\n", | |
| "# Return the ROC AUC score\n", | |
| "print \"The Area Under the Curve is:\" \n", | |
| "roc_auc_score(y_test, preds)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "The accuracy score for the SVM model is:\n", | |
| "0.6232\n", | |
| "The Area Under the Curve is:" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n" | |
| ] | |
| }, | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 67, | |
| "text": [ | |
| "0.62780636827753056" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 67 | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 2, | |
| "metadata": {}, | |
| "source": [ | |
| "2 - Random Forest Model" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# create and train a classifier\n", | |
| "RandomForest = RandomForestClassifier()\n", | |
| "\n", | |
| "# Fit the model\n", | |
| "RandomForest.fit(x_train, y_train)\n", | |
| "\n", | |
| "# Return the accuracy of the model\n", | |
| "accuracy = RandomForest.score(x_test, y_test)\n", | |
| "print \"The accuracy of the Random Forest Model is:\"\n", | |
| "print accuracy\n", | |
| "\n", | |
| "# Get the predictions\n", | |
| "preds = RandomForest.predict(x_test)\n", | |
| "\n", | |
| "# Return the ROC AUC score\n", | |
| "print \"The Area Under the Curve is:\" \n", | |
| "roc_auc_score(y_test, preds)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "The accuracy of the Random Forest Model is:\n", | |
| "0.7992\n", | |
| "The Area Under the Curve is:\n" | |
| ] | |
| }, | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 72, | |
| "text": [ | |
| "0.80056076107682528" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 72 | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 2, | |
| "metadata": {}, | |
| "source": [ | |
| "3 - K-Nearest Neighbor Model" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# create and train a classifier\n", | |
| "NearestNeighbor = KNeighborsClassifier()\n", | |
| "\n", | |
| "# Fit the model\n", | |
| "NearestNeighbor.fit(x_train, y_train)\n", | |
| "\n", | |
| "# Return the accuracy of the model \n", | |
| "accuracy = NearestNeighbor.score(x_test, y_test)\n", | |
| "print \"The accuracy of the K-Nearest Neighbor Model is:\"\n", | |
| "print accuracy\n", | |
| "\n", | |
| "# Get the predictions\n", | |
| "preds = NearestNeighbor.predict(x_test)\n", | |
| "\n", | |
| "# Return the ROC AUC score\n", | |
| "print \"The Area Under the Curve is:\" \n", | |
| "roc_auc_score(y_test, preds)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "The accuracy of the K-Nearest Neighbor Model is:\n", | |
| "0.7792\n", | |
| "The Area Under the Curve is:\n" | |
| ] | |
| }, | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 71, | |
| "text": [ | |
| "0.77948865150800661" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 71 | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 1, | |
| "metadata": {}, | |
| "source": [ | |
| "Making predictions on the test set" | |
| ] | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 3, | |
| "metadata": {}, | |
| "source": [ | |
| "The Random Forest model is the one providing the highest prediction accuracy. \n", | |
| "I will therefore use this model to make the predictions on the test set. " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# Predictions made on the test set\n", | |
| "final_preds = RandomForest.predict(test_data)\n", | |
| "\n", | |
| "# Dumping the results into a text file\n", | |
| "np.savetxt('predictions_test_set.txt', final_preds, fmt='%i')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 84 | |
| } | |
| ], | |
| "metadata": {} | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment