{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# <font color='black'>**Kickstarter Prediction Challenge - Feature Engineering Competition**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## This is my approach for a feature engineering competition at HackerEarth https://www.hackerearth.com/problem/machine-learning/funding-successful-projects/. This got us 23rd place on the final leaderboard among ~500 particpants. The dataset is available for download at the above link."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The train data has 108129 rows and 14 columns\n",
"The test data has 63465 rows and 12 columns\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import numpy as np\n",
"from sklearn import svm\n",
"import xgboost as xgb\n",
"from xgboost import XGBClassifier\n",
"from currency_converter import CurrencyConverter\n",
"from sklearn import metrics\n",
"import time\n",
"from sklearn.cross_validation import train_test_split\n",
"import xgboost\n",
"from xgboost import XGBClassifier\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"from nltk.corpus import stopwords\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from nltk.stem.snowball import SnowballStemmer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"c=CurrencyConverter()\n",
"\n",
"\n",
"train = pd.read_csv(\"train.csv\")\n",
"test = pd.read_csv(\"test.csv\")\n",
"\n",
"print ('The train data has {} rows and {} columns'.format(train.shape[0],train.shape[1]))\n",
"print ('The test data has {} rows and {} columns'.format(test.shape[0],test.shape[1]))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Let us take a look at the features available to engineer further "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>project_id</th>\n",
" <th>name</th>\n",
" <th>desc</th>\n",
" <th>goal</th>\n",
" <th>keywords</th>\n",
" <th>disable_communication</th>\n",
" <th>country</th>\n",
" <th>currency</th>\n",
" <th>deadline</th>\n",
" <th>state_changed_at</th>\n",
" <th>created_at</th>\n",
" <th>launched_at</th>\n",
" <th>backers_count</th>\n",
" <th>final_status</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>kkst1451568084</td>\n",
" <td>drawing for dollars</td>\n",
" <td>I like drawing pictures. and then i color them...</td>\n",
" <td>20.0</td>\n",
" <td>drawing-for-dollars</td>\n",
" <td>False</td>\n",
" <td>US</td>\n",
" <td>USD</td>\n",
" <td>1241333999</td>\n",
" <td>1241334017</td>\n",
" <td>1240600507</td>\n",
" <td>1240602723</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>kkst1474482071</td>\n",
" <td>Sponsor Dereck Blackburn (Lostwars) Artist in ...</td>\n",
" <td>I, Dereck Blackburn will be taking upon an inc...</td>\n",
" <td>300.0</td>\n",
" <td>sponsor-dereck-blackburn-lostwars-artist-in-re...</td>\n",
" <td>False</td>\n",
" <td>US</td>\n",
" <td>USD</td>\n",
" <td>1242429000</td>\n",
" <td>1242432018</td>\n",
" <td>1240960224</td>\n",
" <td>1240975592</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>kkst183622197</td>\n",
" <td>Mr. Squiggles</td>\n",
" <td>So I saw darkpony's successfully funded drawin...</td>\n",
" <td>30.0</td>\n",
" <td>mr-squiggles</td>\n",
" <td>False</td>\n",
" <td>US</td>\n",
" <td>USD</td>\n",
" <td>1243027560</td>\n",
" <td>1243027818</td>\n",
" <td>1242163613</td>\n",
" <td>1242164398</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>kkst597742710</td>\n",
" <td>Help me write my second novel.</td>\n",
" <td>Do your part to help out starving artists and ...</td>\n",
" <td>500.0</td>\n",
" <td>help-me-write-my-second-novel</td>\n",
" <td>False</td>\n",
" <td>US</td>\n",
" <td>USD</td>\n",
" <td>1243555740</td>\n",
" <td>1243556121</td>\n",
" <td>1240963795</td>\n",
" <td>1240966730</td>\n",
" <td>18</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>kkst1913131122</td>\n",
" <td>Support casting my sculpture in bronze</td>\n",
" <td>I'm nearing completion on a sculpture, current...</td>\n",
" <td>2000.0</td>\n",
" <td>support-casting-my-sculpture-in-bronze</td>\n",
" <td>False</td>\n",
" <td>US</td>\n",
" <td>USD</td>\n",
" <td>1243769880</td>\n",
" <td>1243770317</td>\n",
" <td>1241177914</td>\n",
" <td>1241180541</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" project_id name \\\n",
"0 kkst1451568084 drawing for dollars \n",
"1 kkst1474482071 Sponsor Dereck Blackburn (Lostwars) Artist in ... \n",
"2 kkst183622197 Mr. Squiggles \n",
"3 kkst597742710 Help me write my second novel. \n",
"4 kkst1913131122 Support casting my sculpture in bronze \n",
"\n",
" desc goal \\\n",
"0 I like drawing pictures. and then i color them... 20.0 \n",
"1 I, Dereck Blackburn will be taking upon an inc... 300.0 \n",
"2 So I saw darkpony's successfully funded drawin... 30.0 \n",
"3 Do your part to help out starving artists and ... 500.0 \n",
"4 I'm nearing completion on a sculpture, current... 2000.0 \n",
"\n",
" keywords disable_communication \\\n",
"0 drawing-for-dollars False \n",
"1 sponsor-dereck-blackburn-lostwars-artist-in-re... False \n",
"2 mr-squiggles False \n",
"3 help-me-write-my-second-novel False \n",
"4 support-casting-my-sculpture-in-bronze False \n",
"\n",
" country currency deadline state_changed_at created_at launched_at \\\n",
"0 US USD 1241333999 1241334017 1240600507 1240602723 \n",
"1 US USD 1242429000 1242432018 1240960224 1240975592 \n",
"2 US USD 1243027560 1243027818 1242163613 1242164398 \n",
"3 US USD 1243555740 1243556121 1240963795 1240966730 \n",
"4 US USD 1243769880 1243770317 1241177914 1241180541 \n",
"\n",
" backers_count final_status \n",
"0 3 1 \n",
"1 2 0 \n",
"2 0 0 \n",
"3 18 1 \n",
"4 1 0 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## The dataset contains fields like - Name of the project, Description, The goal amount etc as seen above. We will extract text features as well as normalized time features from the given variables. First we extract time features like duration of project, time project changed after launched."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train['time_project']=train['deadline']-train['launched_at']\n",
"train['time_deadline_change']=train['state_changed_at']-train['deadline']\n",
"train['time_launched_change']=train['state_changed_at']-train['launched_at']\n",
"train['time_launched_change']=train['time_launched_change']/1e4\n",
"train['time_created_change']=train['state_changed_at']-train['created_at']\n",
"train['time_created_change']=train['time_created_change']/1e4\n",
"train['time_project']=train['time_project']/1e4\n",
"train['cldiff']=train['created_at']-train['launched_at']\n",
"train['cldiff']=train['cldiff']/1e3\n",
"train['time_deadline_change_log']=train['time_deadline_change'].apply(lambda c: np.log(c))\n",
"train['time_launched_change_log']=train['time_launched_change'].apply(lambda c: np.log(c))\n",
"train['time_created_change_log']=train['time_created_change'].apply(lambda c: np.log(c))\n",
"train['time_project_log']=train['time_project'].apply(lambda c: np.log(c))\n",
"train['year'] = train['deadline'].apply(lambda k: int(time.ctime(k)[-4:])-2009)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"test['year'] = test['deadline'].apply(lambda k: int(time.ctime(k)[-4:])-2009)\n",
"\n",
"test['time_project']=test['deadline']-test['launched_at']\n",
"test['time_deadline_change']=test['state_changed_at']-test['deadline']\n",
"test['time_launched_change']=test['state_changed_at']-test['launched_at']\n",
"test['time_launched_change']=test['time_launched_change']/1e4\n",
"test['time_project']=test['time_project']/1e4\n",
"test['time_created_change']=test['state_changed_at']-test['created_at']\n",
"test['time_created_change']=test['time_created_change']/1e4\n",
"test['cldiff']=test['created_at']-test['launched_at']\n",
"test['time_deadline_change_log']=test['time_deadline_change'].apply(lambda c: np.log(c))\n",
"test['time_launched_change_log']=test['time_launched_change'].apply(lambda c: np.log(c))\n",
"test['time_created_change_log']=test['time_created_change'].apply(lambda c: np.log(c))\n",
"test['time_project_log']=test['time_project'].apply(lambda c: np.log(c))\n",
"test['cldiff']=test['cldiff']/1e3"
]
},
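{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Since the train and test cells above are identical, the same logic could be wrapped in a helper applied to both frames. The cell below is an equivalent refactoring sketch, not a new computation."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# equivalent refactor of the two cells above; the columns match those already\n",
"# created, so running it again is redundant but harmless\n",
"def add_time_features(df):\n",
" df['time_project'] = (df['deadline'] - df['launched_at']) / 1e4\n",
" df['time_deadline_change'] = df['state_changed_at'] - df['deadline']\n",
" df['time_launched_change'] = (df['state_changed_at'] - df['launched_at']) / 1e4\n",
" df['time_created_change'] = (df['state_changed_at'] - df['created_at']) / 1e4\n",
" df['cldiff'] = (df['created_at'] - df['launched_at']) / 1e3\n",
" for col in ['time_deadline_change', 'time_launched_change',\n",
" 'time_created_change', 'time_project']:\n",
" df[col + '_log'] = df[col].apply(np.log)\n",
" df['year'] = df['deadline'].apply(lambda k: int(time.ctime(k)[-4:]) - 2009)\n",
"\n",
"for df in (train, test):\n",
" add_time_features(df)"
]
},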
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train['month'] = train['deadline'].apply(lambda k: time.ctime(k).split()[1])\n",
"test['month'] = test['deadline'].apply(lambda k: time.ctime(k).split()[1])\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## We now compute features like goal per unit description length, normalized project goal amount etc. "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"cols_to_use = ['name','desc']\n",
"len_feats = ['name_len','desc_len']\n",
"count_feats = ['name_count','desc_count']\n",
"\n",
"for i in np.arange(2):\n",
" train[len_feats[i]] = train[cols_to_use[i]].apply(str).apply(len)\n",
" train[count_feats[i]] = train[cols_to_use[i]].apply(str).apply(lambda x: len(x.split(' ')))\n",
"\n",
"train['keywords_len'] = train['keywords'].apply(str).apply(len)\n",
"train['keywords_count'] = train['keywords'].apply(str).apply(lambda x: len(x.split('-')))\n",
"\n",
"for i in np.arange(2):\n",
" test[len_feats[i]] = test[cols_to_use[i]].apply(str).apply(len)\n",
" test[count_feats[i]] = test[cols_to_use[i]].apply(str).apply(lambda x: len(x.split(' ')))\n",
" \n",
"test['keywords_len'] = test['keywords'].apply(str).apply(len)\n",
"test['keywords_count'] = test['keywords'].apply(str).apply(lambda x: len(x.split('-')))\n",
"\n",
"\n",
"# In[921]:\n",
"\n",
"train['keywords_count_norm']=train['keywords_count']-train['keywords_count'].median()\n",
"\n",
"train['name_count_norm']=train['name_count']-train['name_count'].median()\n",
"\n",
"train['desc_count_norm']=train['desc_count']-train['desc_count'].median()\n",
"\n",
"test['keywords_count_norm']=test['keywords_count']-test['keywords_count'].median()\n",
"\n",
"test['name_count_norm']=test['name_count']-test['name_count'].median()\n",
"\n",
"test['desc_count_norm']=test['desc_count']-test['desc_count'].median()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# In[922]:\n",
"\n",
"train['title_word_len']=train[['name_len','name_count']].apply(lambda k: (k[0]/(k[1]+1)),axis=1)\n",
"\n",
"test['title_word_len']=test[['name_len','name_count']].apply(lambda k: (k[0]/(k[1]+1)),axis=1)\n",
"\n",
"\n",
"# In[923]:\n",
"\n",
"train['keyword_word_len']=train[['keywords_len','keywords_count']].apply(lambda k: (k[0]/(k[1]+1)),axis=1)\n",
"\n",
"test['keyword_word_len']=test[['keywords_len','keywords_count']].apply(lambda k: (k[0]/(k[1]+1)),axis=1)\n",
"\n",
"\n",
"# In[924]:\n",
"\n",
"train['desc_word_len']=train[['desc_len','desc_count']].apply(lambda k: (k[0]/(k[1]+1)),axis=1)\n",
"\n",
"test['desc_word_len']=test[['desc_len','desc_count']].apply(lambda k: (k[0]/(k[1]+1)),axis=1)\n",
"\n",
"\n",
"# In[925]:\n",
"\n",
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"feat = ['disable_communication','country','currency']\n",
"for x in feat:\n",
" \n",
" le = LabelEncoder()\n",
" le.fit(list(train[x].values) + list(test[x].values))\n",
" train[x] = le.transform(list(train[x]))\n",
" test[x] = le.transform(list(test[x].values))\n",
" \n",
"\n",
"\n",
"# In[926]:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train['goal_unnormalized'] = train[['goal','currency']].apply(lambda x: c.convert(x[0],le.inverse_transform(int(x[1])),'USD'),axis=1)\n",
"test['goal_unnormalized'] = test[['goal','currency']].apply(lambda x: c.convert(x[0],le.inverse_transform(int(x[1])),'USD'),axis=1)\n",
"\n",
"train['goal_norm']=train['goal_unnormalized'] -train['goal_unnormalized'].median()\n",
"test['goal_norm']=test['goal_unnormalized'] -test['goal_unnormalized'].median()\n",
"\n",
"train['goal_per_day']=train[['goal','time_project']].apply(lambda x: (x[0]/(x[1]+1))*1e1,axis=1)\n",
"\n",
"test['goal_per_day']=test[['goal','time_project']].apply(lambda x: (x[0]/(x[1]+1))*1e1,axis=1)\n",
"\n",
"\n",
"# In[959]:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train['goalper_descriptionlength']=train[['goal','desc_count']].apply(lambda x: (x[0]/(x[1]+2)),axis=1)\n",
"\n",
"test['goalper_descriptionlength']=test[['goal','desc_count']].apply(lambda x: (x[0]/(x[1]+2)),axis=1)\n",
"\n",
"\n",
"# In[928]:"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train['state_change_norm']=train[['time_deadline_change','time_project']].apply(lambda x: (x[0]/(x[1]+1))*1e4,axis=1)\n",
"test['state_change_norm']=test[['time_deadline_change','time_project']].apply(lambda x: (x[0]/(x[1]+1))*1e4,axis=1)\n",
"\n",
"\n",
"train['is_12_currency']=train['currency'].apply(lambda x: int(x==12))\n",
"test['is_12_currency']=test['currency'].apply(lambda x: int(x==12))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## We extract text features after cleaning out the stopwords from the data. We use normalized count features of the description length. "
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import re\n",
"\n",
"# creating a full list of descriptions from train and etst\n",
"kickdesc = pd.Series(train['desc'].tolist() + test['desc'].tolist()).astype(str)\n",
"\n",
"# this function cleans punctuations, digits and irregular tabs. Then converts the sentences to lower\n",
"def desc_clean(word):\n",
" p1 = re.sub(pattern='(\\W+)|(\\d+)|(\\s+)',repl=' ',string=word)\n",
" p1 = p1.lower()\n",
" return p1\n",
"\n",
"kickdesc = kickdesc.map(desc_clean)\n",
"\n",
"stop = set(stopwords.words('english'))\n",
"kickdesc = [[x for x in x.split() if x not in stop] for x in kickdesc]\n",
"\n",
"stemmer = SnowballStemmer(language='english')\n",
"kickdesc = [[stemmer.stem(x) for x in x] for x in kickdesc]\n",
"\n",
"kickdesc = [[x for x in x if len(x) > 2] for x in kickdesc]\n",
"\n",
"kickdesc = [' '.join(x) for x in kickdesc]"
]
},
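{
"cell_type": "markdown",
"metadata": {},
"source": [
"## As a quick sanity check (not part of the original pipeline), here is what the cleaning steps do to one made-up description."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# illustrative only: push a made-up description through the same steps\n",
"sample = \"I'm making 100 hand-drawn postcards for my art project!\"\n",
"cleaned = desc_clean(sample)\n",
"tokens = [w for w in cleaned.split() if w not in stop]\n",
"tokens = [stemmer.stem(w) for w in tokens]\n",
"tokens = [w for w in tokens if len(w) > 2]\n",
"print(' '.join(tokens)) # roughly: 'make hand drawn postcard art project'"
]
},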
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"cv = TfidfVectorizer(max_features=375)\n",
"\n",
"alldesc = cv.fit_transform(kickdesc).todense()\n",
"\n",
"#create a data frame\n",
"combine = pd.DataFrame(alldesc)\n",
"combine.rename(columns= lambda x: 'descvariable_'+ str(x), inplace=True)\n",
"\n",
"#split the text features\n",
"\n",
"train_text = combine[:train.shape[0]]\n",
"test_text = combine[train.shape[0]:]\n",
"\n",
"test_text.reset_index(drop=True,inplace=True)\n",
"\n",
"\n",
"#test_text.reset_index(drop=True,inplace=True)"
]
},
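{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Optionally, we can peek at a few of the 375 terms the vectorizer kept (a sanity check, not part of the original run)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# vocabulary_ maps each kept term to its column index; print the first 20 by index\n",
"print(sorted(cv.vocabulary_, key=cv.vocabulary_.get)[:20])"
]
},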
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## We select the subset of features needed for training data "
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"cols_to_use = ['name_count_norm'\n",
" ,'desc_count_norm'\n",
" ,'time_project'\n",
" ,'time_deadline_change'\n",
" ,'time_launched_change'\n",
" ,'disable_communication'\n",
" ,'goal_per_day'\n",
" ,'is_12_currency'\n",
" ,'cldiff'\n",
" ,'title_word_len'\n",
" ,'keyword_word_len'\n",
" ,'desc_word_len',\n",
" 'goalper_descriptionlength'\n",
" ]\n",
"\n",
"target = train['final_status']\n",
"\n",
"# data for modeling\n",
"k_train = train[cols_to_use]\n",
"k_test=test[cols_to_use]\n",
"\n",
"\n",
"# In[943]:"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(63465, 388)\n"
]
}
],
"source": [
"X_train = pd.concat([k_train,train_text],axis=1)\n",
"X_test=pd.concat([k_test,test_text],axis=1)\n",
"print(X_test.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training Data and Validation Data split "
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"X_tr,X_val,y_tr,y_val=train_test_split(X_train,target,test_size=0.2,random_state=1)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"\n",
"\n",
"dTrain = xgb.DMatrix(X_train,target)\n",
"#dVal= xgb.DMatrix(X_val,y_val)\n",
"dTest = xgb.DMatrix(X_test)\n",
"\n",
"xgb_params = {\n",
" 'objective': 'binary:logistic',\n",
" 'booster': 'gbtree',\n",
" 'eval_metric': 'logloss',\n",
" 'eta': 0.025, \n",
" 'max_depth': 7,\n",
" 'lambda': 2,\n",
" 'alpha': 0.02,\n",
" 'subsample': 0.8,\n",
" 'colsample_bytree': 0.8,\n",
" 'min_child_weight': 1,\n",
" 'silent': 1}\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Let us train a XGBoost Model using our features "
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0]\ttrain-logloss:0.68771\n",
"Will train until train-logloss hasn't improved in 40 rounds.\n",
"[50]\ttrain-logloss:0.574593\n",
"[100]\ttrain-logloss:0.548285\n",
"[150]\ttrain-logloss:0.53696\n",
"[200]\ttrain-logloss:0.52961\n",
"[250]\ttrain-logloss:0.523443\n",
"[300]\ttrain-logloss:0.518137\n",
"[350]\ttrain-logloss:0.513683\n",
"[400]\ttrain-logloss:0.509385\n",
"[450]\ttrain-logloss:0.505515\n",
"[500]\ttrain-logloss:0.502177\n",
"[550]\ttrain-logloss:0.498884\n",
"[600]\ttrain-logloss:0.495757\n",
"[650]\ttrain-logloss:0.492827\n",
"[700]\ttrain-logloss:0.489998\n",
"[750]\ttrain-logloss:0.487428\n",
"[800]\ttrain-logloss:0.484896\n",
"[850]\ttrain-logloss:0.482488\n",
"[900]\ttrain-logloss:0.480101\n",
"[950]\ttrain-logloss:0.477878\n",
"[1000]\ttrain-logloss:0.475649\n",
"[1050]\ttrain-logloss:0.47348\n",
"[1100]\ttrain-logloss:0.471171\n",
"[1150]\ttrain-logloss:0.469225\n",
"[1200]\ttrain-logloss:0.46729\n",
"[1250]\ttrain-logloss:0.465523\n"
]
}
],
"source": [
"bst_train=xgb.train(xgb_params, dTrain, 1300, [(dTrain,'train')],\n",
" verbose_eval=50, early_stopping_rounds=40)"
]
},
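{
"cell_type": "markdown",
"metadata": {},
"source": [
"## A variant worth noting (a sketch, not the run that produced the submission): early-stop against the held-out validation split, so the number of rounds is chosen on unseen data instead of being fixed at 1300."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# sketch: train on the 80% split and let the 20% validation split drive early stopping\n",
"dTr = xgb.DMatrix(X_tr, y_tr)\n",
"dVal = xgb.DMatrix(X_val, y_val)\n",
"bst_val = xgb.train(xgb_params, dTr, 3000, [(dTr, 'train'), (dVal, 'valid')],\n",
" verbose_eval=100, early_stopping_rounds=40)\n",
"print('best iteration:', bst_val.best_iteration)"
]
},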
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"pred = bst_train.predict(dTest, ntree_limit=bst_train.best_ntree_limit)"
]
},
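{
"cell_type": "markdown",
"metadata": {},
"source": [
"## The 0.4 cutoff used below was hand-tuned. One way to justify it (a sketch using the validation-trained model from the aside above, not part of the original submission) is to sweep thresholds on the validation split and keep the most accurate one."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# hypothetical threshold sweep on the held-out validation split\n",
"val_pred = bst_val.predict(xgb.DMatrix(X_val))\n",
"best_t, best_acc = 0.5, 0.0\n",
"for t in np.arange(0.30, 0.61, 0.01):\n",
" acc = metrics.accuracy_score(y_val, (val_pred > t).astype(int))\n",
" if acc > best_acc:\n",
" best_t, best_acc = t, acc\n",
"print('best threshold: {:.2f}, validation accuracy: {:.4f}'.format(best_t, best_acc))"
]
},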
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## We now write our predictions to a file and submit "
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# In[970]:\n",
"\n",
"### write the file\n",
"result=[]\n",
"for idx,k in enumerate(pred.tolist()):\n",
" if k>0.4:\n",
" result.append(1)\n",
" else:\n",
" result.append(0)\n",
"nBsub = pd.DataFrame({'project_id':test['project_id'],'final_status':result})\n",
"nBsub = nBsub[['project_id','final_status']]\n",
"nBsub.to_csv(\"nBstarter.csv\",index = False) #0.6526"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}