{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# <font color='black'>**Kickstarter Prediction Challenge - Feature Engineering Competition**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## This is my approach for a feature engineering competition at HackerEarth https://www.hackerearth.com/problem/machine-learning/funding-successful-projects/. This got us 23rd place on the final leaderboard among ~500 particpants. The dataset is available for download at the above link."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The train data has 108129 rows and 14 columns\n",
"The test data has 63465 rows and 12 columns\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import numpy as np\n",
"from sklearn import svm\n",
"import xgboost as xgb\n",
"from xgboost import XGBClassifier\n",
"from currency_converter import CurrencyConverter\n",
"from sklearn import metrics\n",
"import time\n",
"from sklearn.cross_validation import train_test_split\n",
"import xgboost\n",
"from xgboost import XGBClassifier\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"from nltk.corpus import stopwords\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from nltk.stem.snowball import SnowballStemmer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"c=CurrencyConverter()\n",
"\n",
"\n",
"train = pd.read_csv(\"train.csv\")\n",
"test = pd.read_csv(\"test.csv\")\n",
"\n",
"print ('The train data has {} rows and {} columns'.format(train.shape[0],train.shape[1]))\n",
"print ('The test data has {} rows and {} columns'.format(test.shape[0],test.shape[1]))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Let us take a look at the features available to engineer further "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>project_id</th>\n",
" <th>name</th>\n",
" <th>desc</th>\n",
" <th>goal</th>\n",
" <th>keywords</th>\n",
" <th>disable_communication</th>\n",
" <th>country</th>\n",
" <th>currency</th>\n",
" <th>deadline</th>\n",
" <th>state_changed_at</th>\n",
" <th>created_at</th>\n",
" <th>launched_at</th>\n",
" <th>backers_count</th>\n",
" <th>final_status</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>kkst1451568084</td>\n",
" <td>drawing for dollars</td>\n",
" <td>I like drawing pictures. and then i color them...</td>\n",
" <td>20.0</td>\n",
" <td>drawing-for-dollars</td>\n",
" <td>False</td>\n",
" <td>US</td>\n",
" <td>USD</td>\n",
" <td>1241333999</td>\n",
" <td>1241334017</td>\n",
" <td>1240600507</td>\n",
" <td>1240602723</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>kkst1474482071</td>\n",
" <td>Sponsor Dereck Blackburn (Lostwars) Artist in ...</td>\n",
" <td>I, Dereck Blackburn will be taking upon an inc...</td>\n",
" <td>300.0</td>\n",
" <td>sponsor-dereck-blackburn-lostwars-artist-in-re...</td>\n",
" <td>False</td>\n",
" <td>US</td>\n",
" <td>USD</td>\n",
" <td>1242429000</td>\n",
" <td>1242432018</td>\n",
" <td>1240960224</td>\n",
" <td>1240975592</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>kkst183622197</td>\n",
" <td>Mr. Squiggles</td>\n",
" <td>So I saw darkpony's successfully funded drawin...</td>\n",
" <td>30.0</td>\n",
" <td>mr-squiggles</td>\n",
" <td>False</td>\n",
" <td>US</td>\n",
" <td>USD</td>\n",
" <td>1243027560</td>\n",
" <td>1243027818</td>\n",
" <td>1242163613</td>\n",
" <td>1242164398</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>kkst597742710</td>\n",
" <td>Help me write my second novel.</td>\n",
" <td>Do your part to help out starving artists and ...</td>\n",
" <td>500.0</td>\n",
" <td>help-me-write-my-second-novel</td>\n",
" <td>False</td>\n",
" <td>US</td>\n",
" <td>USD</td>\n",
" <td>1243555740</td>\n",
" <td>1243556121</td>\n",
" <td>1240963795</td>\n",
" <td>1240966730</td>\n",
" <td>18</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>kkst1913131122</td>\n",
" <td>Support casting my sculpture in bronze</td>\n",
" <td>I'm nearing completion on a sculpture, current...</td>\n",
" <td>2000.0</td>\n",
" <td>support-casting-my-sculpture-in-bronze</td>\n",
" <td>False</td>\n",
" <td>US</td>\n",
" <td>USD</td>\n",
" <td>1243769880</td>\n",
" <td>1243770317</td>\n",
" <td>1241177914</td>\n",
" <td>1241180541</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" project_id name \\\n",
"0 kkst1451568084 drawing for dollars \n",
"1 kkst1474482071 Sponsor Dereck Blackburn (Lostwars) Artist in ... \n",
"2 kkst183622197 Mr. Squiggles \n",
"3 kkst597742710 Help me write my second novel. \n",
"4 kkst1913131122 Support casting my sculpture in bronze \n",
"\n",
" desc goal \\\n",
"0 I like drawing pictures. and then i color them... 20.0 \n",
"1 I, Dereck Blackburn will be taking upon an inc... 300.0 \n",
"2 So I saw darkpony's successfully funded drawin... 30.0 \n",
"3 Do your part to help out starving artists and ... 500.0 \n",
"4 I'm nearing completion on a sculpture, current... 2000.0 \n",
"\n",
" keywords disable_communication \\\n",
"0 drawing-for-dollars False \n",
"1 sponsor-dereck-blackburn-lostwars-artist-in-re... False \n",
"2 mr-squiggles False \n",
"3 help-me-write-my-second-novel False \n",
"4 support-casting-my-sculpture-in-bronze False \n",
"\n",
" country currency deadline state_changed_at created_at launched_at \\\n",
"0 US USD 1241333999 1241334017 1240600507 1240602723 \n",
"1 US USD 1242429000 1242432018 1240960224 1240975592 \n",
"2 US USD 1243027560 1243027818 1242163613 1242164398 \n",
"3 US USD 1243555740 1243556121 1240963795 1240966730 \n",
"4 US USD 1243769880 1243770317 1241177914 1241180541 \n",
"\n",
" backers_count final_status \n",
"0 3 1 \n",
"1 2 0 \n",
"2 0 0 \n",
"3 18 1 \n",
"4 1 0 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## The dataset contains fields like - Name of the project, Description, The goal amount etc as seen above. We will extract text features as well as normalized time features from the given variables. First we extract time features like duration of project, time project changed after launched."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train['time_project']=train['deadline']-train['launched_at']\n",
"train['time_deadline_change']=train['state_changed_at']-train['deadline']\n",
"train['time_launched_change']=train['state_changed_at']-train['launched_at']\n",
"train['time_launched_change']=train['time_launched_change']/1e4\n",
"train['time_created_change']=train['state_changed_at']-train['created_at']\n",
"train['time_created_change']=train['time_created_change']/1e4\n",
"train['time_project']=train['time_project']/1e4\n",
"train['cldiff']=train['created_at']-train['launched_at']\n",
"train['cldiff']=train['cldiff']/1e3\n",
"train['time_deadline_change_log']=train['time_deadline_change'].apply(lambda c: np.log(c))\n",
"train['time_launched_change_log']=train['time_launched_change'].apply(lambda c: np.log(c))\n",
"train['time_created_change_log']=train['time_created_change'].apply(lambda c: np.log(c))\n",
"train['time_project_log']=train['time_project'].apply(lambda c: np.log(c))\n",
"train['year'] = train['deadline'].apply(lambda k: int(time.ctime(k)[-4:])-2009)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"test['year'] = test['deadline'].apply(lambda k: int(time.ctime(k)[-4:])-2009)\n",
"\n",
"test['time_project']=test['deadline']-test['launched_at']\n",
"test['time_deadline_change']=test['state_changed_at']-test['deadline']\n",
"test['time_launched_change']=test['state_changed_at']-test['launched_at']\n",
"test['time_launched_change']=test['time_launched_change']/1e4\n",
"test['time_project']=test['time_project']/1e4\n",
"test['time_created_change']=test['state_changed_at']-test['created_at']\n",
"test['time_created_change']=test['time_created_change']/1e4\n",
"test['cldiff']=test['created_at']-test['launched_at']\n",
"test['time_deadline_change_log']=test['time_deadline_change'].apply(lambda c: np.log(c))\n",
"test['time_launched_change_log']=test['time_launched_change'].apply(lambda c: np.log(c))\n",
"test['time_created_change_log']=test['time_created_change'].apply(lambda c: np.log(c))\n",
"test['time_project_log']=test['time_project'].apply(lambda c: np.log(c))\n",
"test['cldiff']=test['cldiff']/1e3"
]
},
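{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Since the train and test cells above are identical, the same logic could be wrapped in a helper applied to both frames. The cell below is an equivalent refactoring sketch, not a new computation."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# equivalent refactor of the two cells above; the columns match those already\n",
"# created, so running it again is redundant but harmless\n",
"def add_time_features(df):\n",
" df['time_project'] = (df['deadline'] - df['launched_at']) / 1e4\n",
" df['time_deadline_change'] = df['state_changed_at'] - df['deadline']\n",
" df['time_launched_change'] = (df['state_changed_at'] - df['launched_at']) / 1e4\n",
" df['time_created_change'] = (df['state_changed_at'] - df['created_at']) / 1e4\n",
" df['cldiff'] = (df['created_at'] - df['launched_at']) / 1e3\n",
" for col in ['time_deadline_change', 'time_launched_change',\n",
" 'time_created_change', 'time_project']:\n",
" df[col + '_log'] = df[col].apply(np.log)\n",
" df['year'] = df['deadline'].apply(lambda k: int(time.ctime(k)[-4:]) - 2009)\n",
"\n",
"for df in (train, test):\n",
" add_time_features(df)"
]
},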
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train['month'] = train['deadline'].apply(lambda k: time.ctime(k).split()[1])\n",
"test['month'] = test['deadline'].apply(lambda k: time.ctime(k).split()[1])\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## We now compute features like goal per unit description length, normalized project goal amount etc. "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"cols_to_use = ['name','desc']\n",
"len_feats = ['name_len','desc_len']\n",
"count_feats = ['name_count','desc_count']\n",
"\n",
"for i in np.arange(2):\n",
" train[len_feats[i]] = train[cols_to_use[i]].apply(str).apply(len)\n",
" train[count_feats[i]] = train[cols_to_use[i]].apply(str).apply(lambda x: len(x.split(' ')))\n",
"\n",
"train['keywords_len'] = train['keywords'].apply(str).apply(len)\n",
"train['keywords_count'] = train['keywords'].apply(str).apply(lambda x: len(x.split('-')))\n",
"\n",
"for i in np.arange(2):\n",
" test[len_feats[i]] = test[cols_to_use[i]].apply(str).apply(len)\n",
" test[count_feats[i]] = test[cols_to_use[i]].apply(str).apply(lambda x: len(x.split(' ')))\n",
" \n",
"test['keywords_len'] = test['keywords'].apply(str).apply(len)\n",
"test['keywords_count'] = test['keywords'].apply(str).apply(lambda x: len(x.split('-')))\n",
"\n",
"\n",
"# In[921]:\n",
"\n",
"train['keywords_count_norm']=train['keywords_count']-train['keywords_count'].median()\n",
"\n",
"train['name_count_norm']=train['name_count']-train['name_count'].median()\n",
"\n",
"train['desc_count_norm']=train['desc_count']-train['desc_count'].median()\n",
"\n",
"test['keywords_count_norm']=test['keywords_count']-test['keywords_count'].median()\n",
"\n",
"test['name_count_norm']=test['name_count']-test['name_count'].median()\n",
"\n",
"test['desc_count_norm']=test['desc_count']-test['desc_count'].median()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# In[922]:\n",
"\n",
"train['title_word_len']=train[['name_len','name_count']].apply(lambda k: (k[0]/(k[1]+1)),axis=1)\n",
"\n",
"test['title_word_len']=test[['name_len','name_count']].apply(lambda k: (k[0]/(k[1]+1)),axis=1)\n",
"\n",
"\n",
"# In[923]:\n",
"\n",
"train['keyword_word_len']=train[['keywords_len','keywords_count']].apply(lambda k: (k[0]/(k[1]+1)),axis=1)\n",
"\n",
"test['keyword_word_len']=test[['keywords_len','keywords_count']].apply(lambda k: (k[0]/(k[1]+1)),axis=1)\n",
"\n",
"\n",
"# In[924]:\n",
"\n",
"train['desc_word_len']=train[['desc_len','desc_count']].apply(lambda k: (k[0]/(k[1]+1)),axis=1)\n",
"\n",
"test['desc_word_len']=test[['desc_len','desc_count']].apply(lambda k: (k[0]/(k[1]+1)),axis=1)\n",
"\n",
"\n",
"# In[925]:\n",
"\n",
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"feat = ['disable_communication','country','currency']\n",
"for x in feat:\n",
" \n",
" le = LabelEncoder()\n",
" le.fit(list(train[x].values) + list(test[x].values))\n",
" train[x] = le.transform(list(train[x]))\n",
" test[x] = le.transform(list(test[x].values))\n",
" \n",
"\n",
"\n",
"# In[926]:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train['goal_unnormalized'] = train[['goal','currency']].apply(lambda x: c.convert(x[0],le.inverse_transform(int(x[1])),'USD'),axis=1)\n",
"test['goal_unnormalized'] = test[['goal','currency']].apply(lambda x: c.convert(x[0],le.inverse_transform(int(x[1])),'USD'),axis=1)\n",
"\n",
"train['goal_norm']=train['goal_unnormalized'] -train['goal_unnormalized'].median()\n",
"test['goal_norm']=test['goal_unnormalized'] -test['goal_unnormalized'].median()\n",
"\n",
"train['goal_per_day']=train[['goal','time_project']].apply(lambda x: (x[0]/(x[1]+1))*1e1,axis=1)\n",
"\n",
"test['goal_per_day']=test[['goal','time_project']].apply(lambda x: (x[0]/(x[1]+1))*1e1,axis=1)\n",
"\n",
"\n",
"# In[959]:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train['goalper_descriptionlength']=train[['goal','desc_count']].apply(lambda x: (x[0]/(x[1]+2)),axis=1)\n",
"\n",
"test['goalper_descriptionlength']=test[['goal','desc_count']].apply(lambda x: (x[0]/(x[1]+2)),axis=1)\n",
"\n",
"\n",
"# In[928]:"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train['state_change_norm']=train[['time_deadline_change','time_project']].apply(lambda x: (x[0]/(x[1]+1))*1e4,axis=1)\n",
"test['state_change_norm']=test[['time_deadline_change','time_project']].apply(lambda x: (x[0]/(x[1]+1))*1e4,axis=1)\n",
"\n",
"\n",
"train['is_12_currency']=train['currency'].apply(lambda x: int(x==12))\n",
"test['is_12_currency']=test['currency'].apply(lambda x: int(x==12))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## We extract text features after cleaning out the stopwords from the data. We use normalized count features of the description length. "
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import re\n",
"\n",
"# creating a full list of descriptions from train and etst\n",
"kickdesc = pd.Series(train['desc'].tolist() + test['desc'].tolist()).astype(str)\n",
"\n",
"# this function cleans punctuations, digits and irregular tabs. Then converts the sentences to lower\n",
"def desc_clean(word):\n",
" p1 = re.sub(pattern='(\\W+)|(\\d+)|(\\s+)',repl=' ',string=word)\n",
" p1 = p1.lower()\n",
" return p1\n",
"\n",
"kickdesc = kickdesc.map(desc_clean)\n",
"\n",
"stop = set(stopwords.words('english'))\n",
"kickdesc = [[x for x in x.split() if x not in stop] for x in kickdesc]\n",
"\n",
"stemmer = SnowballStemmer(language='english')\n",
"kickdesc = [[stemmer.stem(x) for x in x] for x in kickdesc]\n",
"\n",
"kickdesc = [[x for x in x if len(x) > 2] for x in kickdesc]\n",
"\n",
"kickdesc = [' '.join(x) for x in kickdesc]"
]
},
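{
"cell_type": "markdown",
"metadata": {},
"source": [
"## As a quick sanity check (not part of the original pipeline), here is what the cleaning steps do to one made-up description."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# illustrative only: push a made-up description through the same steps\n",
"sample = \"I'm making 100 hand-drawn postcards for my art project!\"\n",
"cleaned = desc_clean(sample)\n",
"tokens = [w for w in cleaned.split() if w not in stop]\n",
"tokens = [stemmer.stem(w) for w in tokens]\n",
"tokens = [w for w in tokens if len(w) > 2]\n",
"print(' '.join(tokens)) # roughly: 'make hand drawn postcard art project'"
]
},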
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"cv = TfidfVectorizer(max_features=375)\n",
"\n",
"alldesc = cv.fit_transform(kickdesc).todense()\n",
"\n",
"#create a data frame\n",
"combine = pd.DataFrame(alldesc)\n",
"combine.rename(columns= lambda x: 'descvariable_'+ str(x), inplace=True)\n",
"\n",
"#split the text features\n",
"\n",
"train_text = combine[:train.shape[0]]\n",
"test_text = combine[train.shape[0]:]\n",
"\n",
"test_text.reset_index(drop=True,inplace=True)\n",
"\n",
"\n",
"#test_text.reset_index(drop=True,inplace=True)"
]
},
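{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Optionally, we can peek at a few of the 375 terms the vectorizer kept (a sanity check, not part of the original run)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# vocabulary_ maps each kept term to its column index; print the first 20 by index\n",
"print(sorted(cv.vocabulary_, key=cv.vocabulary_.get)[:20])"
]
},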
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## We select the subset of features needed for training data "
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"cols_to_use = ['name_count_norm'\n",
" ,'desc_count_norm'\n",
" ,'time_project'\n",
" ,'time_deadline_change'\n",
" ,'time_launched_change'\n",
" ,'disable_communication'\n",
" ,'goal_per_day'\n",
" ,'is_12_currency'\n",
" ,'cldiff'\n",
" ,'title_word_len'\n",
" ,'keyword_word_len'\n",
" ,'desc_word_len',\n",
" 'goalper_descriptionlength'\n",
" ]\n",
"\n",
"target = train['final_status']\n",
"\n",
"# data for modeling\n",
"k_train = train[cols_to_use]\n",
"k_test=test[cols_to_use]\n",
"\n",
"\n",
"# In[943]:"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(63465, 388)\n"
]
}
],
"source": [
"X_train = pd.concat([k_train,train_text],axis=1)\n",
"X_test=pd.concat([k_test,test_text],axis=1)\n",
"print(X_test.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training Data and Validation Data split "
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"X_tr,X_val,y_tr,y_val=train_test_split(X_train,target,test_size=0.2,random_state=1)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"\n",
"\n",
"dTrain = xgb.DMatrix(X_train,target)\n",
"#dVal= xgb.DMatrix(X_val,y_val)\n",
"dTest = xgb.DMatrix(X_test)\n",
"\n",
"xgb_params = {\n",
" 'objective': 'binary:logistic',\n",
" 'booster': 'gbtree',\n",
" 'eval_metric': 'logloss',\n",
" 'eta': 0.025, \n",
" 'max_depth': 7,\n",
" 'lambda': 2,\n",
" 'alpha': 0.02,\n",
" 'subsample': 0.8,\n",
" 'colsample_bytree': 0.8,\n",
" 'min_child_weight': 1,\n",
" 'silent': 1}\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Let us train a XGBoost Model using our features "
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0]\ttrain-logloss:0.68771\n",
"Will train until train-logloss hasn't improved in 40 rounds.\n",
"[50]\ttrain-logloss:0.574593\n",
"[100]\ttrain-logloss:0.548285\n",
"[150]\ttrain-logloss:0.53696\n",
"[200]\ttrain-logloss:0.52961\n",
"[250]\ttrain-logloss:0.523443\n",
"[300]\ttrain-logloss:0.518137\n",
"[350]\ttrain-logloss:0.513683\n",
"[400]\ttrain-logloss:0.509385\n",
"[450]\ttrain-logloss:0.505515\n",
"[500]\ttrain-logloss:0.502177\n",
"[550]\ttrain-logloss:0.498884\n",
"[600]\ttrain-logloss:0.495757\n",
"[650]\ttrain-logloss:0.492827\n",
"[700]\ttrain-logloss:0.489998\n",
"[750]\ttrain-logloss:0.487428\n",
"[800]\ttrain-logloss:0.484896\n",
"[850]\ttrain-logloss:0.482488\n",
"[900]\ttrain-logloss:0.480101\n",
"[950]\ttrain-logloss:0.477878\n",
"[1000]\ttrain-logloss:0.475649\n",
"[1050]\ttrain-logloss:0.47348\n",
"[1100]\ttrain-logloss:0.471171\n",
"[1150]\ttrain-logloss:0.469225\n",
"[1200]\ttrain-logloss:0.46729\n",
"[1250]\ttrain-logloss:0.465523\n"
]
}
],
"source": [
"bst_train=xgb.train(xgb_params, dTrain, 1300, [(dTrain,'train')],\n",
" verbose_eval=50, early_stopping_rounds=40)"
]
},
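{
"cell_type": "markdown",
"metadata": {},
"source": [
"## A variant worth noting (a sketch, not the run that produced the submission): early-stop against the held-out validation split, so the number of rounds is chosen on unseen data instead of being fixed at 1300."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# sketch: train on the 80% split and let the 20% validation split drive early stopping\n",
"dTr = xgb.DMatrix(X_tr, y_tr)\n",
"dVal = xgb.DMatrix(X_val, y_val)\n",
"bst_val = xgb.train(xgb_params, dTr, 3000, [(dTr, 'train'), (dVal, 'valid')],\n",
" verbose_eval=100, early_stopping_rounds=40)\n",
"print('best iteration:', bst_val.best_iteration)"
]
},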
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"pred = bst_train.predict(dTest, ntree_limit=bst_train.best_ntree_limit)"
]
},
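{
"cell_type": "markdown",
"metadata": {},
"source": [
"## The 0.4 cutoff used below was hand-tuned. One way to justify it (a sketch using the validation-trained model from the aside above, not part of the original submission) is to sweep thresholds on the validation split and keep the most accurate one."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# hypothetical threshold sweep on the held-out validation split\n",
"val_pred = bst_val.predict(xgb.DMatrix(X_val))\n",
"best_t, best_acc = 0.5, 0.0\n",
"for t in np.arange(0.30, 0.61, 0.01):\n",
" acc = metrics.accuracy_score(y_val, (val_pred > t).astype(int))\n",
" if acc > best_acc:\n",
" best_t, best_acc = t, acc\n",
"print('best threshold: {:.2f}, validation accuracy: {:.4f}'.format(best_t, best_acc))"
]
},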
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## We now write our predictions to a file and submit "
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# In[970]:\n",
"\n",
"### write the file\n",
"result=[]\n",
"for idx,k in enumerate(pred.tolist()):\n",
" if k>0.4:\n",
" result.append(1)\n",
" else:\n",
" result.append(0)\n",
"nBsub = pd.DataFrame({'project_id':test['project_id'],'final_status':result})\n",
"nBsub = nBsub[['project_id','final_status']]\n",
"nBsub.to_csv(\"nBstarter.csv\",index = False) #0.6526"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}