| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "#import xgboost\n", | |
| "import pandas\n", | |
| "from sklearn import preprocessing, cross_validation, grid_search, ensemble, linear_model, calibration\n", | |
| "import numpy\n", | |
| "import math\n", | |
| "#import graphviz" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "with open(\"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/train.csv/train.csv\") as f:\n", | |
| " df_train = pandas.read_csv(f)\n", | |
| " df_train_processed = False" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "with open(\"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/test.csv/test.csv\") as f:\n", | |
| " df_test = pandas.read_csv(f)\n", | |
| " df_test_processed = False" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def feature_engineering(dataframe):\n", | |
| " #if 'non-na_count' not in dataframe:\n", | |
| " new_df = pandas.DataFrame(dataframe.count(axis = 1), columns = ['non-na_count'])\n", | |
| " df_train = pandas.concat([dataframe, new_df], axis=1)\n", | |
| " #count NAs in row" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def feature_selection(dataframe, threshold):\n", | |
| " list_98 = ['v17', 'v46', 'v26', 'v63', 'v71']\n", | |
| " list_95 = ['v11', 'v25', 'v29', 'v83', 'v41', 'v43', 'v89', 'v64', 'v92', 'v97', 'v108']\n", | |
| " list_90 = ['v8', 'v10', 'v13', 'v15', 'v20', 'v32', 'v33', 'v34', 'v54', 'v67', 'v109']\n", | |
| " list_85 = ['v1', 'v44', 'v55', 'v60', 'v61', 'v76', 'v77', 'v94', 'v105', 'v111', 'v119']\n", | |
| " list_80 = ['v4', 'v9', 'v14', 'v35', 'v51', 'v80', 'v87', 'v101', 'v121']\n", | |
| " list_75 = ['v23', 'v49', 'v65', 'v85', 'v93']\n", | |
| " list_70 = ['v2', 'v7', 'v18', 'v27', 'v48', 'v59', 'v73', 'v84', 'v123']\n", | |
| " list_65 = ['v36', 'v47', 'v117']\n", | |
| " list_60 = ['v45', 'v70', 'v86', 'v98']\n", | |
| " list_55 = ['v5', 'v31', 'v42', 'v58', 'v102']\n", | |
| " list_50 = ['v19', 'v37', 'v38', 'v57', 'v82', 'v95', 'v96', 'v99', 'v103', 'v104', 'v130', 'v106']\n", | |
| " corr_list = [list_98, list_95, list_90, list_85, list_80, list_75, list_70, list_65, list_60, list_55, list_50]\n", | |
| " for elem in corr_list[:threshold]:\n", | |
| " dataframe.drop(elem, axis = 1, inplace = True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def preprocess(threshold):\n", | |
| " #consider dropping: ['v112', 'v125', 'v74', 'v1', 'v110', 'v47']\n", | |
| " global df_train_processed\n", | |
| " global df_test_processed\n", | |
| " if not df_train_processed:\n", | |
| " df_train_processed = True\n", | |
| " #feature_engineering(df_train)\n", | |
| " #feature_selection(df_train, threshold)\n", | |
| " label = df_train['target'].values\n", | |
| " df_train.drop(['target', 'ID'], axis = 1, inplace = True)\n", | |
| " df_train.drop('v22', axis = 1, inplace = True) #'v22' has over 16k values\n", | |
| " \n", | |
| " numeric_columns = df_train.describe().columns.tolist() \n", | |
| "\n", | |
| " dtypes = df_train.dtypes\n", | |
| " non_numeric = list()\n", | |
| " for index, elem in enumerate(dtypes):\n", | |
| " if elem != 'float64':\n", | |
| " non_numeric.append(dtypes.index[index])\n", | |
| " \n", | |
| " if not df_test_processed:\n", | |
| " df_test_processed = True\n", | |
| " #feature_engineering(df_test)\n", | |
| " #feature_selection(df_test, threshold)\n", | |
| " ids = df_test['ID'].values\n", | |
| " df_test.drop('ID', axis = 1, inplace = True)\n", | |
| " df_test.drop('v22', axis = 1, inplace = True)\n", | |
| " \n", | |
| " #remove_correlations()\n", | |
| " return non_numeric, label, ids\n", | |
| "\n", | |
| "non_numeric, label, ids = preprocess(15)" | |
| ] | |
| }, | |
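| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "#A minimal alternative sketch (assumption, not the original approach):\n", | |
| "#pandas' select_dtypes can replace the manual dtype loop in preprocess()\n", | |
| "#with one call; this recomputes the same list and changes no state.\n", | |
| "non_numeric_alt = df_train.select_dtypes(exclude=['float64']).columns.tolist()\n", | |
| "print non_numeric_alt == non_numeric" | |
| ] | |
| }, | |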
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def label_encode(dataframe):\n", | |
| " le = preprocessing.LabelEncoder()\n", | |
| " for elem in non_numeric:\n", | |
| " if elem in dataframe.columns:\n", | |
| " dataframe[elem] = le.fit_transform(dataframe[elem])\n", | |
| " #print len(le.classes_)\n", | |
| "\n", | |
| "label_encode(df_train)\n", | |
| "label_encode(df_test)" | |
| ] | |
| }, | |
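| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "#Caveat sketch (assumed intent, not part of the original run): label_encode\n", | |
| "#fits a fresh encoder per dataframe, so the same category can map to\n", | |
| "#different integers in train and test; fitting once on the union keeps the\n", | |
| "#codes aligned. Same NaN caveat as label_encode above.\n", | |
| "def label_encode_consistent(train_df, test_df):\n", | |
| "    le = preprocessing.LabelEncoder()\n", | |
| "    for col in non_numeric:\n", | |
| "        if col in train_df.columns and col in test_df.columns:\n", | |
| "            le.fit(pandas.concat([train_df[col], test_df[col]]))\n", | |
| "            train_df[col] = le.transform(train_df[col])\n", | |
| "            test_df[col] = le.transform(test_df[col])\n", | |
| "\n", | |
| "#label_encode_consistent(df_train, df_test)" | |
| ] | |
| }, | |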
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def remove_correlations(lower):\n", | |
| " corr = df_train.corr()\n", | |
| " drop = list()\n", | |
| " for row in xrange(corr.shape[0]):\n", | |
| " for column in xrange(corr.shape[1]):\n", | |
| " if lower < corr.ix[row, column] and row < column:\n", | |
| " drop.append((row, column))\n", | |
| " return drop\n", | |
| " \n", | |
| "drop_tuples = remove_correlations(0.98)" | |
| ] | |
| }, | |
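| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "#Usage sketch (hypothetical helper, not in the original): map the positional\n", | |
| "#(row, column) pairs in drop_tuples back to column names and drop one column\n", | |
| "#from each highly correlated pair.\n", | |
| "def drop_correlated(dataframe, tuples):\n", | |
| "    #the pairs were computed against df_train's correlation matrix\n", | |
| "    corr_columns = df_train.corr().columns\n", | |
| "    to_drop = set(corr_columns[col] for row, col in tuples)\n", | |
| "    dataframe.drop(list(to_drop), axis = 1, inplace = True)\n", | |
| "\n", | |
| "#drop_correlated(df_train, drop_tuples)\n", | |
| "#drop_correlated(df_test, drop_tuples)" | |
| ] | |
| }, | |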
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def ids_label_save():\n", | |
| " numpy.savetxt(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/label\", label)\n", | |
| " numpy.savetxt(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/ids\", ids)\n", | |
| "\n", | |
| "ids_label_save()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def ids_label_load():\n", | |
| " label = numpy.loadtxt(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/label\")\n", | |
| " ids = numpy.loadtxt(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/ids\")\n", | |
| "\n", | |
| "ids_label_load()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def onehot(dataframe):\n", | |
| " enc = preprocessing.OneHotEncoder()\n", | |
| " onehot_df_start = pandas.DataFrame(enc.fit_transform(dataframe[[non_numeric[0]]]).todense())\n", | |
| " onehot_df_start.columns = map(lambda x: non_numeric[0] + '_' + str(x), onehot_df_start.columns.tolist())\n", | |
| " for elem in non_numeric[1:]:\n", | |
| " if elem in dataframe.columns:\n", | |
| " onehot_df = pandas.DataFrame(enc.fit_transform(dataframe[[elem]]).todense())\n", | |
| " onehot_df.columns = map(lambda x: elem + '_' + str(x), onehot_df.columns.tolist())\n", | |
| " dataframe.drop(elem, axis = 1, inplace = True)\n", | |
| " #print len(onehot_df_start.columns)\n", | |
| " onehot_df_start = onehot_df_start.join(onehot_df)\n", | |
| " #print len(onehot_df_start.columns)\n", | |
| " dataframe = dataframe.join(onehot_df_start)\n", | |
| " return dataframe\n", | |
| "\n", | |
| "#I got worse results with one-hot encoding v. label encoding\n", | |
| "#df_train = onehot(df_train)\n", | |
| "#df_test = onehot(df_test)" | |
| ] | |
| }, | |
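| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "#Alternative sketch (not used in the original run, which kept label\n", | |
| "#encoding): pandas.get_dummies performs the same expansion as onehot()\n", | |
| "#without sklearn and keeps readable column names.\n", | |
| "def onehot_pandas(dataframe):\n", | |
| "    cols = [c for c in non_numeric if c in dataframe.columns]\n", | |
| "    return pandas.get_dummies(dataframe, columns = cols)\n", | |
| "\n", | |
| "#df_train = onehot_pandas(df_train)\n", | |
| "#df_test = onehot_pandas(df_test)" | |
| ] | |
| }, | |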
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def save_dataframe(dataframe, path):\n", | |
| " dataframe.to_csv(path)\n", | |
| " \n", | |
| "def load_dataframe(path)\n", | |
| " return pandas.read_csv(path)" | |
| ] | |
| }, | |
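| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "#Round-trip caveat (sketch, not the original helper): to_csv writes the\n", | |
| "#index by default, so a later read_csv picks up an extra 'Unnamed: 0'\n", | |
| "#column; writing with index=False (or reading with index_col=0) keeps the\n", | |
| "#round trip clean.\n", | |
| "def save_dataframe_clean(dataframe, path):\n", | |
| "    dataframe.to_csv(path, index = False)" | |
| ] | |
| }, | |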
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "save_dataframe(df_train, \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/df_train_label_encoded\")\n", | |
| "save_dataframe(df_test, \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/df_test_label_encoded\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_train = load_dataframe(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/df_train_onehot.csv\")\n", | |
| "df_test = load_dataframe(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/df_test_onehot.csv\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def train_test_columns():\n", | |
| " train_columns = set(df_train.columns.tolist())\n", | |
| " test_columns = set(df_test.columns.tolist())\n", | |
| " #train_columns - test_columns\n", | |
| " df_train.drop(train_columns - test_columns, axis = 1, inplace = True)\n", | |
| " \n", | |
| "train_test_columns()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "#this may not current work\n", | |
| "def boost_load():\n", | |
| " bst.load_model(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/xgboost_model_1000rounds\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def store_logit(output, path, column):\n", | |
| " def logit(x):\n", | |
| " return (math.log(x) - math.log(1 - x))\n", | |
| "\n", | |
| " #create new column for df_train that are the logit(prediction) from earlier model\n", | |
| " logit_list = list()\n", | |
| " for elem in output.tolist():\n", | |
| " try:\n", | |
| " logit_list.append(logit(elem[1]))\n", | |
| " except ValueError:\n", | |
| " logit_list.append(10)\n", | |
| " pandas.DataFrame(logit_list, columns=[column]).to_csv(path)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def attach_logit(dataframe, paths):\n", | |
| " values = pandas.read_csv(paths[0])[[1]]\n", | |
| " for path in paths[1:]:\n", | |
| " new_df = pandas.read_csv(path)[[1]]\n", | |
| " values = pandas.concat([values, new_df], axis=1)\n", | |
| " return pandas.concat([dataframe, values], axis=1)\n", | |
| " \n", | |
| "train_logit_list = ['logit_column_extra_trees_train', 'logit_column_logistic_train']\n", | |
| "df_train = attach_logit(df_train, train_logit_list)\n", | |
| "test_logit_list = ['logit_column_extra_trees_test', 'logit_column_logistic_test']\n", | |
| "df_test = attach_logit(df_test, test_logit_list)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def xgboost(na, path):\n", | |
| " dtrain = xgboost.DMatrix(df_train.fillna(na), label = label, missing = na)\n", | |
| " dtest = xgboost.DMatrix(df_test.fillna(na), label = label, missing = na)\n", | |
| " \n", | |
| " param = {'max_depth':10,\n", | |
| " 'eta':0.1,\n", | |
| " 'objective':'binary:logistic',\n", | |
| " #'base_score': 0.76,\n", | |
| " 'eval_metric': 'logloss',\n", | |
| " 'subsample': 0.75, \n", | |
| " 'colsample_bytree': 0.7}\n", | |
| " \n", | |
| " num_round = 20 #45-60 min for 1000\n", | |
| " #watchlist = [(dtest,'eval'), (dtrain,'train')]\n", | |
| " \n", | |
| " bst = xgboost.train(param, dtrain, num_round)#, watchlist)\n", | |
| " bst.save_model(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/\" + path)\n", | |
| " output = bst.predict(dtest)\n", | |
| " return output\n", | |
| " \n", | |
| "xgboost(-999, \"xgboost_model_1000rounds\")" | |
| ] | |
| }, | |
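| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "#Sketch (assumes the 'import xgboost' at the top is uncommented): holding\n", | |
| "#out part of the training set as a watchlist lets xgboost.train stop early\n", | |
| "#once validation logloss stalls, instead of fixing num_round up front.\n", | |
| "def xgboost_train_early_stop(na, param, num_round = 1000):\n", | |
| "    xtr, xval, ytr, yval = cross_validation.train_test_split(\n", | |
| "        df_train.fillna(na), label, test_size = 0.2, random_state = 0)\n", | |
| "    dtr = xgboost.DMatrix(xtr, label = ytr, missing = na)\n", | |
| "    dval = xgboost.DMatrix(xval, label = yval, missing = na)\n", | |
| "    watchlist = [(dval, 'eval'), (dtr, 'train')]\n", | |
| "    return xgboost.train(param, dtr, num_round, watchlist,\n", | |
| "                         early_stopping_rounds = 25)" | |
| ] | |
| }, | |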
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def xgboost_plot():\n", | |
| " #xgboost.plot_importance(bst)\n", | |
| " #xgboost.plot_tree(bst, num_trees=2)\n", | |
| " #xgb.to_graphviz(bst, num_trees=2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def xgboost_cv():\n", | |
| " bst_cv = xgboost.cv(param, dtrain, num_boost_round = 10, nfold = 5)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def sklearn_ml_model(algorithm, criterion = 'entropy'):\n", | |
| " params = {}\n", | |
| " if algorithm == 'logistic':\n", | |
| " model = linear_model.LogisticRegression()\n", | |
| " elif algorithm == 'random_forest':\n", | |
| " params = {'n_estimators':25,\n", | |
| " 'n_jobs':3,\n", | |
| " 'max_features':50,\n", | |
| " 'criterion':criterion,\n", | |
| " 'min_samples_split':4,\n", | |
| " 'max_depth':50,\n", | |
| " 'min_samples_leaf':4}\n", | |
| " model = ensemble.RandomForestClassifier(**params)\n", | |
| " elif algorithm == 'extra_trees':\n", | |
| " #n_estimators=850,max_features= 60,criterion= 'entropy',min_samples_split= 4,\n", | |
| " # max_depth= 40, min_samples_leaf= 2, n_jobs = -1) \n", | |
| " params = {'n_estimators':25,\n", | |
| " 'n_jobs':3,\n", | |
| " 'max_features':45,\n", | |
| " 'criterion':criterion,\n", | |
| " 'min_samples_split':4,\n", | |
| " 'max_depth':50,\n", | |
| " 'min_samples_leaf':4}\n", | |
| " model = ensemble.ExtraTreesClassifier(**params)\n", | |
| " elif algorithm == 'gradient_boosting':\n", | |
| " model = ensemble.GradientBoostingClassifier()\n", | |
| " elif algorithm == 'naive_bayes':\n", | |
| " from sklearn.naive_bayes import BernoulliNB\n", | |
| " model = BernoulliNB()\n", | |
| " elif algorithm == 'svm':\n", | |
| " from sklearn import svm\n", | |
| " model = svm.NuSVC(nu = 0.1)\n", | |
| " return model, params" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def na_find(train_dataframe, test_dataframe, na):\n", | |
| " if na == 'median':\n", | |
| " train_na = train_dataframe.median()\n", | |
| " test_na = test_dataframe.median()\n", | |
| " else:\n", | |
| " train_na = test_na = na\n", | |
| " return train_na, test_na" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def sklearn_ml(algorithm, na, logit_path):\n", | |
| " train_na, test_na = na_find(df_train, df_test, na)\n", | |
| " \n", | |
| " model, params = sklearn_ml_model(algorithm)\n", | |
| " \n", | |
| " model.fit(df_train.fillna(train_na), label)\n", | |
| " output_train = model.predict_proba(df_train.fillna(train_na))\n", | |
| " store_logit(output_train, logit_path + '_train', algorithm)\n", | |
| " output_test = model.predict_proba(df_test.fillna(test_na))\n", | |
| " store_logit(output_test, logit_path + '_test', algorithm)\n", | |
| " results_save(output_test, 'predictions_' + algorithm + str(params.items()) + '.csv')\n", | |
| " return output_test" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "#2-3 minutes\n", | |
| "#sklearn_ml('logistic', 'median', \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/logit_column_logistic_11Mar\")\n", | |
| "\n", | |
| "#sklearn_ml('random_forest', -999, \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/logit_column_rf\")\n", | |
| "\n", | |
| "#~100 minutes\n", | |
| "sklearn_ml('extra_trees', -999, \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/logit_column_extra_trees_11Mar\")\n", | |
| "\n", | |
| "#\n", | |
| "#sklearn_ml('naive_bayes', 'median', \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/logit_column_nb\")\n", | |
| "#sklearn_ml('gradient_boosting', -999, )\n", | |
| "#sklearn_ml('svm', 'median')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def cross_validation_sklearn(algorithm, na):\n", | |
| " #manual\n", | |
| " #Xtrain, Xtest, ytrain, ytest = cross_validation.train_test_split(df_train, df_test, test_size=0.20, random_state=0)\n", | |
| " model, params = sklearn_ml_model(algorithm)\n", | |
| " print model\n", | |
| " scores = cross_validation.cross_val_score(model, df_train.fillna(na), label, cv = 5, scoring = 'log_loss')\n", | |
| " print \"Accuracy: %0.4f (+/- %0.4f)\" % (scores.mean(), scores.std() * 2)\n", | |
| " return scores\n", | |
| " \n", | |
| "#scores = cross_validation_sklearn('logistic', df_train.median())\n", | |
| "#scores = cross_validation_sklearn('naive_bayes', df_train.median())\n", | |
| "scores = cross_validation_sklearn('extra_trees', -999)\n", | |
| "#scores = cross_validation_sklearn('svm', df_train.median())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def results_save(output, path):\n", | |
| " #with open('/home/sunshine/xgboost_labelencode_100rounds_base_score0.76', 'w') as f:\n", | |
| " with open('C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/' + path, 'w') as f:\n", | |
| " f.write('ID,PredictedProb\\n')\n", | |
| " for index in xrange(len(output)):\n", | |
| " f.write('{ID},{value}\\n'.format(ID = int(ids[index]), value = output[index][1]))\n", | |
| " \n", | |
| "results_save(output_clf, 'predictions_extratrees_250features_calibrated_with_validation_set')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def report(grid_scores, n_top=3):\n", | |
| " #http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html#example-model-selection-randomized-search-py\n", | |
| " import operator\n", | |
| " top_scores = sorted(grid_scores, key=operator.itemgetter(1), reverse=True)[:n_top]\n", | |
| " for i, score in enumerate(top_scores):\n", | |
| " print \"Model with rank: {0}\".format(i + 1)\n", | |
| " print \"Mean validation score: {0:.3f} (std: {1:.3f})\".format(\n", | |
| " score.mean_validation_score,\n", | |
| " numpy.std(score.cv_validation_scores))\n", | |
| " print \"Parameters: {0}\".format(score.parameters)\n", | |
| " print \"\"" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "#random grid search\n", | |
| "def grid_search_sklearn(algorithm, random, na):\n", | |
| " from scipy.stats import randint as sp_randint\n", | |
| " if algorithm == 'logistic':\n", | |
| " model = linear_model.LogisticRegression()\n", | |
| " params = {\n", | |
| " 'penalty': ['l1', 'l2'], \n", | |
| " 'C': [1, 0.1, 0.01],\n", | |
| " 'fit_intercept': [True, False],\n", | |
| " 'n_jobs': [3]}\n", | |
| " if algorithm == 'extra_trees':\n", | |
| " model = ensemble.ExtraTreesClassifier()\n", | |
| " params = {'n_estimators': sp_randint(25, 200),\n", | |
| " \"max_depth\": sp_randint(10, 50),\n", | |
| " \"max_features\": sp_randint(10, 110),\n", | |
| " \"min_samples_split\": sp_randint(3, 6),\n", | |
| " \"min_samples_leaf\": sp_randint(3, 6),\n", | |
| " \"bootstrap\": [True, False],\n", | |
| " \"criterion\": [\"gini\", \"entropy\"],\n", | |
| " 'n_jobs': [3]}\n", | |
| " n_iter_search = 6\n", | |
| " \n", | |
| " if random == 'random':\n", | |
| " search = grid_search.RandomizedSearchCV(model, param_distributions=params, n_iter=n_iter_search)\n", | |
| " else:\n", | |
| " search = grid_search.GridSearchCV(model, param_distributions=params, n_iter=n_iter_search)\n", | |
| " search.fit(df_train.fillna(na), label)\n", | |
| " report(search.grid_scores_)\n", | |
| "\n", | |
| "#grid_search_sklearn('logistic', 'random', df_train.median()) #~10 minutes\n", | |
| "grid_search_sklearn('extra_trees', 'random', -999) #~10 minutes" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def calibrated_classifier(dataframe, index, algorithm, na, model = None):\n", | |
| " from sklearn.metrics import log_loss\n", | |
| "\n", | |
| " #https://github.com/christophebourguignat/notebooks/blob/master/Calibration.ipynb\n", | |
| " xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(dataframe, index, test_size=0.20, random_state=0)\n", | |
| " #xtrain_valid, xtest, ytrain_valid, ytest = cross_validation.train_test_split(dataframe, label, test_size=0.20, random_state=0)\n", | |
| " #xtrain, xvalid, ytrain, yvalid = cross_validation.train_test_split(xtrain_valid, ytrain_valid, test_size=0.25, random_state=0)\n", | |
| " if model == None:\n", | |
| " model, params = sklearn_ml_model(algorithm)\n", | |
| " train_na, test_na = na_find(xtrain, xtest, na)\n", | |
| " #model.fit(xtrain.fillna(na), ytrain)\n", | |
| " #ypreds = model.predict_proba(xtest.fillna(na))\n", | |
| " # in our case, 'isotonic' works better than default 'sigmoid'\n", | |
| " calibrated_clf = calibration.CalibratedClassifierCV(model, method='isotonic', cv=5) #cv = 5\n", | |
| " calibrated_clf.fit(xtrain.fillna(train_na), ytrain)\n", | |
| " #ypreds = calibrated_clf.predict_proba(xtest.fillna(test_na))\n", | |
| " #print \"%.4f\" % log_loss(ytest, ypreds, eps=1e-15, normalize=True)\n", | |
| " return calibrated_clf\n", | |
| "\n", | |
| "#~5 minutes for nfeatures == 25\n", | |
| "#calibrated_clf = calibrated_classifier(df_train, label, \"extra_trees\", -999)\n", | |
| "#calibrated_clf = calibrated_classifier(df_train, label, \"logistic\", 'median')" | |
| ] | |
| }, | |
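| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "#Evaluation sketch (not in the original run): compare held-out log loss of\n", | |
| "#the raw model against the calibrated one, reusing the same random_state=0\n", | |
| "#split that calibrated_classifier makes internally.\n", | |
| "def calibration_gain(algorithm, na):\n", | |
| "    from sklearn.metrics import log_loss\n", | |
| "    xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(df_train, label, test_size=0.20, random_state=0)\n", | |
| "    train_na, test_na = na_find(xtrain, xtest, na)\n", | |
| "    model, params = sklearn_ml_model(algorithm)\n", | |
| "    model.fit(xtrain.fillna(train_na), ytrain)\n", | |
| "    raw = log_loss(ytest, model.predict_proba(xtest.fillna(test_na)))\n", | |
| "    cal_clf = calibrated_classifier(df_train, label, algorithm, na)\n", | |
| "    cal = log_loss(ytest, cal_clf.predict_proba(xtest.fillna(test_na)))\n", | |
| "    print \"raw: %.4f  calibrated: %.4f\" % (raw, cal)\n", | |
| "\n", | |
| "#calibration_gain('extra_trees', -999)" | |
| ] | |
| }, | |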
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def bagging_classifier(algorithm, na):\n", | |
| " from sklearn.metrics import log_loss\n", | |
| " xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(df_train, label, test_size=0.20, random_state=0)\n", | |
| " \n", | |
| " model, params = sklearn_ml_model(algorithm)\n", | |
| " \n", | |
| " clfbag = ensemble.BaggingClassifier(model, n_estimators=5)\n", | |
| " clfbag.fit(Xtrain.fillna(na), ytrain)\n", | |
| " ypreds = clfbag.predict_proba(Xtest.fillna(na))\n", | |
| " print \"%.4f\" % log_loss(ytest, ypreds, eps=1e-15, normalize=True)\n", | |
| " return clfbag\n", | |
| " \n", | |
| "clf_bag = bagging_classifier(\"extra_trees\", -999)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def average_predictions(paths):\n", | |
| " #with open(paths) as f:\n", | |
| " values = pandas.read_csv(paths[0], names = '0', skiprows = 1)\n", | |
| " for index, path in enumerate(paths[1:]):\n", | |
| " new_df = pandas.read_csv(path, names = str(index + 1), skiprows = 1)\n", | |
| " values = pandas.concat([values, new_df], axis=1)\n", | |
| " values.mean(axis = 1).to_csv(\"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/average.csv\")\n", | |
| "\n", | |
| "files = [\"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/extratrees\",\n", | |
| " \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/preds_blend.csv\",\n", | |
| " \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/xgboost_labelencode_100rounds\", \n", | |
| " \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/predictions_extratrees_250features_calibrated\"]\n", | |
| "average_predictions(files)" | |
| ] | |
| }, | |
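| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "#Safer averaging sketch (hypothetical helper): average_predictions assumes\n", | |
| "#every file lists rows in the same order; merging on the ID column makes\n", | |
| "#the alignment explicit before taking the row-wise mean.\n", | |
| "def average_predictions_by_id(paths, out_path):\n", | |
| "    merged = pandas.read_csv(paths[0], names = ['ID', 'p0'], skiprows = 1)\n", | |
| "    for i, path in enumerate(paths[1:]):\n", | |
| "        nxt = pandas.read_csv(path, names = ['ID', 'p%d' % (i + 1)], skiprows = 1)\n", | |
| "        merged = merged.merge(nxt, on = 'ID')\n", | |
| "    merged['PredictedProb'] = merged.drop('ID', axis = 1).mean(axis = 1)\n", | |
| "    merged[['ID', 'PredictedProb']].to_csv(out_path, index = False)" | |
| ] | |
| }, | |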
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def stacking(algorithm, na):\n", | |
| " x_a, x_b, y_a, y_b = cross_validation.train_test_split(df_train, label, test_size=0.50, random_state=0)\n", | |
| " model, params = sklearn_ml_model(algorithm)\n", | |
| " \n", | |
| " #split training set into folds, train on folds, predict out of fold\n", | |
| " x_a_na, x_b_na = na_find(x_a, x_b, na)\n", | |
| " model.fit(x_a.fillna(x_a_na), y_a)\n", | |
| " x_b_preds = model.predict_proba(x_b.fillna(x_b_na))\n", | |
| " model.fit(x_b.fillna(x_b_na), y_b)\n", | |
| " x_a_preds = model.predict_proba(x_a.fillna(x_a_na))\n", | |
| " \n", | |
| " #make new column of predictions for training set\n", | |
| " x_a_df = pandas.DataFrame(x_a_preds[:,1], columns = [algorithm])\n", | |
| " x_b_df = pandas.DataFrame(x_b_preds[:,1], columns = [algorithm])\n", | |
| " new_df = pandas.concat([x_a_df, x_b_df], axis=0)\n", | |
| " return new_df\n", | |
| " \n", | |
| " train_na, test_na = na_find(df_train, df_test, na)\n", | |
| " model.fit(df_train.fillna(train_na), label)\n", | |
| " test_preds = model.predict_proba(df_test.fillna(test_na))\n", | |
| " \n", | |
| " #make new column of predictions for test set\n", | |
| " test_df = pandas.DataFrame(test_preds[:,1], columns = [algorithm])\n", | |
| "\n", | |
| " def combiner(algorithm):\n", | |
| " model, params = sklearn_ml_model(algorithm)\n", | |
| " #train_na, test_na = na_find(df_train, df_test, na)\n", | |
| " model.fit(new_df, label)\n", | |
| " final_preds = model.predict_proba(test_df)\n", | |
| " return final_preds\n", | |
| " \n", | |
| " return combiner('logistic') #try logistic regression here\n", | |
| "\n", | |
| "\n", | |
| "output = stacking('extra_trees', -999)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from __future__ import division\n", | |
| "\n", | |
| "def blending(na):\n", | |
| " numpy.random.seed(0) # seed to shuffle the train set\n", | |
| " \n", | |
| " global label, df_train, df_test\n", | |
| " #print len(df_train), len(label)\n", | |
| " n_folds = 10\n", | |
| " verbose = True\n", | |
| " shuffle = False\n", | |
| "\n", | |
| " #X, y, X_submission = load_data.load()\n", | |
| "\n", | |
| " if shuffle:\n", | |
| " idx = np.random.permutation(label.size)\n", | |
| " df_train = df_train[idx]\n", | |
| " label = label[idx]\n", | |
| "\n", | |
| " skf = list(cross_validation.StratifiedKFold(label, n_folds))\n", | |
| "\n", | |
| " clfs = [ensemble.RandomForestClassifier(n_estimators=100, n_jobs=3, criterion='gini')]#,\n", | |
| " #ensemble.RandomForestClassifier(n_estimators=100, n_jobs=3, criterion='entropy'),\n", | |
| " #ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=3, criterion='gini'),\n", | |
| " #ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=3, criterion='entropy'),\n", | |
| " #ensemble.GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50)]\n", | |
| "\n", | |
| " print \"Creating train and test sets for blending.\"\n", | |
| " \n", | |
| " dataset_blend_train = numpy.zeros((df_train.shape[0], len(clfs)))\n", | |
| " dataset_blend_test = numpy.zeros((df_test.shape[0], len(clfs)))\n", | |
| " \n", | |
| " for j, clf in enumerate(clfs):\n", | |
| " print j, clf\n", | |
| " dataset_blend_test_j = numpy.zeros((df_test.shape[0], len(skf)))\n", | |
| " for i, (train, test) in enumerate(skf):\n", | |
| " print \"Fold\", i\n", | |
| " X_train = df_train.loc[train, :]\n", | |
| " y_train = label[train]\n", | |
| " X_test = df_train.loc[test, :]\n", | |
| " y_test = label[test]\n", | |
| " cal_clf = calibrated_classifier(X_train, y_train, '', na, clf)\n", | |
| " cal_clf.fit(X_train.fillna(-999), y_train)\n", | |
| " y_submission = cal_clf.predict_proba(X_test.fillna(-999))[:,1]\n", | |
| " dataset_blend_train[test, j] = y_submission\n", | |
| " dataset_blend_test_j[:, i] = cal_clf.predict_proba(df_test.fillna(-999))[:,1]\n", | |
| " dataset_blend_test[:,j] = dataset_blend_test_j.mean(1)\n", | |
| "\n", | |
| " print\n", | |
| " print \"Blending.\"\n", | |
| " clf = linear_model.LogisticRegression()\n", | |
| " clf.fit(dataset_blend_train, label)\n", | |
| " y_submission = clf.predict_proba(dataset_blend_test)[:,1]\n", | |
| "\n", | |
| " print \"Linear stretch of predictions to [0,1]\"\n", | |
| " y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())\n", | |
| "\n", | |
| " print \"Saving Results.\"\n", | |
| " numpy.savetxt(fname='blend_cal.csv', X=y_submission, fmt='%0.9f')\n", | |
| " numpy.savetxt(fname='dataset_blend_train_cal', X=dataset_blend_train, fmt='%0.9f')\n", | |
| " numpy.savetxt(fname='dataset_blend_test_cal', X=dataset_blend_test, fmt='%0.9f')\n", | |
| " return y_submission\n", | |
| "\n", | |
| "#~4 hours 3:37\n", | |
| "train = blending(-999)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "a = numpy.loadtxt(fname='blend.csv')\n", | |
| "\n", | |
| "ids_df = pandas.DataFrame(ids)\n", | |
| "\n", | |
| "new_df = pandas.concat([ids_df, a], axis=1)\n", | |
| "\n", | |
| "new_df.to_csv(\"preds_blend\", index = False)\n", | |
| "#still need first line of text" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 2", | |
| "language": "python", | |
| "name": "python2" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 2 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython2", | |
| "version": "2.7.11" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |