Skip to content

Instantly share code, notes, and snippets.

@dimuthnc
Last active August 2, 2017 17:29
Show Gist options
  • Select an option

  • Save dimuthnc/4027420e54b109eb81815b41f98c821e to your computer and use it in GitHub Desktop.

Select an option

Save dimuthnc/4027420e54b109eb81815b41f98c821e to your computer and use it in GitHub Desktop.
from sklearn import linear_model
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
def evaluate(train_set, features, a, n_rounds=10):
    """Estimate the hold-out mean absolute error of a Lasso model.

    Repeats a random 80/20 train/test split of ``train_set`` ``n_rounds``
    times. In every round a ``Lasso(alpha=a)`` model is fitted on the
    training part using the ``features`` columns, its predictions for the
    held-out part are rounded to whole numbers, and the mean absolute error
    against the ``total_cases`` column is recorded.

    Args:
        train_set: DataFrame containing the ``features`` columns and a
            ``total_cases`` column.
        features: List of column names used as model inputs.
        a: Regularisation strength handed to ``Lasso`` as ``alpha``.
        n_rounds: Number of random splits to average over. Defaults to 10,
            the value that used to be hard-coded.

    Returns:
        The mean absolute error averaged over all rounds.

    Raises:
        ValueError: If ``n_rounds`` is smaller than 1.
    """
    if n_rounds < 1:
        raise ValueError("n_rounds must be a positive integer")

    total_score = 0.0
    for _ in range(n_rounds):
        # No random_state is given, so every round draws a fresh split.
        train, test = train_test_split(train_set, train_size=0.8)

        model = linear_model.Lasso(alpha=a)
        model.fit(train[features], train['total_cases'])

        # Predictions are rounded to whole numbers before scoring.
        predicted = np.array(
            [int(round(p)) for p in model.predict(test[features])]
        )
        # Compare by position on the raw values: after a shuffled split the
        # Series index no longer runs 0..n-1, so integer lookups through the
        # index (as in test_target[i]) are not positional.
        actual = test['total_cases'].values

        total_score += np.abs(predicted - actual).mean()

    return total_score / float(n_rounds)
# Input columns used by each city's model.
features_sj = [
    'reanalysis_specific_humidity_g_per_kg',
    'reanalysis_dew_point_temp_k',
    'station_avg_temp_c',
    'reanalysis_max_air_temp_k',
]
features_iq = [
    'reanalysis_specific_humidity_g_per_kg',
    'reanalysis_dew_point_temp_k',
    'reanalysis_min_air_temp_k',
    'station_min_temp_c',
]

# Training features. The first three CSV columns form the index; the
# outermost level is selected with 'sj' / 'iq' below.
df = pd.read_csv('Data/lag_dengue_features_train.csv', index_col=[0, 1, 2])
# DataFrame.ffill() is the non-deprecated spelling of fillna(method='ffill').
# NOTE(review): the fill runs over the whole frame before the per-city split,
# so a gap at the start of one city's rows can be filled from the rows stored
# just above it (possibly the other city) - confirm this is intended.
df.ffill(inplace=True)
sj = df.loc['sj'][features_sj]
iq = df.loc['iq'][features_iq]

# Test features, prepared exactly like the training features.
df_test = pd.read_csv('Data/lag_dengue_features_test.csv', index_col=[0, 1, 2])
df_test.ffill(inplace=True)
sj_test = df_test.loc['sj'][features_sj]
iq_test = df_test.loc['iq'][features_iq]

# Training labels, split per city the same way as the features.
df_labels = pd.read_csv('Data/lag_dengue_labels_train.csv', index_col=[0, 1, 2])
sj_labels = df_labels.loc['sj']
iq_labels = df_labels.loc['iq']
# Grid search: pick, separately for each city, the Lasso alpha with the
# lowest averaged hold-out MAE reported by evaluate().
print("\n Testing models \n")
alphas = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8]

# Start from +inf rather than an arbitrary cap (previously 1000) so that any
# finite score, however large, is still picked up and reported.
bestScore_sj = float('inf')
bestScore_iq = float('inf')
bestAlpha_sj = 0.1
bestAlpha_iq = 0.1

# The feature/label join does not depend on alpha, so build it once.
sj_train = sj.join(sj_labels)
iq_train = iq.join(iq_labels)

for alpha in alphas:
    sj_score = evaluate(sj_train, features_sj, alpha)
    if sj_score < bestScore_sj:
        bestScore_sj = sj_score
        bestAlpha_sj = alpha

    iq_score = evaluate(iq_train, features_iq, alpha)
    if iq_score < bestScore_iq:
        bestScore_iq = iq_score
        bestAlpha_iq = alpha

print("Best Alpha Values \n")
print(bestAlpha_sj)
print(bestAlpha_iq)
print("Best Score Values \n")
print(bestScore_sj)
print(bestScore_iq)
# Refit one Lasso model per city on all training rows with the selected alpha.
# NOTE(review): fitting on .values pairs features and labels by row position;
# this assumes both frames list the same rows in the same order - confirm.
model_sj = linear_model.Lasso(alpha=bestAlpha_sj)
model_iq = linear_model.Lasso(alpha=bestAlpha_iq)
model_sj.fit(sj.values, sj_labels['total_cases'])
model_iq.fit(iq.values, iq_labels['total_cases'])

# Predict on plain arrays as well, so fit and predict receive the same kind
# of input (the models were fitted without column names).
results_sj = model_sj.predict(sj_test.values)
results_iq = model_iq.predict(iq_test.values)

# Round to whole case counts and floor negative predictions at zero.
results = [
    max(0, int(round(p)))
    for p in np.concatenate([results_sj, results_iq])
]

# NOTE(review): predictions are written as 'sj' rows followed by 'iq' rows;
# presumably the submission template uses that row order - verify.
submission = pd.read_csv("Data/submission_format.csv", index_col=[0, 1, 2])
# Item assignment always targets the column; attribute assignment would only
# set a plain attribute if the column were missing.
submission['total_cases'] = results
submission.to_csv("scikit-new_data-lasso.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment