Model Building
# Importing Libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from collections import Counter
from category_encoders import TargetEncoder
from sklearn.metrics import mean_squared_error
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
# ----- Added Info --------------
# Value counts of a categorical column
train_data['income'].value_counts()
# Counting nulls in a specific column
train_data['capital-loss'].isnull().sum()
# train test split
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state= 45, shuffle=True, test_size= 0.25)
# One-hot encoding the categorical columns
one_hot_encoded_data = pd.get_dummies(train_data, columns=['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income'])
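# A minimal sketch of what get_dummies produces, on a toy frame (not the real data):
demo = pd.DataFrame({'sex': ['Male', 'Female', 'Male'], 'age': [25, 32, 47]})
print(pd.get_dummies(demo, columns=['sex']))
# -> columns: age, sex_Female, sex_Male (numeric columns pass through untouched)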
# Dropping the most frequently occurring dummy from each encoded column (the baseline category)
dropped_features = one_hot_encoded_data.drop(['workclass_ Private','education_ HS-grad', 'marital-status_ Married-civ-spouse', 'occupation_ Adm-clerical', 'relationship_ Husband', 'race_ White', 'sex_ Male', 'native-country_ United-States', 'income_ <=50K'], axis=1)
# Dropping rows where 'source' is 100 or less
ss = df[df['source'] <= 100].index
df.drop(ss, inplace=True)
# Replace the string 'null' with np.nan, then drop rows with missing values
df['bala'] = df['bala'].replace('null', np.nan)
df = df.dropna()
# Oversampling with imbalanced-learn: SMOTENC handles mixed categorical and continuous features, so don't encode the categoricals before balancing
from imblearn.over_sampling import SMOTENC
smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0)
X_resampled, y_resampled = smote_nc.fit_resample(X, y)
print(sorted(Counter(y_resampled).items()))
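# Self-contained sketch of the SMOTENC call above, on synthetic data
# (column names, sizes, and the 9:1 imbalance are assumptions for illustration):
rng = np.random.RandomState(0)
X_demo = pd.DataFrame({
    'colour': rng.choice(['red', 'blue'], size=200),  # categorical feature (index 0)
    'amount': rng.randn(200),                         # continuous feature
})
y_demo = np.array([0] * 180 + [1] * 20)               # imbalanced target
sm = SMOTENC(categorical_features=[0], random_state=0)
X_bal, y_bal = sm.fit_resample(X_demo, y_demo)
print(sorted(Counter(y_bal).items()))                 # -> [(0, 180), (1, 180)]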
# Categorical encoding with CatBoostEncoder from category_encoders
import category_encoders as ce
target_enc = ce.CatBoostEncoder()
X_train_enc = target_enc.fit_transform(X_train, Y_train)
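# At predict time, reuse the fitted encoder rather than refitting
# (sketch; assumes X_test carries the same categorical columns as X_train):
X_test_enc = target_enc.transform(X_test)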
# ----- End Added Info ---------------------------
# Importing CSV Files
train_csv = pd.read_csv("train.csv")
# Inspect the data: null counts, column dtypes, number of columns
print(train_csv.info())
# View rows and columns
print(train_csv.shape)
# Separating features and target
X = train_csv.drop(['result'], axis = 1)
Y = train_csv['result']
# Separating the columns by dtype
numeric = X.select_dtypes(include=['float64'])
string_type = X.select_dtypes(include = ['object'])
print(X.shape)
# print(numeric)
# print(string_type)
# Imputing null values in the numeric columns with the mean
mean_impute = SimpleImputer(strategy='mean')
ss = mean_impute.fit_transform(numeric)
# fit_transform returns a NumPy array; convert back to a DataFrame, keeping the column names
ss = pd.DataFrame(ss, columns=numeric.columns)
# Listing the string-type columns
print(string_type.columns)
# Counting the unique values of a string column
print(string_type['unit'].nunique())
# Dropping the high-cardinality 'id' column
removed_string_type_id = string_type.drop(['id'], axis = 1)
# Target-encoding the categorical columns
encode_category = TargetEncoder()
gg = encode_category.fit_transform(removed_string_type_id, Y)
# Join the encoded string columns with the imputed numeric columns (join aligns on index, so both frames must share the default RangeIndex; see the sketch below)
output = gg.join(ss)
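# Sketch of the index-alignment caveat above: join matches rows by index, not
# by position, so it pairs rows correctly here only because both toy frames
# (like gg and ss) carry the default RangeIndex.
a = pd.DataFrame({'enc': [0.1, 0.2]})
b = pd.DataFrame({'num': [7.0, 8.0]})
print(a.join(b))  # two rows, columns enc and num, paired by the shared 0..1 index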
# Pick one model: as written, the XGBRegressor assignment below overwrites the RandomForest
model = RandomForestRegressor(n_estimators=500)
model = XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1, alpha=10, n_estimators=1000)
# If you are using XGBoost, convert to contiguous NumPy arrays
output = np.ascontiguousarray(output)
Y = np.ascontiguousarray(Y)
# fit the model
model.fit(output, Y)
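# Hedged sketch: mean_squared_error is imported above but never used; one way to
# put it to work is a holdout evaluation before refitting on all the data.
# (Variable names and the split parameters below are assumptions.)
X_tr, X_val, y_tr, y_val = train_test_split(output, Y, random_state=45, test_size=0.25)
model.fit(X_tr, y_tr)
val_pred = model.predict(X_val)
print("Validation RMSE:", np.sqrt(mean_squared_error(y_val, val_pred)))
# Refit on the full training data before predicting the real test set
model.fit(output, Y)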
# Applying the same transformations to the test dataset
test_csv = pd.read_csv("test.csv")
X_Test = test_csv
Y_Test = train_csv['result']  # unused; note this copies the train target, since real test labels are typically unavailable
numeric_test = X_Test.select_dtypes(include=['float64'])
string_type_test = X_Test.select_dtypes(include = ['object'])
# Numeric imputation on the test set (transform, not fit_transform, so the train means are reused)
ss_test = mean_impute.transform(numeric_test)
ss_test = pd.DataFrame(ss_test, columns=numeric_test.columns)
removed_string_type_id_test = string_type_test.drop(['id'], axis = 1)
# Encoding the test set's string columns with the fitted encoder
gg_test = encode_category.transform(removed_string_type_id_test)
# Join the test set
output_test = gg_test.join(ss_test)
# Contiguous array for XGBoost, as above
output_test = np.ascontiguousarray(output_test)
# Predicting the test set
predictions = model.predict(output_test)
last_df = pd.DataFrame(zip(string_type_test['id'], predictions), columns= ['id', 'result'])
last_df.to_csv("submit.csv", index= False)
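# Optional sketch: inspect which features the fitted model leans on.
# (Column order matches the joined frame, gg's columns followed by the numeric
# ones, since the ascontiguousarray conversion dropped the names.)
for name, imp in zip(list(gg.columns) + list(numeric.columns), model.feature_importances_):
    print(f"{name}: {imp:.4f}")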