Last active
December 11, 2021 23:58
-
-
Save ngbala6/1c28dab2b7f0f691eace81f500470f8a to your computer and use it in GitHub Desktop.
Model Building
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Importing Libraries | |
| import pandas as pd | |
| from sklearn.ensemble import RandomForestRegressor | |
| from sklearn.impute import SimpleImputer | |
| from collections import Counter | |
| from category_encoders import TargetEncoder | |
| from sklearn.metrics import mean_squared_error | |
| import numpy as np | |
| from xgboost import XGBRegressor | |
| from sklearn.model_selection import train_test_split | |
| # ----- Added Info-------------- | |
| # Finding the counts of categorical Columns | |
| train_data['income'].value_counts() | |
| # Finding Isnull of certain column | |
| train_data['capital-loss'].isnull().sum | |
| # train test split | |
| x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state= 45, shuffle=True, test_size= 0.25) | |
| # One hot encoding | |
| one_hot_encoded_data = pd.get_dummies(train_data , ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']) | |
| # Finding the counts of categorical Columns | |
| train_data['income'].value_counts() | |
| # dropping the columns which is more occuring | |
| dropped_features = one_hot_encoded_data.drop(['workclass_ Private','education_ HS-grad', 'marital-status_ Married-civ-spouse', 'occupation_ Adm-clerical', 'relationship_ Husband', 'race_ White', 'sex_ Male', 'native-country_ United-States', 'income_ <=50K'], axis=1) | |
| # DELETING THE TARGET if less than 100 | |
| ss = df[df['source'] <= 100].index | |
| # After drop, view df | |
| df.drop(ss, inplace=True) | |
| # change null to np.nan | |
| df['bala'] = df['bala'].replace('null', np.nan) | |
| df.dropna() | |
| # For Imbalanced Learn - Oversampling - Categorical and continuous features, don't do encoding for categorical before balance | |
| from imblearn.over_sampling import SMOTENC | |
| smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0) | |
| X_resampled, y_resampled = smote_nc.fit_resample(X, y) | |
| print(sorted(Counter(y_resampled).items())) | |
| # Categorical Encoder using Catboost | |
| import category_encoders as ce | |
| target_enc = ce.CatBoostEncoder() | |
| target_enc.fit_transform(X_train, Y_train) | |
| # END ADDED INFO --------------------------- | |
| # Importing CSV Files | |
| train_csv = pd.read_csv("train.csv") | |
| # View Info about data like Null values, type of datatype, how many columns | |
| print(train_csv.info()) | |
| # View Rows and Columns | |
| print(df.shape) | |
| # Seperating Features and targets | |
| X = train_csv.drop(['result'], axis = 1) | |
| Y = train_csv['result'] | |
| # Seperating the data Based on the datatype | |
| numeric = X.select_dtypes(include=['float64']) | |
| string_type = X.select_dtypes(include = ['object']) | |
| print(X.shape) | |
| # print(numeric) | |
| # print(string_type) | |
| # Imputing using mean for null values | |
| mean_impute = SimpleImputer(strategy='mean') | |
| ss = mean_impute.fit_transform(numeric) | |
| # output of ss is basically a numpy array, we convert to dataframe | |
| ss = pd.DataFrame(ss) | |
| # Finding columns of the data | |
| print(string_type.columns) | |
| # Finding the Unique values of the String Columns | |
| print(string_type['unit'].nunique()) | |
| # Dropping the Column which is more unique | |
| removed_string_type_id = string_type.drop(['id'], axis = 1) | |
| # Encoding the Categorical Column | |
| encode_category = TargetEncoder() | |
| gg = encode_category.fit_transform(removed_string_type_id, Y) | |
| # Join the Numeric and String Encoded columns | |
| output = gg.join(ss) | |
| model = RandomForestRegressor(n_estimators=500) | |
| model = XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1, alpha = 10, n_estimators = 1000) | |
| # If you are using Xgboost use this | |
| output = np.ascontiguousarray(output) | |
| Y = np.ascontiguousarray(Y) | |
| # fit the model | |
| model.fit(output, Y) | |
| # for Test dataset changing the formats | |
| test_csv = pd.read_csv("test.csv") | |
| X_Test = test_csv | |
| Y_Test = train_csv['result'] | |
| numeric_test = X_Test.select_dtypes(include=['float64']) | |
| string_type_test = X_Test.select_dtypes(include = ['object']) | |
| # encoding the test set - numeric values imputation | |
| ss_test = mean_impute.transform(numeric_test) | |
| ss_test = pd.DataFrame(ss_test) | |
| removed_string_type_id_test = string_type_test.drop(['id'], axis = 1) | |
| # encoding the test set string | |
| gg_test = encode_category.transform(removed_string_type_id_test) | |
| # Join the test set | |
| output_test = gg_test.join(ss_test) | |
| # Features test | |
| output_test = np.ascontiguousarray(output_test) | |
| # Predicting the test set | |
| predictions = model.predict(output_test) | |
| last_df = pd.DataFrame(zip(string_type_test['id'], predictions), columns= ['id', 'result']) | |
| last_df.to_csv("submit.csv", index= False) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment