# ngbala6/Simplifygist.py

## Simplifygist.py
# Importing Libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from collections import Counter
from category_encoders import TargetEncoder
from sklearn.metrics import mean_squared_error
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

# ----- Added Info--------------

# Stand-alone exploration / preprocessing snippets.
# NOTE(review): `train_data`, `df`, `X` and `Y` are not defined in this
# section; each snippet presumably expects them to be loaded already
# (e.g. in a notebook session) -- confirm before running as a script.

# Count how often each category occurs in a categorical column
train_data['income'].value_counts()

# Count the missing values of one column
# (`sum` must be called; a bare `.sum` only references the method)
train_data['capital-loss'].isnull().sum()

# Train/test split: 25% held out, shuffled with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, random_state=45, shuffle=True, test_size=0.25
)

# One-hot encoding
# The column list is passed as `columns=`: the second positional parameter
# of `pd.get_dummies` is `prefix`, not the list of columns to encode.
one_hot_encoded_data = pd.get_dummies(
    train_data,
    columns=[
        'workclass',
        'education',
        'marital-status',
        'occupation',
        'relationship',
        'race',
        'sex',
        'native-country',
        'income',
    ],
)

# Count how often each category occurs in a categorical column
train_data['income'].value_counts()

# Drop one dummy per encoded column -- the most frequently occurring
# category, per the original note
dropped_features = one_hot_encoded_data.drop(
    [
        'workclass_ Private',
        'education_ HS-grad',
        'marital-status_ Married-civ-spouse',
        'occupation_ Adm-clerical',
        'relationship_ Husband',
        'race_ White',
        'sex_ Male',
        'native-country_ United-States',
        'income_ <=50K',
    ],
    axis=1,
)

# Select the rows whose 'source' value is at most 100 ...
ss = df[df['source'] <= 100].index

# ... and remove them from df in place
df.drop(ss, inplace=True)

# Turn the literal string 'null' into a real missing value
df['bala'] = df['bala'].replace('null', np.nan)

# Drop rows with missing values; done in place because a bare `df.dropna()`
# returns a new frame and leaves `df` untouched
df.dropna(inplace=True)

# Imbalanced-learn oversampling for mixed categorical / continuous features.
# Do not encode the categorical columns before balancing.
from imblearn.over_sampling import SMOTENC
# Columns at positions 0 and 2 are declared categorical
smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0)
# Uses the same X / Y names as the train/test split above
X_resampled, y_resampled = smote_nc.fit_resample(X, Y)
print(sorted(Counter(y_resampled).items()))

# Categorical encoding with the CatBoost encoder
import category_encoders as ce

target_enc = ce.CatBoostEncoder()
# Fit on the training split produced by train_test_split above
target_enc.fit_transform(x_train, y_train)

# END ADDED INFO ---------------------------


# Training pipeline: load train.csv, impute the float columns, target-encode
# the string columns, then fit the regressor.

# Load the training data
train_csv = pd.read_csv("train.csv")

# Show null counts, dtypes and column count.
# `DataFrame.info()` prints by itself and returns None, so it is not wrapped
# in print() (that would emit an extra "None" line).
train_csv.info()

# Show (rows, columns) of the training data
print(train_csv.shape)

# Separate features and target
X = train_csv.drop(['result'], axis=1)
Y = train_csv['result']

# Separate the features by dtype
numeric = X.select_dtypes(include=['float64'])
string_type = X.select_dtypes(include=['object'])

print(X.shape)

# Impute null values with the column mean
mean_impute = SimpleImputer(strategy='mean')

ss = mean_impute.fit_transform(numeric)
# The imputer returns a numpy array; wrap it in a DataFrame again and keep
# the original row index so the index-based join below lines up row by row
ss = pd.DataFrame(ss, index=numeric.index)

# List the string columns
print(string_type.columns)

# Number of unique values in the 'unit' column
print(string_type['unit'].nunique())

# Drop the 'id' column (too many unique values to be useful as a feature)
removed_string_type_id = string_type.drop(['id'], axis=1)

# Target-encode the categorical columns
encode_category = TargetEncoder()
gg = encode_category.fit_transform(removed_string_type_id, Y)

# Join the encoded string columns with the imputed numeric columns
output = gg.join(ss)

# Model. RandomForestRegressor(n_estimators=500) is the alternative noted in
# the original script; it was assigned and then immediately overwritten, so
# only the XGBoost model is instantiated here.
model = XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    alpha=10,
    n_estimators=1000,
)

# XGBoost is given contiguous arrays
output = np.ascontiguousarray(output)
Y = np.ascontiguousarray(Y)

# Fit the model
model.fit(output, Y)

# Test pipeline: apply the already-fitted imputer, encoder and model to
# test.csv and write the predictions to submit.csv.
test_csv = pd.read_csv("test.csv")

X_Test = test_csv
# The original also set `Y_Test = train_csv['result']`, i.e. the *training*
# target under a test name. It was never used and is misleading, so it is
# dropped here.

numeric_test = X_Test.select_dtypes(include=['float64'])
string_type_test = X_Test.select_dtypes(include=['object'])

# Impute the numeric test columns with the means learned on the training set
ss_test = mean_impute.transform(numeric_test)

# Keep the test row index so the index-based join below lines up row by row
ss_test = pd.DataFrame(ss_test, index=numeric_test.index)

removed_string_type_id_test = string_type_test.drop(['id'], axis=1)

# Encode the test string columns with the encoder fitted on the training set
gg_test = encode_category.transform(removed_string_type_id_test)

# Join encoded string columns and imputed numeric columns
output_test = gg_test.join(ss_test)

# Contiguous feature array, same format the model was fitted on
output_test = np.ascontiguousarray(output_test)

# Predict on the test set
predictions = model.predict(output_test)

# Pair every test id with its prediction and write the submission file
last_df = pd.DataFrame(
    zip(string_type_test['id'], predictions),
    columns=['id', 'result'],
)

last_df.to_csv("submit.csv", index=False)
	# Importing Libraries
	import pandas as pd
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.impute import SimpleImputer
	from collections import Counter
	from category_encoders import TargetEncoder
	from sklearn.metrics import mean_squared_error
	import numpy as np
	from xgboost import XGBRegressor
	from sklearn.model_selection import train_test_split

	# ----- Added Info--------------

	# Stand-alone exploration / preprocessing snippets.
	# NOTE(review): this tab-indented section repeats the unindented snippets
	# earlier in the file; it looks like a paste/export artifact -- confirm.
	# NOTE(review): `train_data`, `df`, `X` and `Y` are not defined in this
	# section; each snippet presumably expects them to be loaded already.

	# Count how often each category occurs in a categorical column
	train_data['income'].value_counts()

	# Count the missing values of one column
	# (`sum` must be called; a bare `.sum` only references the method)
	train_data['capital-loss'].isnull().sum()

	# Train/test split: 25% held out, shuffled with a fixed seed
	x_train, x_test, y_train, y_test = train_test_split(
	    X, Y, random_state=45, shuffle=True, test_size=0.25
	)

	# One-hot encoding
	# The column list is passed as `columns=`: the second positional parameter
	# of `pd.get_dummies` is `prefix`, not the list of columns to encode.
	one_hot_encoded_data = pd.get_dummies(
	    train_data,
	    columns=[
	        'workclass',
	        'education',
	        'marital-status',
	        'occupation',
	        'relationship',
	        'race',
	        'sex',
	        'native-country',
	        'income',
	    ],
	)

	# Count how often each category occurs in a categorical column
	train_data['income'].value_counts()

	# Drop one dummy per encoded column -- the most frequently occurring
	# category, per the original note
	dropped_features = one_hot_encoded_data.drop(
	    [
	        'workclass_ Private',
	        'education_ HS-grad',
	        'marital-status_ Married-civ-spouse',
	        'occupation_ Adm-clerical',
	        'relationship_ Husband',
	        'race_ White',
	        'sex_ Male',
	        'native-country_ United-States',
	        'income_ <=50K',
	    ],
	    axis=1,
	)

	# Select the rows whose 'source' value is at most 100 ...
	ss = df[df['source'] <= 100].index

	# ... and remove them from df in place
	df.drop(ss, inplace=True)

	# Turn the literal string 'null' into a real missing value
	df['bala'] = df['bala'].replace('null', np.nan)

	# Drop rows with missing values; done in place because a bare
	# `df.dropna()` returns a new frame and leaves `df` untouched
	df.dropna(inplace=True)

	# Imbalanced-learn oversampling for mixed categorical / continuous
	# features. Do not encode the categorical columns before balancing.
	from imblearn.over_sampling import SMOTENC
	# Columns at positions 0 and 2 are declared categorical
	smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0)
	# Uses the same X / Y names as the train/test split above
	X_resampled, y_resampled = smote_nc.fit_resample(X, Y)
	print(sorted(Counter(y_resampled).items()))

	# Categorical encoding with the CatBoost encoder
	import category_encoders as ce

	target_enc = ce.CatBoostEncoder()
	# Fit on the training split produced by train_test_split above
	target_enc.fit_transform(x_train, y_train)

	# END ADDED INFO ---------------------------




	# Training pipeline: load train.csv, impute the float columns,
	# target-encode the string columns, then fit the regressor.
	# NOTE(review): this tab-indented section repeats the unindented pipeline
	# earlier in the file; it looks like a paste/export artifact -- confirm.

	# Load the training data
	train_csv = pd.read_csv("train.csv")

	# Show null counts, dtypes and column count.
	# `DataFrame.info()` prints by itself and returns None, so it is not
	# wrapped in print() (that would emit an extra "None" line).
	train_csv.info()

	# Show (rows, columns) of the training data
	print(train_csv.shape)

	# Separate features and target
	X = train_csv.drop(['result'], axis=1)
	Y = train_csv['result']

	# Separate the features by dtype
	numeric = X.select_dtypes(include=['float64'])
	string_type = X.select_dtypes(include=['object'])

	print(X.shape)

	# Impute null values with the column mean
	mean_impute = SimpleImputer(strategy='mean')

	ss = mean_impute.fit_transform(numeric)
	# The imputer returns a numpy array; wrap it in a DataFrame again and keep
	# the original row index so the index-based join below lines up row by row
	ss = pd.DataFrame(ss, index=numeric.index)

	# List the string columns
	print(string_type.columns)

	# Number of unique values in the 'unit' column
	print(string_type['unit'].nunique())

	# Drop the 'id' column (too many unique values to be useful as a feature)
	removed_string_type_id = string_type.drop(['id'], axis=1)

	# Target-encode the categorical columns
	encode_category = TargetEncoder()
	gg = encode_category.fit_transform(removed_string_type_id, Y)

	# Join the encoded string columns with the imputed numeric columns
	output = gg.join(ss)

	# Model. RandomForestRegressor(n_estimators=500) is the alternative noted
	# in the original script; it was assigned and then immediately
	# overwritten, so only the XGBoost model is instantiated here.
	model = XGBRegressor(
	    objective='reg:squarederror',
	    colsample_bytree=0.3,
	    learning_rate=0.1,
	    alpha=10,
	    n_estimators=1000,
	)

	# XGBoost is given contiguous arrays
	output = np.ascontiguousarray(output)
	Y = np.ascontiguousarray(Y)

	# Fit the model
	model.fit(output, Y)

	# Test pipeline: apply the already-fitted imputer, encoder and model to
	# test.csv and write the predictions to submit.csv.
	# NOTE(review): this tab-indented section repeats the unindented pipeline
	# earlier in the file; it looks like a paste/export artifact -- confirm.
	test_csv = pd.read_csv("test.csv")

	X_Test = test_csv
	# The original also set `Y_Test = train_csv['result']`, i.e. the
	# *training* target under a test name. It was never used and is
	# misleading, so it is dropped here.

	numeric_test = X_Test.select_dtypes(include=['float64'])
	string_type_test = X_Test.select_dtypes(include=['object'])

	# Impute the numeric test columns with the means learned on the training set
	ss_test = mean_impute.transform(numeric_test)

	# Keep the test row index so the index-based join below lines up row by row
	ss_test = pd.DataFrame(ss_test, index=numeric_test.index)

	removed_string_type_id_test = string_type_test.drop(['id'], axis=1)

	# Encode the test string columns with the encoder fitted on the training set
	gg_test = encode_category.transform(removed_string_type_id_test)

	# Join encoded string columns and imputed numeric columns
	output_test = gg_test.join(ss_test)

	# Contiguous feature array, same format the model was fitted on
	output_test = np.ascontiguousarray(output_test)

	# Predict on the test set
	predictions = model.predict(output_test)

	# Pair every test id with its prediction and write the submission file
	last_df = pd.DataFrame(
	    zip(string_type_test['id'], predictions),
	    columns=['id', 'result'],
	)

	last_df.to_csv("submit.csv", index=False)
# No results found