Alvira Swalin aswalin

## Shap_Values
# Request per-object SHAP values from `model` for the test set, wrapped in a
# Pool together with its labels and categorical feature indices.
shap_values = model.get_feature_importance(Pool(X_test, label=y_test,cat_features=categorical_features_indices),
                                                                     type="ShapValues")
# The last column is used as the expected (base) value: it is read from the
# first row, then dropped so that only the remaining columns are kept.
expected_value = shap_values[0,-1]
shap_values = shap_values[:,:-1]

# Initialise SHAP's JavaScript support, then draw a force plot for the test
# row at position 3 using its SHAP values and its feature values.
shap.initjs()
shap.force_plot(expected_value, shap_values[3,:], X_test.iloc[3,:])

## catboost.ipynb

      
aswalin / catboost.ipynb
1 file · 8 forks · 0 comments · 10 stars
Last active February 4, 2024 00:35

[Notebook preview unavailable: the file could not be displayed (the viewer requires an iframe).]
      
    
## catboost_feature_imp
# Template call: "___" is a placeholder to be replaced with one of the
# "type" values listed below.
cb = CatBoostRegressor()
cb.get_feature_importance(type= "___")

 "type" possible values:
  - PredictionValuesChange
  - LossFunctionChange
  - FeatureImportance
      Uses PredictionValuesChange for non-ranking metrics and LossFunctionChange for ranking metrics.
  - ShapValues
      Calculates SHAP values for every object.

## catboost
from catboost import *

# Toy dataset: column 0 holds string categories, columns 1 and 2 are numeric.
train_data = [["a", 1, 1], ["b", 3, 0], ["a", 3, 1]]
test_data = [["a", 1, 2]]
train_labels = [10, 20, 30]

model = CatBoostRegressor(iterations=10)
# Column 0 contains strings, so it has to be declared categorical; without
# cat_features CatBoost tries to parse "a"/"b" as floats and fit() fails.
model.fit(train_data, train_labels, cat_features=[0])

## Infersent-Sentence Embedding
import torch

# NOTE(review): torch.load unpickles this checkpoint, which can execute
# arbitrary code -- only load the file if it comes from a trusted source.
# The map_location callable returns the storage unchanged (presumably to keep
# the tensors on CPU -- confirm).
infersent = torch.load('InferSent/encoder/infersent.allnli.pickle', map_location=lambda storage, loc: storage)
# Point the encoder at the GloVe vectors file.
infersent.set_glove_path("InferSent/dataset/GloVe/glove.840B.300d.txt")

# Build the vocabulary from `sentences`, which is defined outside this excerpt.
infersent.build_vocab(sentences, tokenize=True)

dict_embeddings = {}
# In this excerpt the loop only prints each index; nothing is stored in
# dict_embeddings here.
for i in range(len(sentences)):
    print(i)

## ROC-AUC vs F1
from sklearn import metrics
import numpy as np

# Imbalanced ground truth: 100 positives (1.0) followed by 900 negatives (0.0).
y_true = np.concatenate((np.ones(100), np.zeros(900)))

# First prediction vector: the first 5 scores are drawn uniformly between 0.5
# and 1, the remaining 995 between 0 and 0.5.
a = np.random.uniform(0.5,1, 5)
b = np.random.uniform(0,0.5, 995)
y_pred1 = np.concatenate((a,b))

# `a` is redrawn with 90 scores between 0.5 and 1; its use lies outside this
# excerpt.
a = np.random.uniform(0.5,1, 90)

## BLEU Score
from nltk.translate.bleu_score import sentence_bleu
# A single tokenised reference sentence, wrapped in an outer list.
reference = [['the', 'cat',"is","sitting","on","the","mat"]]
# Tokenised candidate sentence to score against the reference.
candidate = ["on",'the',"mat","is","a","cat"]
score = sentence_bleu(  reference, candidate)
print(score)


from nltk.translate.bleu_score import sentence_bleu
reference = [['the', 'cat',"is","sitting","on","the","mat"]]
# Second candidate; no score is computed for it within this excerpt.
candidate = ["there",'is',"cat","sitting","cat"]

## Linear_Regression_Python
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model

def metrics(m, X, y):
    """Print the predictions of ``m`` for ``X`` and compute R-squared against ``y``.

    ``m`` must expose a ``predict`` method. The R-squared value is computed
    but neither printed nor returned here, so the function returns ``None``.
    """
    predictions = m.predict(X)
    print(predictions)

    # Residual sum of squares: squared gaps between targets and predictions.
    residuals = y - predictions
    residual_ss = sum(residuals ** 2)

    # Total sum of squares: squared gaps between targets and their mean.
    deviations = y - np.mean(y)
    total_ss = sum(deviations ** 2)

    r_squared = 1 - float(residual_ss) / total_ss

## CatBoost
import catboost as cb
cat_features_index = [0,1,2,3,4,5,6]

def auc(m, train, test):
    """Return ``(train_auc, test_auc)`` for the model ``m``.

    The scores are the second column of ``m.predict_proba`` and are compared
    with the ``y_train`` / ``y_test`` labels taken from the enclosing scope,
    using ``metrics.roc_auc_score`` (``metrics`` is presumably
    ``sklearn.metrics`` -- confirm against the file's imports).
    """
    train_scores = m.predict_proba(train)[:, 1]
    train_auc = metrics.roc_auc_score(y_train, train_scores)

    test_scores = m.predict_proba(test)[:, 1]
    test_auc = metrics.roc_auc_score(y_test, test_scores)

    return train_auc, test_auc

params = {'depth': [4, 7, 10],
          'learning_rate' : [0.03, 0.1, 0.15],
         'l2_leaf_reg': [1,4,9],

## Data Preparation
import pandas as pd, numpy as np, time
from sklearn.model_selection import train_test_split

# Load the flights table and keep a reproducible 10% random sample of its rows.
data = pd.read_csv("flights.csv").sample(frac=0.1, random_state=10)

# Restrict to the columns used afterwards, then discard rows with missing values.
data = data[
    [
        "MONTH",
        "DAY",
        "DAY_OF_WEEK",
        "AIRLINE",
        "FLIGHT_NUMBER",
        "DESTINATION_AIRPORT",
        "ORIGIN_AIRPORT",
        "AIR_TIME",
        "DEPARTURE_TIME",
        "DISTANCE",
        "ARRIVAL_DELAY",
    ]
].dropna()
	shap_values = model.get_feature_importance(Pool(X_test, label=y_test,cat_features=categorical_features_indices),
	type="ShapValues")
	expected_value = shap_values[0,-1]
	shap_values = shap_values[:,:-1]

	shap.initjs()
	shap.force_plot(expected_value, shap_values[3,:], X_test.iloc[3,:])
	cb = CatBoostRegressor()
	cb.get_feature_importance(type= "___")

	"type" possible values:
	- PredictionValuesChange
	- LossFunctionChange
	- FeatureImportance
	PredictionValuesChange for non-ranking metrics and LossFunctionChange for ranking metrics
	- ShapValues
	Calculate SHAP Values for every object
	from catboost import *

	train_data = [["a", 1, 1], [ "b", 3, 0], [ "a", 3, 1]]
	test_data = [[ "a", 1, 2]]
	train_labels = [10, 20, 30]
	model = CatBoostRegressor(iterations=10)
	model.fit(train_data, train_labels)
	import torch

	infersent = torch.load('InferSent/encoder/infersent.allnli.pickle', map_location=lambda storage, loc: storage)
	infersent.set_glove_path("InferSent/dataset/GloVe/glove.840B.300d.txt")

	infersent.build_vocab(sentences, tokenize=True)

	dict_embeddings = {}
	for i in range(len(sentences)):
	print(i)
	from sklearn import metrics
	import numpy as np

	y_true = np.concatenate((np.ones(100), np.zeros(900)))

	a = np.random.uniform(0.5,1, 5)
	b = np.random.uniform(0,0.5, 995)
	y_pred1 = np.concatenate((a,b))

	a = np.random.uniform(0.5,1, 90)
	from nltk.translate.bleu_score import sentence_bleu
	reference = [['the', 'cat',"is","sitting","on","the","mat"]]
	candidate = ["on",'the',"mat","is","a","cat"]
	score = sentence_bleu( reference, candidate)
	print(score)


	from nltk.translate.bleu_score import sentence_bleu
	reference = [['the', 'cat',"is","sitting","on","the","mat"]]
	candidate = ["there",'is',"cat","sitting","cat"]
	import numpy as np
	import pandas as pd
	from sklearn import datasets, linear_model

	def metrics(m,X,y):
	yhat = m.predict(X)
	print(yhat)
	SS_Residual = sum((y-yhat)**2)
	SS_Total = sum((y-np.mean(y))**2)
	r_squared = 1 - (float(SS_Residual))/SS_Total
	import catboost as cb
	cat_features_index = [0,1,2,3,4,5,6]

	def auc(m, train, test):
	return (metrics.roc_auc_score(y_train,m.predict_proba(train)[:,1]),
	metrics.roc_auc_score(y_test,m.predict_proba(test)[:,1]))

	params = {'depth': [4, 7, 10],
	'learning_rate' : [0.03, 0.1, 0.15],
	'l2_leaf_reg': [1,4,9],
	import pandas as pd, numpy as np, time
	from sklearn.model_selection import train_test_split

	data = pd.read_csv("flights.csv")
	data = data.sample(frac = 0.1, random_state=10)

	data = data[["MONTH","DAY","DAY_OF_WEEK","AIRLINE","FLIGHT_NUMBER","DESTINATION_AIRPORT",
	"ORIGIN_AIRPORT","AIR_TIME", "DEPARTURE_TIME","DISTANCE","ARRIVAL_DELAY"]]
	data.dropna(inplace=True)