Last active
October 19, 2019 13:04
-
-
Save snehankekre/615fa35389fe12037a276550b8a56c7e to your computer and use it in GitHub Desktop.
CS112 Assignment 2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
set.seed(42)

##############
# QUESTION 1 #
##############
library(tidyverse)

# Simulate 998 observations with a weak negative slope and a small
# positive noise term
X <- rnorm(998)
Y <- -0.1 * X + 1 + rnorm(998, 0.05, 0.03)
data <- data.frame(Y, X)

# Baseline regression on the clean simulated data
lm.data <- lm(Y ~ X, data = data)
summary(lm.data)

# Append two high-leverage points far from the cloud and refit
outliers <- data.frame('Y' = c(7, 7), 'X' = c(15, 12))
outliers_data <- rbind(data, outliers)
lm.outliers <- lm(Y ~ X, data = outliers_data)
summary(lm.outliers)

# Overlay both fitted lines to show how two points move the fit
ggplot(data, aes(x = X, y = Y)) +
  geom_smooth(aes(color = 'Initial Regression'), method = lm, se = FALSE) +
  geom_smooth(aes(color = 'With Two Outliers'), data = outliers_data,
              method = lm, se = FALSE) +
  labs(colour = 'Dataset', title = "Regression Models' Sensitivity to Outliers")
#-------------------------------------------------------------
##############
# QUESTION 2a #
##############
set.seed(42)
library(Matching)
data(lalonde)
library(arm)

# Quadratic age term (scaled by 100 to keep coefficients readable)
# and a treatment-by-age interaction
lalonde$age.2 <- lalonde$age^2 / 100
lalonde$treat.age <- lalonde$treat * lalonde$age

# Fit model to training data, then draw 10,000 simulated
# coefficient vectors from its sampling distribution
lm.lalonde <- lm(re78 ~ age + age.2 + educ + treat + treat.age + re74 + re75,
                 data = lalonde)
sim.lalonde <- sim(lm.lalonde, n = 10000)
# Linear predictor for one simulated coefficient draw.
#
# coefs:  length-8 vector — intercept followed by the 7 slope
#         coefficients, in the order of the model formula above.
# values: length-7 vector of predictor values matching coefs[2:8].
#
# Returns coefs[1] + sum(values * slopes). The hand-written
# term-by-term sum is replaced by a vectorized dot product.
intervals <- function(coefs, values) {
  coefs[1] + sum(values * coefs[-1])
}
# 95% expected-value intervals of re78 across ages 17-55.
#
# values: length-7 predictor vector (age, age^2/100, educ, treat,
#         treat*age, re74, re75); the age-dependent slots (1, 2, 5)
#         are overwritten for each age inside the loop.
#
# Relies on sim.lalonde (simulated coefficients from sim()) and
# intervals() defined earlier in this script.
#
# Returns a 39x4 matrix: age, 2.5% bound, 97.5% bound, mean.
prediction_int <- function(values) {
  ages <- 17:55
  vals <- matrix(NA, nrow = length(ages), ncol = 4)
  n.sims <- nrow(sim.lalonde@coef)  # derive from sim object, not hard-coded
  for (i in ages) {
    values[1] <- i
    values[2] <- (i^2) / 100
    values[5] <- i * values[4]  # treat * age interaction
    val_vec <- numeric(n.sims)  # preallocate
    for (j in seq_len(n.sims)) {
      val_vec[j] <- intervals(sim.lalonde@coef[j, ], values)
    }
    # Fix: compute both quantiles in ONE call (the original called
    # quantile() twice); also spell out `probs` (no partial matching)
    bounds <- quantile(val_vec, probs = c(0.025, 0.975))
    vals[i - 16, ] <- c(i, bounds[1], bounds[2], mean(val_vec))
  }
  vals
}
# Treated-unit profile (treat = 1): educ/re74/re75 at sample means;
# NA slots (age terms) are filled in by prediction_int()
mean_vals <- c(NA, NA, mean(lalonde$educ), 1, NA, mean(lalonde$re74), mean(lalonde$re75))
prediction_intervals <- prediction_int(mean_vals)
colnames(prediction_intervals) <- c('Age', '2.5%', '97.5%', 'Mean')
prediction_intervals <- subset(prediction_intervals, select = -c(Mean))
prediction_intervals <- cbind(prediction_intervals,
                              mean(lalonde$educ), mean(lalonde$re74), mean(lalonde$re75))
colnames(prediction_intervals)[4:6] <- c("mean.educ", "mean.re74", "mean.re75")
prediction_intervals

# One vertical segment per age spanning the 95% interval
# Fix: title typo "Treatement" -> "Treatment"
plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17, 55), ylim = c(2000, 15000),
     main = "re78 for Treatment units", xlab = "Age",
     ylab = "re78 (Prediction Interval)")
for (age in 17:55) {
  segments(
    x0 = age,
    y0 = prediction_intervals[age - 16, 2],
    x1 = age,
    y1 = prediction_intervals[age - 16, 3],
    lwd = 2)
}
# Same process as above for CONTROL units
##############
# QUESTION 2b #
##############
# Include control units
# Identical predictor profile to 2a but with treat = 0
mean_vals_b <- c(NA, NA, mean(lalonde$educ), 0, NA, mean(lalonde$re74), mean(lalonde$re75))
prediction_intervals_b <- prediction_int(mean_vals_b)
colnames(prediction_intervals_b) <- c('Age', '2.5%', '97.5%', 'Mean')
prediction_intervals_b <- subset(prediction_intervals_b, select = -c(Mean))
prediction_intervals_b <- cbind(prediction_intervals_b,
                                mean(lalonde$educ), mean(lalonde$re74), mean(lalonde$re75))
colnames(prediction_intervals_b)[4:6] <- c("mean.educ", "mean.re74", "mean.re75")
prediction_intervals_b

# One vertical segment per age spanning the 95% interval
plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17, 55), ylim = c(-2000, 11000),
     main = "re78 for Control units", xlab = "Age",
     ylab = "re78")
for (age in 17:55) {
  segments(x0 = age,
           y0 = prediction_intervals_b[age - 16, 2],
           x1 = age,
           y1 = prediction_intervals_b[age - 16, 3],
           lwd = 2)
}
##############
# QUESTION 2c #
##############
# 95% intervals for the treatment effect (treated minus control
# expected re78) across ages 17-55, using the same simulated
# coefficients. Depends on sim.lalonde and intervals() above.
#
# values: length-7 predictor vector as in prediction_int(); the
#         treat slot (4) is kept as passed for the treated profile
#         and forced to 0 for the control profile.
#
# Returns a 39x4 matrix: age, 2.5%, 97.5%, mean effect.
prediction_int_ATE <- function(values) {
  # Creates control and treated unit: profiles differ only in treat
  treatment <- values
  control <- values
  control[4] <- 0
  ages <- 17:55
  vars <- matrix(NA, nrow = length(ages), ncol = 4)
  n.sims <- nrow(sim.lalonde@coef)
  for (i in ages) {
    treatment[1] <- i
    control[1] <- i
    treatment[2] <- (i^2) / 100
    control[2] <- (i^2) / 100
    treatment[5] <- i * treatment[4]
    control[5] <- i * control[4]  # zero for the control profile
    val_vec <- numeric(n.sims)
    for (j in seq_len(n.sims)) {
      val_vec[j] <- intervals(sim.lalonde@coef[j, ], treatment) -
        intervals(sim.lalonde@coef[j, ], control)
    }
    # Fix: single quantile() call instead of two; `probs` spelled out
    bounds <- quantile(val_vec, probs = c(0.025, 0.975))
    vars[i - 16, ] <- c(i, bounds[1], bounds[2], mean(val_vec))
  }
  vars
}
# Treatment-effect intervals using the treated profile from 2a
prediction_int_c <- prediction_int_ATE(mean_vals)
colnames(prediction_int_c) <- c('Age', '2.5%', '97.5%', 'Mean')
prediction_int_c <- subset(prediction_int_c, select = -c(Mean))
prediction_int_c <- cbind(prediction_int_c,
                          mean(lalonde$educ), mean(lalonde$re74), mean(lalonde$re75))
colnames(prediction_int_c)[4:6] <- c("mean.educ", "mean.re74", "mean.re75")
prediction_int_c

# One vertical segment per age spanning the 95% interval
plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17, 55), ylim = c(-1200, 10000),
     main = "Prediction Intervals of Treatment effect on re78", xlab = "Age",
     ylab = "Prediction Intervals of Treatment effect on re78")
for (age in 17:55) {
  segments(x0 = age,
           y0 = prediction_int_c[age - 16, 2],
           x1 = age,
           y1 = prediction_int_c[age - 16, 3],
           lwd = 2)
}
##############
# QUESTION 2d #
##############
# Simulated re78 draw for one coefficient vector: the linear
# predictor plus a single normal error draw with the simulated
# residual SD (sigma). With sigma = 0 this reduces to intervals().
#
# coefs:  length-8 coefficient vector (intercept first).
# sigma:  residual standard deviation for the error draw.
# values: length-7 predictor vector matching coefs[2:8].
intervals_d <- function(coefs, sigma, values) {
  # Vectorized dot product replaces the hand-written term-by-term sum
  coefs[1] + sum(values * coefs[-1]) + rnorm(1, 0, sigma)
}
# 95% PREDICTION intervals of the treatment effect: like
# prediction_int_ATE() but each simulated difference adds
# independent residual error draws (via intervals_d), so the
# intervals reflect predictive uncertainty, not just estimation
# uncertainty. Depends on sim.lalonde and intervals_d() above.
#
# values: length-7 predictor vector; treat slot kept as passed for
#         the treated profile, forced to 0 for the control profile.
#
# Returns a 39x4 matrix: age, 2.5%, 97.5%, median effect.
prediction_int_sigma <- function(values) {
  treatment <- values
  control <- values
  control[4] <- 0
  ages <- 17:55
  vals <- matrix(NA, nrow = length(ages), ncol = 4)
  n.sims <- nrow(sim.lalonde@coef)
  for (i in ages) {
    treatment[1] <- i
    control[1] <- i
    treatment[2] <- (i^2) / 100
    control[2] <- (i^2) / 100
    treatment[5] <- i * treatment[4]
    control[5] <- i * control[4]
    vals_vec <- numeric(n.sims)  # preallocate
    for (j in seq_len(n.sims)) {
      # Separate error draws for the treated and control predictions
      vals_vec[j] <- intervals_d(sim.lalonde@coef[j, ], sim.lalonde@sigma[j], treatment) -
        intervals_d(sim.lalonde@coef[j, ], sim.lalonde@sigma[j], control)
    }
    # Fix: single quantile() call instead of two; `probs` spelled out
    bounds <- quantile(vals_vec, probs = c(0.025, 0.975))
    vals[i - 16, ] <- c(i, bounds[1], bounds[2], median(vals_vec))
  }
  vals
}
# Median-based predictor profile with treat = 1
mean_vals_d <- c(NA, NA, median(lalonde$educ), 1, NA, median(lalonde$re74), median(lalonde$re75))
prediction_int_d <- prediction_int_sigma(mean_vals_d)
colnames(prediction_int_d) <- c('Age', '2.5%', '97.5%', 'Median')
prediction_int_d <- subset(prediction_int_d, select = -c(Median))
prediction_int_d <- cbind(prediction_int_d,
                          median(lalonde$educ), median(lalonde$re74), median(lalonde$re75))
colnames(prediction_int_d)[4:6] <- c("median.educ", "median.re74", "median.re75")
prediction_int_d

# One vertical segment per age spanning the 95% prediction interval
plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17, 55), ylim = c(-20000, 25000),
     main = "Prediction Interval Treatment Effect re78", xlab = "Age",
     ylab = "Prediction Intervals of Treatment effect on re78")
for (age in 17:55) {
  segments(x0 = age,
           y0 = prediction_int_d[age - 16, 2],
           x1 = age,
           y1 = prediction_int_d[age - 16, 3],
           lwd = 2)
}
#-------------------------------------------------------------
##############
# QUESTION 3a #
##############
library(boot)
foo <- read.csv("y2prc9xq.csv")

# Bootstrap statistic: treatment coefficient on the resampled rows.
# Fix: use the `data`/`index` arguments boot() passes in; the
# original regressed on the global `foo` (equivalent only by
# accident, and broken if the statistic is reused on other data).
boot.fn <- function(data, index) {
  coef(lm(MATH_SCORE ~ as.numeric(I(TREATMENT)), data = data[index, ]))
}

# 100,000 bootstrap replicates of the treatment coefficient
coefs <- boot(foo, boot.fn, 100000)$t[, 2]
bootstrap <- quantile(coefs, probs = c(0.025, 0.975))
# Analytical 95% CI for the same coefficient, for comparison
analytical <- confint(lm(MATH_SCORE ~ as.numeric(I(TREATMENT)), data = foo))[2, ]
rbind(bootstrap, analytical)
summary(coefs)

##############
# QUESTION 3b #
##############
# Fix: title typo "Boostrap" -> "Bootstrap"
hist(coefs, breaks = 30,
     main = "100000 Bootstrap Estimates of Treatment Coefficient",
     xlab = "Estimates of Coefficient")
#-------------------------------------------------------------
##############
# QUESTION 4 #
##############
# Bootstrapped R^2: resample (prediction, truth) pairs 10,000 times,
# compute R^2 on each resample, and return the mean.
#
# y_pred:  vector of fitted values.
# ry_true: vector of observed outcomes (same length as y_pred).
#
# Fix: the original body referenced `y_true`, which silently resolved
# to a GLOBAL of that name (defined later in the script) instead of
# the ry_true argument. Also preallocates r2 instead of growing it.
r_squared <- function(y_pred, ry_true) {
  y_true <- ry_true  # bind the argument locally; never touch globals
  stopifnot(length(y_pred) == length(y_true))
  n <- length(y_true)
  r2 <- numeric(10000)
  for (i in seq_len(10000)) {
    pos <- sample(n, n, replace = TRUE)
    ss_res <- sum((y_true[pos] - y_pred[pos])^2)
    ss_tot <- sum((y_true[pos] - mean(y_true[pos]))^2)
    r2[i] <- 1 - ss_res / ss_tot
  }
  mean(r2)
}
lm.boot <- lm(MATH_SCORE ~ TREATMENT, data = foo)
# Keep only rows with an observed outcome so lengths match the fits.
# Fix: `x[-which(is.na(x))]` returns an EMPTY vector when there are
# no NAs (negative-empty index); logical indexing is always safe.
y_true <- foo$MATH_SCORE[!is.na(foo$MATH_SCORE)]
r_squared(predict(lm.boot), y_true)
# Analytical R^2 for comparison with the bootstrap mean
summary(lm(MATH_SCORE ~ TREATMENT, data = foo))$r.squared
#---------------------------------------------------------------
##############
# QUESTION 5 #
##############
set.seed(42)
nsw.dta <- read.csv("nsw.csv")

# Random 2000-row test set; the remainder is the training set
rows <- sample(seq_len(nrow(nsw.dta)), 2000, replace = FALSE)
test <- nsw.dta[rows, ]
train <- nsw.dta[-rows, ]

# Linear probability models for treatment assignment (gaussian glm).
# Fix: inside a formula, `age^2` is an interaction and collapses to
# `age`; the quadratic term must be written I(age^2).
model_1 <- glm(treat ~ . - re78, data = train)
model_2 <- glm(treat ~ age + I(age^2) + education, data = train)
model_3 <- glm(treat ~ age + I(age^2) + education + nodegree + re74 + re75,
               data = train)
model_4 <- glm(treat ~ age + I(age^2) + education + nodegree +
                 re74 + re75 + nodegree * re74 + nodegree * re75, data = train)
model_5 <- glm(treat ~ age + I(age^2) + education + nodegree +
                 re74 + re75 + nodegree * re74 + nodegree * re75 + u74 + u75,
               data = train)

library(boot)
# Leave-one-out cross-validation error for each candidate model
cv_error1 <- cv.glm(train, model_1)
cv_error2 <- cv.glm(train, model_2)
cv_error3 <- cv.glm(train, model_3)
cv_error4 <- cv.glm(train, model_4)
cv_error5 <- cv.glm(train, model_5)
cv_error1$delta[1]
cv_error2$delta[1]
cv_error3$delta[1]
cv_error4$delta[1]
cv_error5$delta[1]

# Held-out test-set MSE.
# Fix: predict() takes `newdata =`; the original `data = test` was
# silently swallowed by `...`, so it returned TRAINING-set fits, and
# the squared error was then (wrongly) taken against train$treat.
y_1 <- predict(model_1, newdata = test)
mse_1 <- mean((y_1 - test$treat)^2)
y_2 <- predict(model_2, newdata = test)
mse_2 <- mean((y_2 - test$treat)^2)
y_3 <- predict(model_3, newdata = test)
mse_3 <- mean((y_3 - test$treat)^2)
y_4 <- predict(model_4, newdata = test)
mse_4 <- mean((y_4 - test$treat)^2)
y_5 <- predict(model_5, newdata = test)
mse_5 <- mean((y_5 - test$treat)^2)

# Saving results in a data frame
LOOCV <- data.frame("LOOCV MSE" = c(cv_error1$delta[1],
                                    cv_error2$delta[1],
                                    cv_error3$delta[1],
                                    cv_error4$delta[1],
                                    cv_error5$delta[1]),
                    "Test MSE" = c(mse_1,
                                   mse_2,
                                   mse_3,
                                   mse_4,
                                   mse_5))
LOOCV
#----------------------------------------------------------------
| #---------------------------------------------------------------- | |
| ############## | |
| # QUESTION 6 # | |
| ############## | |
| library("rgenoud") | |
| library("DAAG") | |
| df <- lalonde[, -which(names(lalonde) == 're78')] | |
| df <- cbind(df, | |
| I(df$age)^2, | |
| I(df$educ)^2, | |
| I(df$nodegr)^2, | |
| I(df$re74)^2, | |
| I(df$re75)^2 | |
| ) | |
| final_df <- cbind(df, lalonde$re78) | |
| names(final_df)[17] <- "re78" | |
# Objective function for genoud: cross-validated RSS of a linear
# model built from the final_df columns indexed by `features`
# (duplicates dropped; column 17 is the outcome re78).
model_selection <- function(features) {
  selected <- cbind(final_df[, c(17, unique(features))])
  cv_fit <- CVlm(re78 ~ ., data = selected, plotit = FALSE, m = 10)  # 10-fold CV
  # attributes(CVlm) returns cross-validated sum of MSE over all the
  # folds, and also returns degrees of freedom.
  # RSS = DF * MSE. Also normalize over the number of folds.
  cv_attrs <- attributes(cv_fit)
  # Return cross-validated RSS
  (cv_attrs$ms / 10) * cv_attrs$df
}
# Integer genetic search: 16 feature slots, each taking a column
# index in 1..16, minimizing the cross-validated RSS objective
GenOpt <- genoud(fn = model_selection, nvars = 16, data.type.int = TRUE,
                 Domains = matrix(c(rep(1, 16), rep(16, 16)), nrow = 16, ncol = 2),
                 max = FALSE, pop.size = 100, max.generations = 10,
                 wait.generations = 6, boundary.enforcement = 2)
sort(unique(GenOpt$par))

# Refit on the selected columns (plus outcome, column 17) and report
# the residual sum of squares of the final model
gen_data <- final_df[c(1, 3, 6, 7, 11, 16, 17)]
final.lm <- lm(re78 ~ ., data = gen_data)
sum((summary(final.lm)$residuals)^2)
#------------------------------------------------------
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment