Skip to content

Instantly share code, notes, and snippets.

@snehankekre
Last active October 19, 2019 13:04
Show Gist options
  • Select an option

  • Save snehankekre/615fa35389fe12037a276550b8a56c7e to your computer and use it in GitHub Desktop.

Select an option

Save snehankekre/615fa35389fe12037a276550b8a56c7e to your computer and use it in GitHub Desktop.
CS112 Assignment 2
set.seed(42)
##############
# QUESTION 1 #
##############
library(tidyverse)

# Simulate 998 observations from a known linear model:
#   Y = 1 - 0.1 * X + noise
X <- rnorm(998)
Y <- -0.1 * X + 1 + rnorm(998, mean = 0.05, sd = 0.03)
data <- data.frame(Y, X)

# Baseline regression on the clean data
lm.data <- lm(Y ~ X, data = data)
summary(lm.data)

# Append two high-leverage points and refit
outliers <- data.frame(Y = c(7, 7), X = c(15, 12))
outliers_data <- rbind(data, outliers)
lm.outliers <- lm(Y ~ X, data = outliers_data)
summary(lm.outliers)

# Overlay the two fitted lines to show how far two outliers move the fit
ggplot(data, aes(x = X, y = Y)) +
  geom_smooth(aes(color = 'Initial Regression'), method = lm, se = FALSE) +
  geom_smooth(aes(color = 'With Two Outliers'), data = outliers_data,
              method = lm, se = FALSE) +
  labs(colour = 'Dataset', title = "Regression Models' Sensitivity to Outliers")
#-------------------------------------------------------------
###############
# QUESTION 2a #
###############
set.seed(42)
library(Matching)
data(lalonde)
library(arm)

# Derived regressors: rescaled quadratic age term and a
# treatment-by-age interaction
lalonde$age.2 <- (lalonde$age ^ 2) / 100
lalonde$treat.age <- lalonde$treat * lalonde$age

# Linear model of 1978 earnings; sim() then draws 10,000 coefficient
# vectors from the fitted model's approximate sampling distribution
lm.lalonde <- lm(re78 ~ age + age.2 + educ + treat + treat.age + re74 + re75,
                 data = lalonde)
sim.lalonde <- sim(lm.lalonde, n = 10000)
intervals <- function(coefs, values) {
  # Linear predictor for one simulated coefficient draw.
  #
  # coefs:  numeric vector c(intercept, slope_1, ..., slope_k)
  # values: the k covariate values, in the same order as the slopes
  #
  # Returns intercept + sum(values * slopes). Replaces the original
  # hand-unrolled 7-term sum, which was fragile: it silently broke if
  # the model gained or lost a regressor. The guard makes a length
  # mismatch an explicit error instead of silent recycling.
  stopifnot(length(values) == length(coefs) - 1)
  coefs[1] + sum(values * coefs[-1])
}
prediction_int <- function(values, ages = 17:55) {
  # For each age, simulate the distribution of expected re78 across the
  # coefficient draws in the global `sim.lalonde` (arm::sim output).
  #
  # values: covariate profile c(age, age^2/100, educ, treat, treat*age,
  #         re74, re75); slots 1, 2 and 5 are overwritten for each age.
  # ages:   ages to sweep over (new parameter; default keeps the
  #         original 17:55 behavior).
  #
  # Returns a matrix with columns: age, 2.5%, 97.5%, mean.
  # Fixes vs. original: sim count read from sim.lalonde instead of a
  # hard-coded 10000; quantile() computed once per age instead of
  # twice; `prob =` partial match spelled out as `probs =`.
  n.sims <- nrow(sim.lalonde@coef)
  vals <- matrix(NA, nrow = length(ages), ncol = 4)
  for (k in seq_along(ages)) {
    age <- ages[k]
    values[1] <- age
    values[2] <- age^2 / 100
    values[5] <- age * values[4]  # treat*age interaction
    val_vec <- vapply(seq_len(n.sims),
                      function(j) intervals(sim.lalonde@coef[j, ], values),
                      numeric(1))
    bounds <- quantile(val_vec, probs = c(0.025, 0.975))
    vals[k, ] <- c(age, bounds[1], bounds[2], mean(val_vec))
  }
  vals
}
# Covariate profile held at sample means with treat = 1 (treated units).
# Slots 1 (age), 2 (age^2/100) and 5 (treat*age) are filled in by
# prediction_int() for each age.
mean_vals <- c(NA, NA, mean(lalonde$educ), 1, NA, mean(lalonde$re74), mean(lalonde$re75))
prediction_intervals <- prediction_int(mean_vals)
colnames(prediction_intervals) <- c('Age', '2.5%', '97.5%', 'Mean')
# Report only the 95% interval bounds; drop the simulated mean
prediction_intervals <- subset(prediction_intervals, select = -c(Mean))
# Append the constant covariate values used, for reference
prediction_intervals <- cbind(prediction_intervals, mean(lalonde$educ), mean(lalonde$re74), mean(lalonde$re75))
colnames(prediction_intervals)[4] <- "mean.educ"
colnames(prediction_intervals)[5] <- "mean.re74"
colnames(prediction_intervals)[6] <- "mean.re75"
prediction_intervals
# One vertical segment per age spanning the 95% interval
# (title typo fixed: "Treatement" -> "Treatment")
plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17, 55), ylim = c(2000, 15000),
     main = "re78 for Treatment units", xlab = "Age",
     ylab = "re78 (Prediction Interval)")
for (age in 17:55) {
  segments(
    x0 = age,
    y0 = prediction_intervals[age - 16, 2],
    x1 = age,
    y1 = prediction_intervals[age - 16, 3],
    lwd = 2)
}
# Same process as above for CONTROL units
###############
# QUESTION 2b #
###############
# Repeat 2a with treat = 0: prediction intervals for CONTROL units
mean_vals_b <- c(NA, NA, mean(lalonde$educ), 0, NA, mean(lalonde$re74), mean(lalonde$re75))
prediction_intervals_b <- prediction_int(mean_vals_b)
colnames(prediction_intervals_b) <- c('Age', '2.5%', '97.5%', 'Mean')
prediction_intervals_b <- subset(prediction_intervals_b, select = -c(Mean))
prediction_intervals_b <- cbind(
  prediction_intervals_b,
  mean(lalonde$educ),
  mean(lalonde$re74),
  mean(lalonde$re75)
)
colnames(prediction_intervals_b)[4] <- "mean.educ"
colnames(prediction_intervals_b)[5] <- "mean.re74"
colnames(prediction_intervals_b)[6] <- "mean.re75"
prediction_intervals_b
# Empty canvas, then one interval segment per age
plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17, 55), ylim = c(-2000, 11000),
     main = "re78 for Control units", xlab = "Age",
     ylab = "re78")
for (age in 17:55) {
  segments(x0 = age, x1 = age,
           y0 = prediction_intervals_b[age - 16, 2],
           y1 = prediction_intervals_b[age - 16, 3],
           lwd = 2)
}
###############
# QUESTION 2c #
###############
prediction_int_ATE <- function(values, ages = 17:55) {
  # Simulated treatment effect (treated minus control expected re78)
  # at each age, using the coefficient draws in the global `sim.lalonde`.
  #
  # values: covariate profile as in prediction_int(); slot 4 is the
  #         treat indicator. A control copy is made with treat = 0.
  # ages:   ages to sweep over (new parameter; default keeps the
  #         original 17:55 behavior).
  #
  # Returns a matrix with columns: age, 2.5%, 97.5%, mean.
  # Fixes vs. original: sim count read from sim.lalonde instead of a
  # hard-coded 10000; quantile() computed once per age; `probs =`
  # spelled out instead of the partial match `prob =`.
  treatment <- values
  control <- values
  control[4] <- 0  # switch the treat indicator off for the control profile
  n.sims <- nrow(sim.lalonde@coef)
  vars <- matrix(NA, nrow = length(ages), ncol = 4)
  for (k in seq_along(ages)) {
    age <- ages[k]
    treatment[1] <- control[1] <- age
    treatment[2] <- control[2] <- age^2 / 100
    treatment[5] <- age * treatment[4]
    control[5] <- age * control[4]  # always 0; kept for symmetry
    val_vec <- vapply(seq_len(n.sims), function(j) {
      intervals(sim.lalonde@coef[j, ], treatment) -
        intervals(sim.lalonde@coef[j, ], control)
    }, numeric(1))
    bounds <- quantile(val_vec, probs = c(0.025, 0.975))
    vars[k, ] <- c(age, bounds[1], bounds[2], mean(val_vec))
  }
  vars
}
# Treatment-effect intervals using the treated covariate profile from 2a
prediction_int_c <- prediction_int_ATE(mean_vals)
colnames(prediction_int_c) <- c('Age', '2.5%', '97.5%', 'Mean')
prediction_int_c <- subset(prediction_int_c, select = -c(Mean))
prediction_int_c <- cbind(
  prediction_int_c,
  mean(lalonde$educ),
  mean(lalonde$re74),
  mean(lalonde$re75)
)
colnames(prediction_int_c)[4] <- "mean.educ"
colnames(prediction_int_c)[5] <- "mean.re74"
colnames(prediction_int_c)[6] <- "mean.re75"
prediction_int_c
# Empty canvas, then one interval segment per age
plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17, 55), ylim = c(-1200, 10000),
     main = "Prediction Intervals of Treatment effect on re78", xlab = "Age",
     ylab = "Prediction Intervals of Treatment effect on re78")
for (age in 17:55) {
  segments(x0 = age, x1 = age,
           y0 = prediction_int_c[age - 16, 2],
           y1 = prediction_int_c[age - 16, 3],
           lwd = 2)
}
###############
# QUESTION 2d #
###############
intervals_d <- function(coefs, sigma, values) {
  # Linear predictor plus one draw of residual noise, for PREDICTION
  # (rather than expected-value) intervals.
  #
  # coefs:  numeric vector c(intercept, slope_1, ..., slope_k)
  # sigma:  residual standard deviation for this simulation draw;
  #         sigma = 0 makes the result deterministic
  # values: the k covariate values, in slope order
  #
  # Vectorized replacement for the original hand-unrolled 7-term sum,
  # which silently broke if the model gained or lost a regressor.
  stopifnot(length(values) == length(coefs) - 1)
  coefs[1] + sum(values * coefs[-1]) + rnorm(1, mean = 0, sd = sigma)
}
prediction_int_sigma <- function(values, ages = 17:55) {
  # Like prediction_int_ATE(), but each simulated unit also gets a draw
  # of residual noise (via intervals_d and sim.lalonde@sigma), yielding
  # PREDICTION intervals for the treatment effect. Summary column is
  # the median, matching the original.
  #
  # values: covariate profile; slot 4 is the treat indicator.
  # ages:   ages to sweep over (new parameter; default keeps the
  #         original 17:55 behavior).
  #
  # Returns a matrix with columns: age, 2.5%, 97.5%, median.
  # Fixes vs. original: sim count read from sim.lalonde; quantile()
  # computed once per age; `probs =` spelled out.
  treatment <- values
  control <- values
  control[4] <- 0
  n.sims <- nrow(sim.lalonde@coef)
  vals <- matrix(NA, nrow = length(ages), ncol = 4)
  for (k in seq_along(ages)) {
    age <- ages[k]
    treatment[1] <- control[1] <- age
    treatment[2] <- control[2] <- age^2 / 100
    treatment[5] <- age * treatment[4]
    control[5] <- age * control[4]
    vals_vec <- vapply(seq_len(n.sims), function(j) {
      intervals_d(sim.lalonde@coef[j, ], sim.lalonde@sigma[j], treatment) -
        intervals_d(sim.lalonde@coef[j, ], sim.lalonde@sigma[j], control)
    }, numeric(1))
    bounds <- quantile(vals_vec, probs = c(0.025, 0.975))
    vals[k, ] <- c(age, bounds[1], bounds[2], median(vals_vec))
  }
  vals
}
# Median covariate profile, treated; slots 1, 2 and 5 are filled per age
mean_vals_d <- c(NA, NA, median(lalonde$educ), 1, NA, median(lalonde$re74), median(lalonde$re75))
prediction_int_d <- prediction_int_sigma(mean_vals_d)
colnames(prediction_int_d) <- c('Age', '2.5%', '97.5%', 'Median')
prediction_int_d <- subset(prediction_int_d, select = -c(Median))
prediction_int_d <- cbind(
  prediction_int_d,
  median(lalonde$educ),
  median(lalonde$re74),
  median(lalonde$re75)
)
colnames(prediction_int_d)[4] <- "median.educ"
colnames(prediction_int_d)[5] <- "median.re74"
colnames(prediction_int_d)[6] <- "median.re75"
prediction_int_d
# Empty canvas, then one interval segment per age
plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17, 55), ylim = c(-20000, 25000),
     main = "Prediction Interval Treatment Effect re78", xlab = "Age",
     ylab = "Prediction Intervals of Treatment effect on re78")
for (age in 17:55) {
  segments(x0 = age, x1 = age,
           y0 = prediction_int_d[age - 16, 2],
           y1 = prediction_int_d[age - 16, 3],
           lwd = 2)
}
#-------------------------------------------------------------
###############
# QUESTION 3a #
###############
library(boot)
foo <- read.csv("y2prc9xq.csv")
# Statistic for boot(): refit the regression on the resampled rows and
# return its coefficients.
# BUG FIX: the original fit on the global `foo` instead of the `data`
# argument that boot() passes to the statistic — it only worked by
# accident because boot() happens to pass `foo` itself. Also `<-`
# instead of `=` for assignment.
boot.fn <- function(data, index) {
  coef(lm(MATH_SCORE ~ as.numeric(I(TREATMENT)), data = data, subset = index))
}
# 100,000 bootstrap replicates of the treatment coefficient (column 2)
coefs <- boot(foo, boot.fn, 100000)$t[, 2]
# Percentile bootstrap CI vs. the analytical (t-based) CI
bootstrap <- quantile(coefs, probs = c(0.025, 0.975))
analytical <- confint(lm(MATH_SCORE ~ as.numeric(I(TREATMENT)), data = foo))[2, ]
rbind(bootstrap, analytical)
summary(coefs)
###############
# QUESTION 3b #
###############
# Sampling distribution of the bootstrapped treatment coefficient
# (title typo fixed: "Boostrap" -> "Bootstrap")
hist(coefs, breaks = 30,
     main = "100000 Bootstrap Estimates of Treatment Coefficient",
     xlab = "Estimates of Coefficient")
#-------------------------------------------------------------
##############
# QUESTION 4 #
##############
r_squared <- function(y_pred, y_true, n_boot = 10000) {
  # Bootstrapped R^2: resample (prediction, truth) index pairs with
  # replacement, compute R^2 on each resample, return the mean.
  #
  # y_pred: fitted values, aligned element-wise with y_true
  # y_true: observed outcomes
  # n_boot: number of bootstrap resamples (new parameter; default
  #         keeps the original 10000 behavior)
  #
  # BUG FIX: the parameter was named `ry_true` while the body used
  # `y_true`, so the function silently read the GLOBAL y_true and
  # ignored its second argument. Also preallocates r2 instead of
  # growing it from a single NA.
  r2 <- numeric(n_boot)
  for (i in seq_len(n_boot)) {
    pos <- sample(seq_along(y_true), length(y_true), replace = TRUE)
    ss_res <- sum((y_true[pos] - y_pred[pos])^2)
    ss_tot <- sum((y_true[pos] - mean(y_true[pos]))^2)
    r2[i] <- 1 - ss_res / ss_tot
  }
  mean(r2)
}
lm.boot <- lm(MATH_SCORE ~ TREATMENT, data = foo)
# Observed scores with NAs dropped, to align with predict()'s output
# (lm() drops NA rows before fitting).
# BUG FIX: the original used -which(is.na(...)), which returns an EMPTY
# index vector when there are no NAs and would then drop EVERY element.
# NOTE(review): alignment also assumes TREATMENT itself has no NAs —
# confirm against the data.
y_true <- foo$MATH_SCORE[!is.na(foo$MATH_SCORE)]
r_squared(predict(lm.boot), y_true)
summary(lm(MATH_SCORE ~ TREATMENT, data = foo))$r.squared
#---------------------------------------------------------------
##############
# QUESTION 5 #
##############
set.seed(42)
nsw.dta <- read.csv("nsw.csv")
# Random 2000-row hold-out test set; the remainder is training data
rows <- sample(seq_len(nrow(nsw.dta)), 2000, replace = FALSE)
test <- nsw.dta[rows, ]
train <- nsw.dta[-rows, ]
# Models of treatment assignment.
# BUG FIX: inside a formula, `age + age^2` collapses to just `age`
# because ^ is the formula crossing operator, not arithmetic; the
# quadratic term must be wrapped in I().
# NOTE(review): glm() with the default gaussian family is OLS here —
# presumably a deliberate linear probability model; confirm intent.
model_1 <- glm(treat ~ . - re78, data = train)
model_2 <- glm(treat ~ age + I(age^2) + education, data = train)
model_3 <- glm(treat ~ age + I(age^2) + education + nodegree + re74 + re75, data = train)
model_4 <- glm(treat ~ age + I(age^2) + education + nodegree +
                 re74 + re75 + nodegree * re74 + nodegree * re75, data = train)
model_5 <- glm(treat ~ age + I(age^2) + education + nodegree +
                 re74 + re75 + nodegree * re74 + nodegree * re75 + u74 + u75, data = train)
library(boot)
# Leave-one-out cross-validation for each candidate model;
# delta[1] is the raw LOOCV estimate of prediction error (MSE)
cv_error1 <- cv.glm(train, model_1)
cv_error2 <- cv.glm(train, model_2)
cv_error3 <- cv.glm(train, model_3)
cv_error4 <- cv.glm(train, model_4)
cv_error5 <- cv.glm(train, model_5)
# Report the five LOOCV estimates
for (cv_err in list(cv_error1, cv_error2, cv_error3, cv_error4, cv_error5)) {
  print(cv_err$delta[1])
}
# Hold-out (test-set) MSE for each model.
# BUG FIX: predict() takes `newdata=`, not `data=`. The original's
# `data = test` was silently ignored, so it predicted on the TRAINING
# rows and compared against train$treat — that is training error, not
# test error. Predict on `test` and compare against test$treat.
y_1 <- predict(model_1, newdata = test)
mse_1 <- mean((y_1 - test$treat)^2)
y_2 <- predict(model_2, newdata = test)
mse_2 <- mean((y_2 - test$treat)^2)
y_3 <- predict(model_3, newdata = test)
mse_3 <- mean((y_3 - test$treat)^2)
y_4 <- predict(model_4, newdata = test)
mse_4 <- mean((y_4 - test$treat)^2)
y_5 <- predict(model_5, newdata = test)
mse_5 <- mean((y_5 - test$treat)^2)
# Collect both error estimates side by side
LOOCV <- data.frame(
  "LOOCV MSE" = c(cv_error1$delta[1],
                  cv_error2$delta[1],
                  cv_error3$delta[1],
                  cv_error4$delta[1],
                  cv_error5$delta[1]),
  "Test MSE" = c(mse_1,
                 mse_2,
                 mse_3,
                 mse_4,
                 mse_5))
LOOCV
#----------------------------------------------------------------
##############
# QUESTION 6 #
##############
# Build the candidate-feature frame for the genetic search below:
# the lalonde covariates plus five squared terms, with the outcome
# re78 appended last as column 17.
library("rgenoud")
library("DAAG")
# All covariates except the outcome
df <- lalonde[, -which(names(lalonde) == 're78')]
# Append squared terms.
# NOTE(review): I(x)^2 is just x^2 here — I() returns its argument
# unchanged and the squaring happens outside it. The auto-generated
# column names are never used downstream; columns are always selected
# by position.
df <- cbind(df,
I(df$age)^2,
I(df$educ)^2,
I(df$nodegr)^2,
I(df$re74)^2,
I(df$re75)^2
)
# Outcome goes last. The hard-coded 17 assumes lalonde has 12 columns
# (11 covariates + re78) — TODO confirm against the loaded dataset.
final_df <- cbind(df, lalonde$re78)
names(final_df)[17] <- "re78"
model_selection <- function(features) {
  # Objective function for genoud(): 10-fold cross-validated RSS of a
  # linear model of re78 on the selected columns of the global final_df.
  # `features` is a vector of (possibly repeated) column indices;
  # duplicates are dropped before fitting.
  selected <- cbind(final_df[, c(17, unique(features))])
  cv_fit <- CVlm(re78 ~ ., data = selected, plotit = FALSE, m = 10)  # 10-fold CV
  # Per the original author's note: attributes() of the CVlm result hold
  # the cross-validated MSE over all folds ($ms) and the degrees of
  # freedom ($df); RSS = df * (ms normalized by the number of folds).
  cv_attrs <- attributes(cv_fit)
  (cv_attrs$ms / 10) * cv_attrs$df
}
# Integer-coded search: each of the 16 genes picks a column index in
# 1..16, and genoud() minimizes the cross-validated RSS objective.
GenOpt <- genoud(fn = model_selection,
                 nvars = 16,
                 data.type.int = TRUE,
                 Domains = matrix(c(rep(1, 16), rep(16, 16)), nrow = 16, ncol = 2),
                 max = FALSE,
                 pop.size = 100,
                 max.generations = 10,
                 wait.generations = 6,
                 boundary.enforcement = 2)
sort(unique(GenOpt$par))
# Refit on the selected feature columns (plus re78 in column 17) and
# report the residual sum of squares.
# NOTE(review): these indices were hard-coded from one genoud() run —
# they will not necessarily match a fresh run's GenOpt$par.
gen_data <- final_df[c(1, 3, 6, 7, 11, 16, 17)]
final.lm <- lm(re78 ~ ., data = gen_data)
sum((summary(final.lm)$residuals)^2)
#------------------------------------------------------
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment