Last active
October 19, 2019 13:04
-
-
Save snehankekre/615fa35389fe12037a276550b8a56c7e to your computer and use it in GitHub Desktop.
CS112 Assignment 2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
set.seed(42)

##############
# QUESTION 1 #
##############
library(tidyverse)

# Simulate 998 observations with a weak negative slope and a small
# positive noise term
X <- rnorm(998)
Y <- -0.1 * X + 1 + rnorm(998, 0.05, 0.03)
data <- data.frame(Y, X)

# Baseline regression on the clean simulated data
lm.data <- lm(Y ~ X, data = data)
summary(lm.data)

# Append two high-leverage points far from the cloud and refit
outliers <- data.frame('Y' = c(7, 7), 'X' = c(15, 12))
outliers_data <- rbind(data, outliers)
lm.outliers <- lm(Y ~ X, data = outliers_data)
summary(lm.outliers)

# Overlay both fitted lines to show how two points move the fit
ggplot(data, aes(x = X, y = Y)) +
  geom_smooth(aes(color = 'Initial Regression'), method = lm, se = FALSE) +
  geom_smooth(aes(color = 'With Two Outliers'), data = outliers_data,
              method = lm, se = FALSE) +
  labs(colour = 'Dataset', title = "Regression Models' Sensitivity to Outliers")
#-------------------------------------------------------------
##############
# QUESTION 2a #
##############
set.seed(42)
library(Matching)
data(lalonde)
library(arm)

# Quadratic age term (scaled by 100 to keep coefficients readable)
# and a treatment-by-age interaction
lalonde$age.2 <- lalonde$age^2 / 100
lalonde$treat.age <- lalonde$treat * lalonde$age

# Fit model to training data, then draw 10,000 simulated
# coefficient vectors from its sampling distribution
lm.lalonde <- lm(re78 ~ age + age.2 + educ + treat + treat.age + re74 + re75,
                 data = lalonde)
sim.lalonde <- sim(lm.lalonde, n = 10000)
# Linear predictor for one simulated coefficient draw.
#
# coefs:  length-8 vector — intercept followed by the 7 slope
#         coefficients, in the order of the model formula above.
# values: length-7 vector of predictor values matching coefs[2:8].
#
# Returns coefs[1] + sum(values * slopes). The hand-written
# term-by-term sum is replaced by a vectorized dot product.
intervals <- function(coefs, values) {
  coefs[1] + sum(values * coefs[-1])
}
# 95% expected-value intervals of re78 across ages 17-55.
#
# values: length-7 predictor vector (age, age^2/100, educ, treat,
#         treat*age, re74, re75); the age-dependent slots (1, 2, 5)
#         are overwritten for each age inside the loop.
#
# Relies on sim.lalonde (simulated coefficients from sim()) and
# intervals() defined earlier in this script.
#
# Returns a 39x4 matrix: age, 2.5% bound, 97.5% bound, mean.
prediction_int <- function(values) {
  ages <- 17:55
  vals <- matrix(NA, nrow = length(ages), ncol = 4)
  n.sims <- nrow(sim.lalonde@coef)  # derive from sim object, not hard-coded
  for (i in ages) {
    values[1] <- i
    values[2] <- (i^2) / 100
    values[5] <- i * values[4]  # treat * age interaction
    val_vec <- numeric(n.sims)  # preallocate
    for (j in seq_len(n.sims)) {
      val_vec[j] <- intervals(sim.lalonde@coef[j, ], values)
    }
    # Fix: compute both quantiles in ONE call (the original called
    # quantile() twice); also spell out `probs` (no partial matching)
    bounds <- quantile(val_vec, probs = c(0.025, 0.975))
    vals[i - 16, ] <- c(i, bounds[1], bounds[2], mean(val_vec))
  }
  vals
}
# Treated-unit profile (treat = 1): educ/re74/re75 at sample means;
# NA slots (age terms) are filled in by prediction_int()
mean_vals <- c(NA, NA, mean(lalonde$educ), 1, NA, mean(lalonde$re74), mean(lalonde$re75))
prediction_intervals <- prediction_int(mean_vals)
colnames(prediction_intervals) <- c('Age', '2.5%', '97.5%', 'Mean')
prediction_intervals <- subset(prediction_intervals, select = -c(Mean))
prediction_intervals <- cbind(prediction_intervals,
                              mean(lalonde$educ), mean(lalonde$re74), mean(lalonde$re75))
colnames(prediction_intervals)[4:6] <- c("mean.educ", "mean.re74", "mean.re75")
prediction_intervals

# One vertical segment per age spanning the 95% interval
# Fix: title typo "Treatement" -> "Treatment"
plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17, 55), ylim = c(2000, 15000),
     main = "re78 for Treatment units", xlab = "Age",
     ylab = "re78 (Prediction Interval)")
for (age in 17:55) {
  segments(
    x0 = age,
    y0 = prediction_intervals[age - 16, 2],
    x1 = age,
    y1 = prediction_intervals[age - 16, 3],
    lwd = 2)
}
# Same process as above for CONTROL units
##############
# QUESTION 2b #
##############
# Include control units
# Identical predictor profile to 2a but with treat = 0
mean_vals_b <- c(NA, NA, mean(lalonde$educ), 0, NA, mean(lalonde$re74), mean(lalonde$re75))
prediction_intervals_b <- prediction_int(mean_vals_b)
colnames(prediction_intervals_b) <- c('Age', '2.5%', '97.5%', 'Mean')
prediction_intervals_b <- subset(prediction_intervals_b, select = -c(Mean))
prediction_intervals_b <- cbind(prediction_intervals_b,
                                mean(lalonde$educ), mean(lalonde$re74), mean(lalonde$re75))
colnames(prediction_intervals_b)[4:6] <- c("mean.educ", "mean.re74", "mean.re75")
prediction_intervals_b

# One vertical segment per age spanning the 95% interval
plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17, 55), ylim = c(-2000, 11000),
     main = "re78 for Control units", xlab = "Age",
     ylab = "re78")
for (age in 17:55) {
  segments(x0 = age,
           y0 = prediction_intervals_b[age - 16, 2],
           x1 = age,
           y1 = prediction_intervals_b[age - 16, 3],
           lwd = 2)
}
##############
# QUESTION 2c #
##############
# 95% intervals for the treatment effect (treated minus control
# expected re78) across ages 17-55, using the same simulated
# coefficients. Depends on sim.lalonde and intervals() above.
#
# values: length-7 predictor vector as in prediction_int(); the
#         treat slot (4) is kept as passed for the treated profile
#         and forced to 0 for the control profile.
#
# Returns a 39x4 matrix: age, 2.5%, 97.5%, mean effect.
prediction_int_ATE <- function(values) {
  # Creates control and treated unit: profiles differ only in treat
  treatment <- values
  control <- values
  control[4] <- 0
  ages <- 17:55
  vars <- matrix(NA, nrow = length(ages), ncol = 4)
  n.sims <- nrow(sim.lalonde@coef)
  for (i in ages) {
    treatment[1] <- i
    control[1] <- i
    treatment[2] <- (i^2) / 100
    control[2] <- (i^2) / 100
    treatment[5] <- i * treatment[4]
    control[5] <- i * control[4]  # zero for the control profile
    val_vec <- numeric(n.sims)
    for (j in seq_len(n.sims)) {
      val_vec[j] <- intervals(sim.lalonde@coef[j, ], treatment) -
        intervals(sim.lalonde@coef[j, ], control)
    }
    # Fix: single quantile() call instead of two; `probs` spelled out
    bounds <- quantile(val_vec, probs = c(0.025, 0.975))
    vars[i - 16, ] <- c(i, bounds[1], bounds[2], mean(val_vec))
  }
  vars
}
# Treatment-effect intervals using the treated profile from 2a
prediction_int_c <- prediction_int_ATE(mean_vals)
colnames(prediction_int_c) <- c('Age', '2.5%', '97.5%', 'Mean')
prediction_int_c <- subset(prediction_int_c, select = -c(Mean))
prediction_int_c <- cbind(prediction_int_c,
                          mean(lalonde$educ), mean(lalonde$re74), mean(lalonde$re75))
colnames(prediction_int_c)[4:6] <- c("mean.educ", "mean.re74", "mean.re75")
prediction_int_c

# One vertical segment per age spanning the 95% interval
plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17, 55), ylim = c(-1200, 10000),
     main = "Prediction Intervals of Treatment effect on re78", xlab = "Age",
     ylab = "Prediction Intervals of Treatment effect on re78")
for (age in 17:55) {
  segments(x0 = age,
           y0 = prediction_int_c[age - 16, 2],
           x1 = age,
           y1 = prediction_int_c[age - 16, 3],
           lwd = 2)
}
##############
# QUESTION 2d #
##############
# Simulated re78 draw for one coefficient vector: the linear
# predictor plus a single normal error draw with the simulated
# residual SD (sigma). With sigma = 0 this reduces to intervals().
#
# coefs:  length-8 coefficient vector (intercept first).
# sigma:  residual standard deviation for the error draw.
# values: length-7 predictor vector matching coefs[2:8].
intervals_d <- function(coefs, sigma, values) {
  # Vectorized dot product replaces the hand-written term-by-term sum
  coefs[1] + sum(values * coefs[-1]) + rnorm(1, 0, sigma)
}
# 95% PREDICTION intervals of the treatment effect: like
# prediction_int_ATE() but each simulated difference adds
# independent residual error draws (via intervals_d), so the
# intervals reflect predictive uncertainty, not just estimation
# uncertainty. Depends on sim.lalonde and intervals_d() above.
#
# values: length-7 predictor vector; treat slot kept as passed for
#         the treated profile, forced to 0 for the control profile.
#
# Returns a 39x4 matrix: age, 2.5%, 97.5%, median effect.
prediction_int_sigma <- function(values) {
  treatment <- values
  control <- values
  control[4] <- 0
  ages <- 17:55
  vals <- matrix(NA, nrow = length(ages), ncol = 4)
  n.sims <- nrow(sim.lalonde@coef)
  for (i in ages) {
    treatment[1] <- i
    control[1] <- i
    treatment[2] <- (i^2) / 100
    control[2] <- (i^2) / 100
    treatment[5] <- i * treatment[4]
    control[5] <- i * control[4]
    vals_vec <- numeric(n.sims)  # preallocate
    for (j in seq_len(n.sims)) {
      # Separate error draws for the treated and control predictions
      vals_vec[j] <- intervals_d(sim.lalonde@coef[j, ], sim.lalonde@sigma[j], treatment) -
        intervals_d(sim.lalonde@coef[j, ], sim.lalonde@sigma[j], control)
    }
    # Fix: single quantile() call instead of two; `probs` spelled out
    bounds <- quantile(vals_vec, probs = c(0.025, 0.975))
    vals[i - 16, ] <- c(i, bounds[1], bounds[2], median(vals_vec))
  }
  vals
}
# Median-based predictor profile with treat = 1
mean_vals_d <- c(NA, NA, median(lalonde$educ), 1, NA, median(lalonde$re74), median(lalonde$re75))
prediction_int_d <- prediction_int_sigma(mean_vals_d)
colnames(prediction_int_d) <- c('Age', '2.5%', '97.5%', 'Median')
prediction_int_d <- subset(prediction_int_d, select = -c(Median))
prediction_int_d <- cbind(prediction_int_d,
                          median(lalonde$educ), median(lalonde$re74), median(lalonde$re75))
colnames(prediction_int_d)[4:6] <- c("median.educ", "median.re74", "median.re75")
prediction_int_d

# One vertical segment per age spanning the 95% prediction interval
plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17, 55), ylim = c(-20000, 25000),
     main = "Prediction Interval Treatment Effect re78", xlab = "Age",
     ylab = "Prediction Intervals of Treatment effect on re78")
for (age in 17:55) {
  segments(x0 = age,
           y0 = prediction_int_d[age - 16, 2],
           x1 = age,
           y1 = prediction_int_d[age - 16, 3],
           lwd = 2)
}
#-------------------------------------------------------------
##############
# QUESTION 3a #
##############
library(boot)
foo <- read.csv("y2prc9xq.csv")

# Bootstrap statistic: treatment coefficient on the resampled rows.
# Fix: use the `data`/`index` arguments boot() passes in; the
# original regressed on the global `foo` (equivalent only by
# accident, and broken if the statistic is reused on other data).
boot.fn <- function(data, index) {
  coef(lm(MATH_SCORE ~ as.numeric(I(TREATMENT)), data = data[index, ]))
}

# 100,000 bootstrap replicates of the treatment coefficient
coefs <- boot(foo, boot.fn, 100000)$t[, 2]
bootstrap <- quantile(coefs, probs = c(0.025, 0.975))
# Analytical 95% CI for the same coefficient, for comparison
analytical <- confint(lm(MATH_SCORE ~ as.numeric(I(TREATMENT)), data = foo))[2, ]
rbind(bootstrap, analytical)
summary(coefs)

##############
# QUESTION 3b #
##############
# Fix: title typo "Boostrap" -> "Bootstrap"
hist(coefs, breaks = 30,
     main = "100000 Bootstrap Estimates of Treatment Coefficient",
     xlab = "Estimates of Coefficient")
#-------------------------------------------------------------
##############
# QUESTION 4 #
##############
# Bootstrapped R^2: resample (prediction, truth) pairs 10,000 times,
# compute R^2 on each resample, and return the mean.
#
# y_pred:  vector of fitted values.
# ry_true: vector of observed outcomes (same length as y_pred).
#
# Fix: the original body referenced `y_true`, which silently resolved
# to a GLOBAL of that name (defined later in the script) instead of
# the ry_true argument. Also preallocates r2 instead of growing it.
r_squared <- function(y_pred, ry_true) {
  y_true <- ry_true  # bind the argument locally; never touch globals
  stopifnot(length(y_pred) == length(y_true))
  n <- length(y_true)
  r2 <- numeric(10000)
  for (i in seq_len(10000)) {
    pos <- sample(n, n, replace = TRUE)
    ss_res <- sum((y_true[pos] - y_pred[pos])^2)
    ss_tot <- sum((y_true[pos] - mean(y_true[pos]))^2)
    r2[i] <- 1 - ss_res / ss_tot
  }
  mean(r2)
}
lm.boot <- lm(MATH_SCORE ~ TREATMENT, data = foo)
# Keep only rows with an observed outcome so lengths match the fits.
# Fix: `x[-which(is.na(x))]` returns an EMPTY vector when there are
# no NAs (negative-empty index); logical indexing is always safe.
y_true <- foo$MATH_SCORE[!is.na(foo$MATH_SCORE)]
r_squared(predict(lm.boot), y_true)
# Analytical R^2 for comparison with the bootstrap mean
summary(lm(MATH_SCORE ~ TREATMENT, data = foo))$r.squared
#---------------------------------------------------------------
##############
# QUESTION 5 #
##############
set.seed(42)
nsw.dta <- read.csv("nsw.csv")

# Random 2000-row test set; the remainder is the training set
rows <- sample(seq_len(nrow(nsw.dta)), 2000, replace = FALSE)
test <- nsw.dta[rows, ]
train <- nsw.dta[-rows, ]

# Linear probability models for treatment assignment (gaussian glm).
# Fix: inside a formula, `age^2` is an interaction and collapses to
# `age`; the quadratic term must be written I(age^2).
model_1 <- glm(treat ~ . - re78, data = train)
model_2 <- glm(treat ~ age + I(age^2) + education, data = train)
model_3 <- glm(treat ~ age + I(age^2) + education + nodegree + re74 + re75,
               data = train)
model_4 <- glm(treat ~ age + I(age^2) + education + nodegree +
                 re74 + re75 + nodegree * re74 + nodegree * re75, data = train)
model_5 <- glm(treat ~ age + I(age^2) + education + nodegree +
                 re74 + re75 + nodegree * re74 + nodegree * re75 + u74 + u75,
               data = train)

library(boot)
# Leave-one-out cross-validation error for each candidate model
cv_error1 <- cv.glm(train, model_1)
cv_error2 <- cv.glm(train, model_2)
cv_error3 <- cv.glm(train, model_3)
cv_error4 <- cv.glm(train, model_4)
cv_error5 <- cv.glm(train, model_5)
cv_error1$delta[1]
cv_error2$delta[1]
cv_error3$delta[1]
cv_error4$delta[1]
cv_error5$delta[1]

# Held-out test-set MSE.
# Fix: predict() takes `newdata =`; the original `data = test` was
# silently swallowed by `...`, so it returned TRAINING-set fits, and
# the squared error was then (wrongly) taken against train$treat.
y_1 <- predict(model_1, newdata = test)
mse_1 <- mean((y_1 - test$treat)^2)
y_2 <- predict(model_2, newdata = test)
mse_2 <- mean((y_2 - test$treat)^2)
y_3 <- predict(model_3, newdata = test)
mse_3 <- mean((y_3 - test$treat)^2)
y_4 <- predict(model_4, newdata = test)
mse_4 <- mean((y_4 - test$treat)^2)
y_5 <- predict(model_5, newdata = test)
mse_5 <- mean((y_5 - test$treat)^2)

# Saving results in a data frame
LOOCV <- data.frame("LOOCV MSE" = c(cv_error1$delta[1],
                                    cv_error2$delta[1],
                                    cv_error3$delta[1],
                                    cv_error4$delta[1],
                                    cv_error5$delta[1]),
                    "Test MSE" = c(mse_1,
                                   mse_2,
                                   mse_3,
                                   mse_4,
                                   mse_5))
LOOCV
#----------------------------------------------------------------
| #---------------------------------------------------------------- | |
| ############## | |
| # QUESTION 6 # | |
| ############## | |
| library("rgenoud") | |
| library("DAAG") | |
| df <- lalonde[, -which(names(lalonde) == 're78')] | |
| df <- cbind(df, | |
| I(df$age)^2, | |
| I(df$educ)^2, | |
| I(df$nodegr)^2, | |
| I(df$re74)^2, | |
| I(df$re75)^2 | |
| ) | |
| final_df <- cbind(df, lalonde$re78) | |
| names(final_df)[17] <- "re78" | |
# Objective function for genoud: cross-validated RSS of a linear
# model built from the final_df columns indexed by `features`
# (duplicates dropped; column 17 is the outcome re78).
model_selection <- function(features) {
  selected <- cbind(final_df[, c(17, unique(features))])
  cv_fit <- CVlm(re78 ~ ., data = selected, plotit = FALSE, m = 10)  # 10-fold CV
  # attributes(CVlm) returns cross-validated sum of MSE over all the
  # folds, and also returns degrees of freedom.
  # RSS = DF * MSE. Also normalize over the number of folds.
  cv_attrs <- attributes(cv_fit)
  # Return cross-validated RSS
  (cv_attrs$ms / 10) * cv_attrs$df
}
# Integer genetic search: 16 feature slots, each taking a column
# index in 1..16, minimizing the cross-validated RSS objective
GenOpt <- genoud(fn = model_selection, nvars = 16, data.type.int = TRUE,
                 Domains = matrix(c(rep(1, 16), rep(16, 16)), nrow = 16, ncol = 2),
                 max = FALSE, pop.size = 100, max.generations = 10,
                 wait.generations = 6, boundary.enforcement = 2)
sort(unique(GenOpt$par))

# Refit on the selected columns (plus outcome, column 17) and report
# the residual sum of squares of the final model
gen_data <- final_df[c(1, 3, 6, 7, 11, 16, 17)]
final.lm <- lm(re78 ~ ., data = gen_data)
sum((summary(final.lm)$residuals)^2)
#------------------------------------------------------
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment