diamonaj/CS130_voting.R

## CS130_voting.R
#####################################
#####################################
#####################################

# Run a logistic regression and generate expected values of turnout, holding some predictors constant while others vary.


df <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSjMB6H07nhXmgdARkAMkUVWt7QPlzdD-RrDor2g_BSL9vil8V4efJ-iO-nQtZuqPE_klZPi6qNJ9Pw/pub?gid=921072292&single=true&output=csv")

glm2 <- glm(turnout ~ ., data = df, family = binomial)
summary(glm2)
#white
#age
#educate
#income
#agesqrd
#####################################
#####################################
#####################################

# CODE CELL 2

newdata_creator <- function(white = NULL,
                            age = NULL,
                            educate = NULL,
                            income = NULL)
{
  dataf <- data.frame(
    white = white,
    age = age,
    educate = educate,
    income = income,
    agesqrd = age^2 / 100
  )

return(dataf)
}


new_data_thisperson <- newdata_creator(0,
                                       38,
                                       12,
                                       4)
predict(glm2, newdata = new_data_thisperson, type = "response")
#74%

new_data_educatedperson <- newdata_creator(0,
                                       38,
                                       20,
                                       4)
predict(glm2, newdata = new_data_educatedperson, type = "response")
#92%


new_data_lastperson <- newdata_creator(mean(df$white),
                                       38,
                                       16,
                                       mean(df$income))
predict(glm2, newdata = new_data_lastperson, type = "response")
#86.7%
#####################################
#####################################
#####################################

# CODE CELL 3
######obtain critical values for 99% confidence interval
###### wider than the 95% confidence interval
qq <- qnorm(c(0.005, 0.995)) # -2.575829  2.575829
# -2.575829  2.575829

new_data_lastperson <- newdata_creator(mean(df$white),
                                       38,
                                       16,
                                       mean(df$income))


# now predict for that person, on the LINEAR SCALE
predict(glm2, newdata = new_data_lastperson, se.fit = TRUE)
## 1.88 is the point-estimate of the linear logistic prediction
## 0.038 is the standard error associated with that prediction
## we need to go 2.57 up and down from 1.88, then apply log f(x)

# THEN CALCULATE THE INTERVAL OF PROBABILITIES using logistic function---the "plogis()" command applies the simple logistic function we learned in class.  qq[1] and qq[2] are +/- 2.57
plogis(1.880385  + qq[1]*0.0389619) # we are going down -2.57*
# lower bound 99% CI for prediction = 0.855700051195481

plogis(1.880385 + qq[2]*0.0389619)  # we are going up   +2.57*
# upper bound 99% CI for prediction = 0.878760472302368

0.878760472302368
###### LOOPING NOW THRU EVERY AGE ##################
storage_vector_undergrad <- matrix(NA, nrow = 2, ncol = 78)
for (age in c(18:95)) {
    pr <- predict(glm2,
                  newdata = newdata_creator(mean(df$white),
                                            age,
                                            16,
                                            mean(df$income)),
                  se.fit = TRUE)

    storage_vector_undergrad[, age - 17] <-
       c(plogis(pr$fit + qq[1]*pr$se.fit), # lower bound
         plogis(pr$fit + qq[2]*pr$se.fit)) # upper bound
  }


# LET'S LOOK AT THE STORAGE MATRIX
storage_vector_undergrad[,1:5]
#####################################
#####################################
#####################################

## CODE CELL 4

storage_vector_undergrad # to see relevant col, it's age - 17 (e.g., 38 - 17)


### REPEAT NOW FOR PEOPLE WITH ONLY 12 YEARS (HS) EDUCATION
storage_vector_highschool <- matrix(NA, nrow = 2, ncol = 78)
for (age in c(18:95)) {
  pr <- predict(glm2,
                newdata = newdata_creator(
                    mean(df$white),
                    age,
                    12,
                    mean(df$income)),
                se.fit = TRUE)


    storage_vector_highschool[, age - 17] <-
       c(plogis(pr$fit + qq[1]*pr$se.fit), # lower bound
         plogis(pr$fit + qq[2]*pr$se.fit)) # upper bound
}

# ONLY CREATE AXES AND LABELS--NO DATA PLOTTED
plot(x = c(1:100), y = c(1:100), type = "n",
     xlim = c(18,95), ylim = c(0,1),
     main = "Probability of voting by age",
     xlab = "Age of Respondent",
     ylab = "Probability of Voting")

#####################################
#####################################
#####################################

# CODE CELL 5

# ACTUALLY PLOT THE LINE SEGMENTS IN THE PLOT

# FIRST, CALL THE PLOT AGAIN
# ONLY CREATE AXES AND LABELS--NO DATA PLOTTED
plot(x = c(1:100), y = c(1:100), type = "n",
     xlim = c(18,95), ylim = c(0,1),
     main = "Probability of voting by age",
     xlab = "Age of Respondent",
     ylab = "Probability of Voting")

## THEN DRAW THE SEGMENTS IN THE PLOT
for (age in 18:95) {
  segments(
    x0 = age,
    y0 = storage_vector_undergrad[1, age - 17],
    x1 = age,
    y1 = storage_vector_undergrad[2, age - 17],
    lwd = 2)
}
for (age in 18:95) {
  segments(
    x0 = age,
    y0 = storage_vector_highschool[1, age - 17],
    x1 = age,
    y1 = storage_vector_highschool[2, age - 17],
    lwd = 2)
}
	#####################################
	#####################################
	#####################################

	# Run a logistic regression and generate expected values of turnout, holding some predictors constant while others vary.


	df <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSjMB6H07nhXmgdARkAMkUVWt7QPlzdD-RrDor2g_BSL9vil8V4efJ-iO-nQtZuqPE_klZPi6qNJ9Pw/pub?gid=921072292&single=true&output=csv")

	glm2 <- glm(turnout ~ ., data = df, family = binomial)
	summary(glm2)
	#white
	#age
	#educate
	#income
	#agesqrd
	#####################################
	#####################################
	#####################################

	# CODE CELL 2

	newdata_creator <- function(white = NULL,
	age = NULL,
	educate = NULL,
	income = NULL)
	{
	dataf <- data.frame(
	white = white,
	age = age,
	educate = educate,
	income = income,
	agesqrd = age^2 / 100
	)

	return(dataf)
	}



	new_data_thisperson <- newdata_creator(0,
	38,
	12,
	4)
	predict(glm2, newdata = new_data_thisperson, type = "response")
	#74%

	new_data_educatedperson <- newdata_creator(0,
	38,
	20,
	4)
	predict(glm2, newdata = new_data_educatedperson, type = "response")
	#92%


	new_data_lastperson <- newdata_creator(mean(df$white),
	38,
	16,
	mean(df$income))
	predict(glm2, newdata = new_data_lastperson, type = "response")
	#86.7%
	#####################################
	#####################################
	#####################################

	# CODE CELL 3
	######obtain critical values for 99% confidence interval
	###### wider than the 95% confidence interval
	qq <- qnorm(c(0.005, 0.995)) # -2.575829 2.575829
	# -2.575829 2.575829

	new_data_lastperson <- newdata_creator(mean(df$white),
	38,
	16,
	mean(df$income))


	# now predict for that person, on the LINEAR SCALE
	predict(glm2, newdata = new_data_lastperson, se.fit = TRUE)
	## 1.88 is the point-estimate of the linear logistic prediction
	## 0.038 is the standard error associated with that prediction
	## we need to go 2.57 up and down from 1.88, then apply log f(x)

	# THEN CALCULATE THE INTERVAL OF PROBABILITIES using logistic function---the "plogis()" command applies the simple logistic function we learned in class. qq[1] and qq[2] are +/- 2.57
	plogis(1.880385 + qq[1]0.0389619) # we are going down -2.57
	# lower bound 99% CI for prediction = 0.855700051195481

	plogis(1.880385 + qq[2]0.0389619) # we are going up +2.57
	# upper bound 99% CI for prediction = 0.878760472302368

	0.878760472302368
	###### LOOPING NOW THRU EVERY AGE ##################
	storage_vector_undergrad <- matrix(NA, nrow = 2, ncol = 78)
	for (age in c(18:95)) {
	pr <- predict(glm2,
	newdata = newdata_creator(mean(df$white),
	age,
	16,
	mean(df$income)),
	se.fit = TRUE)

	storage_vector_undergrad[, age - 17] <-
	c(plogis(pr$fit + qq[1]*pr$se.fit), # lower bound
	plogis(pr$fit + qq[2]*pr$se.fit)) # upper bound
	}


	# LET'S LOOK AT THE STORAGE MATRIX
	storage_vector_undergrad[,1:5]
	#####################################
	#####################################
	#####################################

	## CODE CELL 4

	storage_vector_undergrad # to see relevant col, it's age - 17 (e.g., 38 - 17)


	### REPEAT NOW FOR PEOPLE WITH ONLY 12 YEARS (HS) EDUCATION
	storage_vector_highschool <- matrix(NA, nrow = 2, ncol = 78)
	for (age in c(18:95)) {
	pr <- predict(glm2,
	newdata = newdata_creator(
	mean(df$white),
	age,
	12,
	mean(df$income)),
	se.fit = TRUE)


	storage_vector_highschool[, age - 17] <-
	c(plogis(pr$fit + qq[1]*pr$se.fit), # lower bound
	plogis(pr$fit + qq[2]*pr$se.fit)) # upper bound
	}

	# ONLY CREATE AXES AND LABELS--NO DATA PLOTTED
	plot(x = c(1:100), y = c(1:100), type = "n",
	xlim = c(18,95), ylim = c(0,1),
	main = "Probability of voting by age",
	xlab = "Age of Respondent",
	ylab = "Probability of Voting")

	#####################################
	#####################################
	#####################################

	# CODE CELL 5

	# ACTUALLY PLOT THE LINE SEGMENTS IN THE PLOT

	# FIRST, CALL THE PLOT AGAIN
	# ONLY CREATE AXES AND LABELS--NO DATA PLOTTED
	plot(x = c(1:100), y = c(1:100), type = "n",
	xlim = c(18,95), ylim = c(0,1),
	main = "Probability of voting by age",
	xlab = "Age of Respondent",
	ylab = "Probability of Voting")

	## THEN DRAW THE SEGMENTS IN THE PLOT
	for (age in 18:95) {
	segments(
	x0 = age,
	y0 = storage_vector_undergrad[1, age - 17],
	x1 = age,
	y1 = storage_vector_undergrad[2, age - 17],
	lwd = 2)
	}
	for (age in 18:95) {
	segments(
	x0 = age,
	y0 = storage_vector_highschool[1, age - 17],
	x1 = age,
	y1 = storage_vector_highschool[2, age - 17],
	lwd = 2)
	}
No results found