SermetPekin/SingleFile.R

## SingleFile.R
# Reproduction of Andrew Gelman's blog post:
# "Reanalysis of that Nobel prizewinning study of patents and innovation (with R and Stan code)"
# https://statmodeling.stat.columbia.edu/2025/10/21/reanalysis-of-that-nobel-prizewinning-study-of-patents-and-innovation/
# This script installs packages, downloads data, compiles Stan models, and reproduces plots in one run.

# ---- install-and-load ------------------------------------------------------
# Packages you need
pkgs <- c("haven", "arm", "rstanarm", "MASS", "cmdstanr")

# Ensure pak is available (fast, respects binaries)
if (!requireNamespace("pak", quietly = TRUE)) {
  install.packages("pak")
}

# Install any missing packages
missing <- setdiff(pkgs, rownames(installed.packages()))
if (length(missing)) {
  pak::pak(missing)
}

# Load packages quietly (stop with a clear message if something fails)
for (p in pkgs) {
  suppressPackageStartupMessages(
    library(p, character.only = TRUE)
  )
}

# OPTIONAL: One-time CmdStan toolchain install (comment out if not desired)
if (isTRUE(requireNamespace("cmdstanr", quietly = TRUE))) {
  cmdstan_path <- tryCatch(cmdstanr::cmdstan_path(), error = function(e) NA)
  if (is.na(cmdstan_path) || !dir.exists(cmdstan_path)) {
    message("Installing CmdStan (one-time build)…")
    cmdstanr::install_cmdstan()  # may take several minutes the first time
  }
}


hinge_stan <- "
functions {
  vector logistic_hinge(vector x, real x0, real a, real b0, real b1, real delta) {
    vector[size(x)] xdiff = x - x0;
    return a + b0 * xdiff + (b1 - b0) * delta * log1p_exp(xdiff / delta);
  }
}
data {
  real<lower=0> delta;
  int N;
  vector[N] x, y;
  int J1, J2;
  array[N] int<lower=1,upper=J1> group1;
  array[N] int<lower=1,upper=J2> group2;
}
parameters {
  real x0;
  real a, b0, b1;
  real<lower=0> sigma, sigma1, sigma2;
  vector<offset=0, multiplier=sigma1>[J1] a1;
  vector<offset=0, multiplier=sigma2>[J2] a2;
}
model {
  x0 ~ normal(1, 1);
  a ~ normal(0, 100);
  b0 ~ normal(0, 100);
  b1 ~ normal(0, 100);
  a1 ~ normal(0, sigma1);
  a2 ~ normal(0, sigma2);
  y ~ normal(logistic_hinge(x, x0, a, b0, b1, delta) + a1[group1] + a2[group2], sigma);
}
"

quadratic_stan <- "
data {
  int N;
  vector[N] x, y;
  int J1, J2;
  array[N] int<lower=1,upper=J1> group1;
  array[N] int<lower=1,upper=J2> group2;
}
parameters {
  real b0, b1, b2;
  real<lower=0> sigma, sigma1, sigma2;
  vector<offset=0, multiplier=sigma1>[J1] a1;
  vector<offset=0, multiplier=sigma2>[J2] a2;
}
model {
  b0 ~ normal(0, 100);
  b1 ~ normal(0, 100);
  b2 ~ normal(0, 100);
  a1 ~ normal(0, sigma1);
  a2 ~ normal(0, sigma2);
  y ~ normal(b0 + b1*x + b2*x^2 + a1[group1] + a2[group2], sigma);
}
"

# ---- 1) Helper: write a .stan file from a string, idempotently ------------
write_stan <- function(path, code) {
  dir.create(dirname(path), recursive = TRUE, showWarnings = FALSE)
  if (!file.exists(path) || !identical(readLines(path, warn = FALSE), strsplit(code, "\n", fixed = TRUE)[[1]])) {
    writeLines(code, path, useBytes = TRUE)
  }
  normalizePath(path, winslash = "/", mustWork = TRUE)
}

stan_dir <- "stan"
hinge_file <- write_stan(file.path(stan_dir, "hinge_multilevel.stan"), hinge_stan)
quad_file  <- write_stan(file.path(stan_dir, "quadratic_multilevel.stan"), quadratic_stan)
hinge_multilevel <- cmdstan_model(hinge_file)
quadratic_multilevel <- cmdstan_model(quad_file)

set.seed(123)

# ---- Data download & read ----
url <- "https://sites.stat.columbia.edu/gelman/stuff_for_blog/abbgh_data.dta"
local_path <- "data/abbgh_data.dta"

# Create data folder if missing
if (!dir.exists("data")) dir.create("data", recursive = TRUE)

# Download only if not already saved
if (!file.exists(local_path)) {
  download.file(url, destfile = local_path, mode = "wb")
}

# Read
abbgh <- haven::read_dta(local_path)

abbgh$patcw_rounded <- round(abbgh$patcw)
industry_names <- c("Textiles", "Apparel", "Lumber", "Furniture",
                    "Leather", "Stone, glass, concrete", "Primary metal", "Fabricated medal", "Machinery and computers", "Electric, not computers", "Transport equipment", "Passenger transit", "Freight transport", "Postal service", "Transport services", "Communications", "Electric, gas services")

add_curve <- function(fit, multilevel=FALSE, logscale=FALSE, shift=0, ...) {
  if (multilevel) {
    beta_hat <- fixef(fit)[1:3]
  }
  else {
    beta_hat <- coef(fit)[1:3]
  }
  if (logscale)
    curve(shift + beta_hat[1] + beta_hat[2]*x + beta_hat[3]*x^2, ..., add=TRUE)
  else
    curve(exp(shift + beta_hat[1] + beta_hat[2]*x + beta_hat[3]*x^2), ..., add=TRUE)
}

pdf("invertedu_1.pdf", height=4, width=4)
par(mfrow=c(1,1), oma=c(0,0,2,0), mar=c(3,3,1,1), mgp=c(1.7,.5,0), tck=-.01)
plot(abbgh$Lc, abbgh$patcw, xlab='"Competitiveness"', ylab="Weighted Patents", xlim=c(0.85, 1), pch=20, cex=.5, bty="l")
fit_1 <- glm(patcw_rounded ~ Lc + I(Lc^2), family=poisson(link="log"), data=abbgh)
display(fit_1)
add_curve(fit_1, col="green")
fit_2 <- glm(patcw_rounded ~ Lc + I(Lc^2), family=quasipoisson(link="log"), data=abbgh)
display(fit_2)
add_curve(fit_2, col="red")
fit_3 <- glm(patcw ~ Lc + I(Lc^2), family=quasipoisson(link="log"), data=abbgh)
display(fit_3)
add_curve(fit_3, col="blue")
fit_4 <- glm.nb(patcw_rounded ~ Lc + I(Lc^2), data=abbgh)
summary(fit_4)
add_curve(fit_4, col="pink")
fit_5 <- stan_glm(patcw_rounded ~ Lc + I(Lc^2), family=neg_binomial_2(link="log"), data=abbgh, refresh=0)
print(fit_5)
add_curve(fit_5, col="orange")
mtext("Quadratic regressions on original scale", side=3, line=0, outer=TRUE, cex=.9)

dev.off()

# The orange curve looks different from the others.
# Check by re-running the Bayesian negative binomial regression on optimize setting
fit_5_opt <- stan_glm(patcw_rounded ~ Lc + I(Lc^2), family=neg_binomial_2(link="log"), data=abbgh, algorithm="optimizing")
print(fit_5_opt)

adjust <- function(fit, data=abbgh, multilevel=FALSE) {
  if (multilevel) {
    sic2_effects <- ranef(fit)$sic2[,1]
    year_effects <- ranef(fit)$year[,1]
  }
  else {
    sic2_effects <- c(0, coef(fit)[grep("sic2", names(coef(fit)))])
    year_effects <- c(0, coef(fit)[grep("year", names(coef(fit)))])
  }
  sic2_effects + mean(year_effects)
}

fit_3a <- glm(patcw ~ Lc + I(Lc^2) + factor(sic2) + factor(year), family=quasipoisson(link="log"), data=abbgh)
display(fit_3a)
fit_4a <- glm.nb(patcw_rounded ~ Lc + I(Lc^2) + factor(sic2) + factor(year), data=abbgh)
summary(fit_4a)
fit_5a <- stan_glm(patcw_rounded ~ Lc + I(Lc^2) + factor(sic2) + factor(year), family=neg_binomial_2(link="log"), data=abbgh, refresh=0)
print(fit_5a)
fit_6a <- stan_glmer(patcw_rounded ~ Lc + I(Lc^2) + (1 | sic2) + (1 |  year), family=neg_binomial_2(link="log"), data=abbgh, refresh=0)
print(fit_6a)


industries <- unique(abbgh$sic2)
n_industries <- length(industries)
avg_patcw <- rep(NA, n_industries)
avg_Lc <- rep(NA, n_industries)
count <- 0
for (j in industries) {
  count <- count + 1
  cond <- abbgh$sic2==j
  avg_patcw[count] <- mean(abbgh$patcw[cond])
  avg_Lc[count] <- mean(abbgh$Lc[cond])
}
industries_sort <- rev(order(avg_patcw))

plot_grid <- function(fits, multilevel, logscale, colors, n_rows=4, n_cols=5, title){
  par(mfrow=c(n_rows, n_cols), oma=c(0,0,3,0), mar=c(2.5,2,1,1), mgp=c(1.2,.2,0), tck=-.02)
  x <- abbgh$Lc
  y <- if (logscale) log(abbgh$patcw + 1) else abbgh$patcw
  y_label <- if (logscale) "log (Wt Patents + 1)" else "Weighted Patents"
  for (i in 1:n_industries) {
    j <- industries_sort[i]
    grid_left <- i%%n_cols == 1
    grid_bottom <- i > n_industries - n_cols
    plot(x, y, xlab = if (grid_bottom) '"Competitiveness"' else "", ylab = if (grid_left) y_label else "", xlim=c(0.85, 1), pch=20, cex=.2, bty="l", col="gray", main=industry_names[j], cex.main=.8, cex.lab=.8, cex.axis=.8)
    for (k in 1:length(fits)) {
      add_curve(fits[[k]], multilevel=multilevel[k], logscale=logscale,
                shift=adjust(fits[[k]],multilevel=multilevel[k])[j], col=colors[k])
    }
    cond <- abbgh$sic2==industries[j]
    points(x[cond], y[cond], pch=20, cex=.5)
    lines(x[cond], y[cond], lwd=.5)
    points(x[cond][1], y[cond][1], pch=20, cex=1.5)
  }
  mtext(title, side=3, line=1, outer=TRUE, cex=.8)
}

pdf("invertedu_2.pdf", height=5, width=7)
plot_grid(list(fit_3a, fit_4a, fit_5a, fit_6a), multilevel=c(FALSE, FALSE, FALSE, TRUE), logscale=FALSE, colors=c("blue","pink","orange","purple"), title="Quadratic regressions on original scale, adjusting for industry and year")
dev.off()

abbgh$log_patcw <- log(abbgh$patcw + 1)

pdf("invertedu_3.pdf", height=4, width=4)
par(mfrow=c(1,1), oma=c(0,0,2,0), mar=c(3,3,1,1), mgp=c(1.7,.5,0), tck=-.01)
plot(abbgh$Lc, abbgh$log_patcw, xlab='"Competitiveness"', ylab="log (Wt Patents + 1)", xlim=c(0.85, 1), pch=20, cex=.5, bty="l")
fit_7 <- lm(log_patcw ~ Lc + I(Lc^2), data=abbgh)
display(fit_7)
add_curve(fit_7, logscale="TRUE", col="green")
mtext("Quadratic regressions on log scale", side=3, line=0, outer=TRUE, cex=.9)
dev.off()

fit_8 <- lm(log_patcw ~ Lc + I(Lc^2) + factor(sic2) + factor(year), data=abbgh)
summary(fit_8)
fit_9 <- stan_glmer(log_patcw ~ Lc + I(Lc^2) + (1 | sic2) + (1 |  year), data=abbgh, refresh=0)
print(fit_9)
pdf("invertedu_4.pdf", height=5, width=7)
plot_grid(list(fit_8, fit_9), multilevel=c(FALSE, TRUE), logscale=TRUE, colors=c("orange","purple"),
          title="Quadratic regressions on log scale, adjusting for industry and year")
dev.off()

industries_in_order <- rep(NA, n_industries)
for (j in 1:n_industries) {
  industries_in_order[abbgh$sic2==industries[j]] <- j
}
abbgh$avg_Lc <- avg_Lc[industries_in_order]

years <- unique(abbgh$year)
n_years <- length(years)
years_in_order <- rep(NA, n_years)
for (j in 1:n_years) {
  years_in_order[abbgh$year==years[j]] <- j
}

fit_10 <- stan_glmer(log_patcw ~ Lc + I(Lc^2) + avg_Lc + (1 | sic2) + (1 |  year), data=abbgh, refresh=0)
print(fit_10)

logistic_hinge <- function(x, x0, a, b0, b1, delta) {
  return(a + b0*(x-x0) + (b1-b0)*delta * log(1 + exp((x-x0)/delta)))
}

plot_grid_hinge <- function(fit, n_rows=4, n_cols=5, title){
  par(mfrow=c(n_rows, n_cols), oma=c(0,0,3,0), mar=c(2.5,2,1,1), mgp=c(1.2,.2,0), tck=-.02)
  x <- abbgh$Lc
  y <- log(abbgh$patcw + 1)
  sims <- fit$draws(format="df")
  x0_hat <- median(sims$x0)
  a_hat <- median(sims$a)
  b0_hat <- median(sims$b0)
  b1_hat <- median(sims$b1)
  n_sims <- nrow(sims)
  for (i in 1:n_industries) {
    j <- industries_sort[i]
    grid_left <- i%%n_cols == 1
    grid_bottom <- i > n_industries - n_cols
    plot(x, y, xlab = if (grid_bottom) '"Competitiveness"' else "", ylab = if (grid_left) "log (Wt. Patents + 1)" else "", xlim=c(0.85, 1), pch=20, cex=.2, bty="l", col="gray", main=industry_names[j], cex.main=.8, cex.lab=.8, cex.axis=.8)
    a1_sims <- as.matrix(fit$draws("a1", format="df"))[,1:n_industries]
    a2_sims <- as.matrix(fit$draws("a2", format="df"))[,1:n_years]
    for (s in sample(n_sims, 20)) {
      curve(logistic_hinge(x, sims$x0[s], sims$a[s], sims$b0[s], sims$b1[s], delta_true) + a1_sims[s,j] + mean(a2_sims[s,]), col="red", lwd=0.5, add=TRUE)
    }
    a1_hat <- apply(a1_sims, 2, median)
    a2_hat <- apply(a1_sims, 2, median)
    curve(logistic_hinge(x, x0_hat, a_hat, b0_hat, b1_hat, delta_true) + a1_hat[j] + mean(a2_hat), col="blue", lwd=1.5, add=TRUE)
    cond <- abbgh$sic2==industries[j]
    points(x[cond], y[cond], pch=20, cex=.5)
    lines(x[cond], y[cond], lwd=.5)
    points(x[cond][1], y[cond][1], pch=20, cex=1.5)
  }
  mtext(title, side=3, line=1, outer=TRUE, cex=.8)
}

delta_true <- 0.05
stan_data <- list(delta=delta_true, N=nrow(abbgh), x=abbgh$Lc, y=log(abbgh$patcw + 1), J1=n_industries, J2=n_years, group1=industries_in_order, group2=years_in_order)
fit_11 <- hinge_multilevel$sample(stan_data, parallel_chains=4, refresh=0)
print(fit_11)
sims_11 <- fit_11$draws(format="df")
print(mean(sims_11$b1 < 0))

pdf("invertedu_5.pdf", height=5, width=7)
plot_grid_hinge(fit_11, title="Hinge regression with uncertainty on log scale, adjusting for industry and year")
dev.off()

plot_grid_quadratic <- function(fit, n_rows=4, n_cols=5, title){
  par(mfrow=c(n_rows, n_cols), oma=c(0,0,3,0), mar=c(2.5,2,1,1), mgp=c(1.2,.2,0), tck=-.02)
  x <- abbgh$Lc
  y <- log(abbgh$patcw + 1)
  sims <- fit$draws(format="df")
  b0_hat <- median(sims$b0)
  b1_hat <- median(sims$b1)
  b2_hat <- median(sims$b2)
  n_sims <- nrow(sims)
  for (i in 1:n_industries) {
    j <- industries_sort[i]
    grid_left <- i%%n_cols == 1
    grid_bottom <- i > n_industries - n_cols
    plot(x, y, xlab = if (grid_bottom) '"Competitiveness"' else "", ylab = if (grid_left) "log (Wt. Patents + 1)" else "", xlim=c(0.85, 1), pch=20, cex=.2, bty="l", col="gray", main=industry_names[j], cex.main=.8, cex.lab=.8, cex.axis=.8)
    a1_sims <- as.matrix(fit$draws("a1", format="df"))[,1:n_industries]
    a2_sims <- as.matrix(fit$draws("a2", format="df"))[,1:n_years]
    for (s in sample(n_sims, 20)) {
      curve(sims$b0[s] + sims$b1[s]*x + sims$b2[s]*x^2 + a1_sims[s,j] + mean(a2_sims[s,]), col="red", lwd=0.5, add=TRUE)
    }
    a1_hat <- apply(a1_sims, 2, median)
    a2_hat <- apply(a1_sims, 2, median)
    curve(b0_hat + b1_hat*x + b2_hat*x^2 + a1_hat[j] + mean(a2_hat), col="blue", lwd=1.5, add=TRUE)
    cond <- abbgh$sic2==industries[j]
    points(x[cond], y[cond], pch=20, cex=.5)
    lines(x[cond], y[cond], lwd=.5)
    points(x[cond][1], y[cond][1], pch=20, cex=1.5)
  }
  mtext(title, side=3, line=1, outer=TRUE, cex=.8)
}

fit_12 <- quadratic_multilevel$sample(stan_data, parallel_chains=4, refresh=0)
print(fit_12)
sims_12 <- fit_12$draws(format="df")
x_peak <- -sims_12$b1/(2*sims_12$b2)
print(mean(x_peak > min(abbgh$Lc) & x_peak < max(abbgh$Lc)))

pdf("invertedu_6.pdf", height=5, width=7)
plot_grid_quadratic(fit_12, title="Quadratic regression with uncertainty on log scale, adjusting for industry and year")
dev.off()
	# Reproduction of Andrew Gelman's blog post:
	# "Reanalysis of that Nobel prizewinning study of patents and innovation (with R and Stan code)"
	# https://statmodeling.stat.columbia.edu/2025/10/21/reanalysis-of-that-nobel-prizewinning-study-of-patents-and-innovation/
	# This script installs packages, downloads data, compiles Stan models, and reproduces plots in one run.

	# ---- install-and-load ------------------------------------------------------
	# Packages you need
	pkgs <- c("haven", "arm", "rstanarm", "MASS", "cmdstanr")

	# Ensure pak is available (fast, respects binaries)
	if (!requireNamespace("pak", quietly = TRUE)) {
	install.packages("pak")
	}

	# Install any missing packages
	missing <- setdiff(pkgs, rownames(installed.packages()))
	if (length(missing)) {
	pak::pak(missing)
	}

	# Load packages quietly (stop with a clear message if something fails)
	for (p in pkgs) {
	suppressPackageStartupMessages(
	library(p, character.only = TRUE)
	)
	}

	# OPTIONAL: One-time CmdStan toolchain install (comment out if not desired)
	if (isTRUE(requireNamespace("cmdstanr", quietly = TRUE))) {
	cmdstan_path <- tryCatch(cmdstanr::cmdstan_path(), error = function(e) NA)
	if (is.na(cmdstan_path) \|\| !dir.exists(cmdstan_path)) {
	message("Installing CmdStan (one-time build)…")
	cmdstanr::install_cmdstan() # may take several minutes the first time
	}
	}


	hinge_stan <- "
	functions {
	vector logistic_hinge(vector x, real x0, real a, real b0, real b1, real delta) {
	vector[size(x)] xdiff = x - x0;
	return a + b0 * xdiff + (b1 - b0) * delta * log1p_exp(xdiff / delta);
	}
	}
	data {
	real<lower=0> delta;
	int N;
	vector[N] x, y;
	int J1, J2;
	array[N] int<lower=1,upper=J1> group1;
	array[N] int<lower=1,upper=J2> group2;
	}
	parameters {
	real x0;
	real a, b0, b1;
	real<lower=0> sigma, sigma1, sigma2;
	vector<offset=0, multiplier=sigma1>[J1] a1;
	vector<offset=0, multiplier=sigma2>[J2] a2;
	}
	model {
	x0 ~ normal(1, 1);
	a ~ normal(0, 100);
	b0 ~ normal(0, 100);
	b1 ~ normal(0, 100);
	a1 ~ normal(0, sigma1);
	a2 ~ normal(0, sigma2);
	y ~ normal(logistic_hinge(x, x0, a, b0, b1, delta) + a1[group1] + a2[group2], sigma);
	}
	"

	quadratic_stan <- "
	data {
	int N;
	vector[N] x, y;
	int J1, J2;
	array[N] int<lower=1,upper=J1> group1;
	array[N] int<lower=1,upper=J2> group2;
	}
	parameters {
	real b0, b1, b2;
	real<lower=0> sigma, sigma1, sigma2;
	vector<offset=0, multiplier=sigma1>[J1] a1;
	vector<offset=0, multiplier=sigma2>[J2] a2;
	}
	model {
	b0 ~ normal(0, 100);
	b1 ~ normal(0, 100);
	b2 ~ normal(0, 100);
	a1 ~ normal(0, sigma1);
	a2 ~ normal(0, sigma2);
	y ~ normal(b0 + b1x + b2x^2 + a1[group1] + a2[group2], sigma);
	}
	"

	# ---- 1) Helper: write a .stan file from a string, idempotently ------------
	write_stan <- function(path, code) {
	dir.create(dirname(path), recursive = TRUE, showWarnings = FALSE)
	if (!file.exists(path) \|\| !identical(readLines(path, warn = FALSE), strsplit(code, "\n", fixed = TRUE)[[1]])) {
	writeLines(code, path, useBytes = TRUE)
	}
	normalizePath(path, winslash = "/", mustWork = TRUE)
	}

	stan_dir <- "stan"
	hinge_file <- write_stan(file.path(stan_dir, "hinge_multilevel.stan"), hinge_stan)
	quad_file <- write_stan(file.path(stan_dir, "quadratic_multilevel.stan"), quadratic_stan)
	hinge_multilevel <- cmdstan_model(hinge_file)
	quadratic_multilevel <- cmdstan_model(quad_file)

	set.seed(123)

	# ---- Data download & read ----
	url <- "https://sites.stat.columbia.edu/gelman/stuff_for_blog/abbgh_data.dta"
	local_path <- "data/abbgh_data.dta"

	# Create data folder if missing
	if (!dir.exists("data")) dir.create("data", recursive = TRUE)

	# Download only if not already saved
	if (!file.exists(local_path)) {
	download.file(url, destfile = local_path, mode = "wb")
	}

	# Read
	abbgh <- haven::read_dta(local_path)

	abbgh$patcw_rounded <- round(abbgh$patcw)
	industry_names <- c("Textiles", "Apparel", "Lumber", "Furniture",
	"Leather", "Stone, glass, concrete", "Primary metal", "Fabricated medal", "Machinery and computers", "Electric, not computers", "Transport equipment", "Passenger transit", "Freight transport", "Postal service", "Transport services", "Communications", "Electric, gas services")

	add_curve <- function(fit, multilevel=FALSE, logscale=FALSE, shift=0, ...) {
	if (multilevel) {
	beta_hat <- fixef(fit)[1:3]
	}
	else {
	beta_hat <- coef(fit)[1:3]
	}
	if (logscale)
	curve(shift + beta_hat[1] + beta_hat[2]x + beta_hat[3]x^2, ..., add=TRUE)
	else
	curve(exp(shift + beta_hat[1] + beta_hat[2]x + beta_hat[3]x^2), ..., add=TRUE)
	}

	pdf("invertedu_1.pdf", height=4, width=4)
	par(mfrow=c(1,1), oma=c(0,0,2,0), mar=c(3,3,1,1), mgp=c(1.7,.5,0), tck=-.01)
	plot(abbgh$Lc, abbgh$patcw, xlab='"Competitiveness"', ylab="Weighted Patents", xlim=c(0.85, 1), pch=20, cex=.5, bty="l")
	fit_1 <- glm(patcw_rounded ~ Lc + I(Lc^2), family=poisson(link="log"), data=abbgh)
	display(fit_1)
	add_curve(fit_1, col="green")
	fit_2 <- glm(patcw_rounded ~ Lc + I(Lc^2), family=quasipoisson(link="log"), data=abbgh)
	display(fit_2)
	add_curve(fit_2, col="red")
	fit_3 <- glm(patcw ~ Lc + I(Lc^2), family=quasipoisson(link="log"), data=abbgh)
	display(fit_3)
	add_curve(fit_3, col="blue")
	fit_4 <- glm.nb(patcw_rounded ~ Lc + I(Lc^2), data=abbgh)
	summary(fit_4)
	add_curve(fit_4, col="pink")
	fit_5 <- stan_glm(patcw_rounded ~ Lc + I(Lc^2), family=neg_binomial_2(link="log"), data=abbgh, refresh=0)
	print(fit_5)
	add_curve(fit_5, col="orange")
	mtext("Quadratic regressions on original scale", side=3, line=0, outer=TRUE, cex=.9)

	dev.off()

	# The orange curve looks different from the others.
	# Check by re-running the Bayesian negative binomial regression on optimize setting
	fit_5_opt <- stan_glm(patcw_rounded ~ Lc + I(Lc^2), family=neg_binomial_2(link="log"), data=abbgh, algorithm="optimizing")
	print(fit_5_opt)

	adjust <- function(fit, data=abbgh, multilevel=FALSE) {
	if (multilevel) {
	sic2_effects <- ranef(fit)$sic2[,1]
	year_effects <- ranef(fit)$year[,1]
	}
	else {
	sic2_effects <- c(0, coef(fit)[grep("sic2", names(coef(fit)))])
	year_effects <- c(0, coef(fit)[grep("year", names(coef(fit)))])
	}
	sic2_effects + mean(year_effects)
	}

	fit_3a <- glm(patcw ~ Lc + I(Lc^2) + factor(sic2) + factor(year), family=quasipoisson(link="log"), data=abbgh)
	display(fit_3a)
	fit_4a <- glm.nb(patcw_rounded ~ Lc + I(Lc^2) + factor(sic2) + factor(year), data=abbgh)
	summary(fit_4a)
	fit_5a <- stan_glm(patcw_rounded ~ Lc + I(Lc^2) + factor(sic2) + factor(year), family=neg_binomial_2(link="log"), data=abbgh, refresh=0)
	print(fit_5a)
	fit_6a <- stan_glmer(patcw_rounded ~ Lc + I(Lc^2) + (1 \| sic2) + (1 \| year), family=neg_binomial_2(link="log"), data=abbgh, refresh=0)
	print(fit_6a)


	industries <- unique(abbgh$sic2)
	n_industries <- length(industries)
	avg_patcw <- rep(NA, n_industries)
	avg_Lc <- rep(NA, n_industries)
	count <- 0
	for (j in industries) {
	count <- count + 1
	cond <- abbgh$sic2==j
	avg_patcw[count] <- mean(abbgh$patcw[cond])
	avg_Lc[count] <- mean(abbgh$Lc[cond])
	}
	industries_sort <- rev(order(avg_patcw))

	plot_grid <- function(fits, multilevel, logscale, colors, n_rows=4, n_cols=5, title){
	par(mfrow=c(n_rows, n_cols), oma=c(0,0,3,0), mar=c(2.5,2,1,1), mgp=c(1.2,.2,0), tck=-.02)
	x <- abbgh$Lc
	y <- if (logscale) log(abbgh$patcw + 1) else abbgh$patcw
	y_label <- if (logscale) "log (Wt Patents + 1)" else "Weighted Patents"
	for (i in 1:n_industries) {
	j <- industries_sort[i]
	grid_left <- i%%n_cols == 1
	grid_bottom <- i > n_industries - n_cols
	plot(x, y, xlab = if (grid_bottom) '"Competitiveness"' else "", ylab = if (grid_left) y_label else "", xlim=c(0.85, 1), pch=20, cex=.2, bty="l", col="gray", main=industry_names[j], cex.main=.8, cex.lab=.8, cex.axis=.8)
	for (k in 1:length(fits)) {
	add_curve(fits[[k]], multilevel=multilevel[k], logscale=logscale,
	shift=adjust(fits[[k]],multilevel=multilevel[k])[j], col=colors[k])
	}
	cond <- abbgh$sic2==industries[j]
	points(x[cond], y[cond], pch=20, cex=.5)
	lines(x[cond], y[cond], lwd=.5)
	points(x[cond][1], y[cond][1], pch=20, cex=1.5)
	}
	mtext(title, side=3, line=1, outer=TRUE, cex=.8)
	}

	pdf("invertedu_2.pdf", height=5, width=7)
	plot_grid(list(fit_3a, fit_4a, fit_5a, fit_6a), multilevel=c(FALSE, FALSE, FALSE, TRUE), logscale=FALSE, colors=c("blue","pink","orange","purple"), title="Quadratic regressions on original scale, adjusting for industry and year")
	dev.off()

	abbgh$log_patcw <- log(abbgh$patcw + 1)

	pdf("invertedu_3.pdf", height=4, width=4)
	par(mfrow=c(1,1), oma=c(0,0,2,0), mar=c(3,3,1,1), mgp=c(1.7,.5,0), tck=-.01)
	plot(abbgh$Lc, abbgh$log_patcw, xlab='"Competitiveness"', ylab="log (Wt Patents + 1)", xlim=c(0.85, 1), pch=20, cex=.5, bty="l")
	fit_7 <- lm(log_patcw ~ Lc + I(Lc^2), data=abbgh)
	display(fit_7)
	add_curve(fit_7, logscale="TRUE", col="green")
	mtext("Quadratic regressions on log scale", side=3, line=0, outer=TRUE, cex=.9)
	dev.off()

	fit_8 <- lm(log_patcw ~ Lc + I(Lc^2) + factor(sic2) + factor(year), data=abbgh)
	summary(fit_8)
	fit_9 <- stan_glmer(log_patcw ~ Lc + I(Lc^2) + (1 \| sic2) + (1 \| year), data=abbgh, refresh=0)
	print(fit_9)
	pdf("invertedu_4.pdf", height=5, width=7)
	plot_grid(list(fit_8, fit_9), multilevel=c(FALSE, TRUE), logscale=TRUE, colors=c("orange","purple"),
	title="Quadratic regressions on log scale, adjusting for industry and year")
	dev.off()

	industries_in_order <- rep(NA, n_industries)
	for (j in 1:n_industries) {
	industries_in_order[abbgh$sic2==industries[j]] <- j
	}
	abbgh$avg_Lc <- avg_Lc[industries_in_order]

	years <- unique(abbgh$year)
	n_years <- length(years)
	years_in_order <- rep(NA, n_years)
	for (j in 1:n_years) {
	years_in_order[abbgh$year==years[j]] <- j
	}

	fit_10 <- stan_glmer(log_patcw ~ Lc + I(Lc^2) + avg_Lc + (1 \| sic2) + (1 \| year), data=abbgh, refresh=0)
	print(fit_10)

	logistic_hinge <- function(x, x0, a, b0, b1, delta) {
	return(a + b0(x-x0) + (b1-b0)delta * log(1 + exp((x-x0)/delta)))
	}

	plot_grid_hinge <- function(fit, n_rows=4, n_cols=5, title){
	par(mfrow=c(n_rows, n_cols), oma=c(0,0,3,0), mar=c(2.5,2,1,1), mgp=c(1.2,.2,0), tck=-.02)
	x <- abbgh$Lc
	y <- log(abbgh$patcw + 1)
	sims <- fit$draws(format="df")
	x0_hat <- median(sims$x0)
	a_hat <- median(sims$a)
	b0_hat <- median(sims$b0)
	b1_hat <- median(sims$b1)
	n_sims <- nrow(sims)
	for (i in 1:n_industries) {
	j <- industries_sort[i]
	grid_left <- i%%n_cols == 1
	grid_bottom <- i > n_industries - n_cols
	plot(x, y, xlab = if (grid_bottom) '"Competitiveness"' else "", ylab = if (grid_left) "log (Wt. Patents + 1)" else "", xlim=c(0.85, 1), pch=20, cex=.2, bty="l", col="gray", main=industry_names[j], cex.main=.8, cex.lab=.8, cex.axis=.8)
	a1_sims <- as.matrix(fit$draws("a1", format="df"))[,1:n_industries]
	a2_sims <- as.matrix(fit$draws("a2", format="df"))[,1:n_years]
	for (s in sample(n_sims, 20)) {
	curve(logistic_hinge(x, sims$x0[s], sims$a[s], sims$b0[s], sims$b1[s], delta_true) + a1_sims[s,j] + mean(a2_sims[s,]), col="red", lwd=0.5, add=TRUE)
	}
	a1_hat <- apply(a1_sims, 2, median)
	a2_hat <- apply(a1_sims, 2, median)
	curve(logistic_hinge(x, x0_hat, a_hat, b0_hat, b1_hat, delta_true) + a1_hat[j] + mean(a2_hat), col="blue", lwd=1.5, add=TRUE)
	cond <- abbgh$sic2==industries[j]
	points(x[cond], y[cond], pch=20, cex=.5)
	lines(x[cond], y[cond], lwd=.5)
	points(x[cond][1], y[cond][1], pch=20, cex=1.5)
	}
	mtext(title, side=3, line=1, outer=TRUE, cex=.8)
	}

	delta_true <- 0.05
	stan_data <- list(delta=delta_true, N=nrow(abbgh), x=abbgh$Lc, y=log(abbgh$patcw + 1), J1=n_industries, J2=n_years, group1=industries_in_order, group2=years_in_order)
	fit_11 <- hinge_multilevel$sample(stan_data, parallel_chains=4, refresh=0)
	print(fit_11)
	sims_11 <- fit_11$draws(format="df")
	print(mean(sims_11$b1 < 0))

	pdf("invertedu_5.pdf", height=5, width=7)
	plot_grid_hinge(fit_11, title="Hinge regression with uncertainty on log scale, adjusting for industry and year")
	dev.off()

	plot_grid_quadratic <- function(fit, n_rows=4, n_cols=5, title){
	par(mfrow=c(n_rows, n_cols), oma=c(0,0,3,0), mar=c(2.5,2,1,1), mgp=c(1.2,.2,0), tck=-.02)
	x <- abbgh$Lc
	y <- log(abbgh$patcw + 1)
	sims <- fit$draws(format="df")
	b0_hat <- median(sims$b0)
	b1_hat <- median(sims$b1)
	b2_hat <- median(sims$b2)
	n_sims <- nrow(sims)
	for (i in 1:n_industries) {
	j <- industries_sort[i]
	grid_left <- i%%n_cols == 1
	grid_bottom <- i > n_industries - n_cols
	plot(x, y, xlab = if (grid_bottom) '"Competitiveness"' else "", ylab = if (grid_left) "log (Wt. Patents + 1)" else "", xlim=c(0.85, 1), pch=20, cex=.2, bty="l", col="gray", main=industry_names[j], cex.main=.8, cex.lab=.8, cex.axis=.8)
	a1_sims <- as.matrix(fit$draws("a1", format="df"))[,1:n_industries]
	a2_sims <- as.matrix(fit$draws("a2", format="df"))[,1:n_years]
	for (s in sample(n_sims, 20)) {
	curve(sims$b0[s] + sims$b1[s]x + sims$b2[s]x^2 + a1_sims[s,j] + mean(a2_sims[s,]), col="red", lwd=0.5, add=TRUE)
	}
	a1_hat <- apply(a1_sims, 2, median)
	a2_hat <- apply(a1_sims, 2, median)
	curve(b0_hat + b1_hatx + b2_hatx^2 + a1_hat[j] + mean(a2_hat), col="blue", lwd=1.5, add=TRUE)
	cond <- abbgh$sic2==industries[j]
	points(x[cond], y[cond], pch=20, cex=.5)
	lines(x[cond], y[cond], lwd=.5)
	points(x[cond][1], y[cond][1], pch=20, cex=1.5)
	}
	mtext(title, side=3, line=1, outer=TRUE, cex=.8)
	}

	fit_12 <- quadratic_multilevel$sample(stan_data, parallel_chains=4, refresh=0)
	print(fit_12)
	sims_12 <- fit_12$draws(format="df")
	x_peak <- -sims_12$b1/(2*sims_12$b2)
	print(mean(x_peak > min(abbgh$Lc) & x_peak < max(abbgh$Lc)))

	pdf("invertedu_6.pdf", height=5, width=7)
	plot_grid_quadratic(fit_12, title="Quadratic regression with uncertainty on log scale, adjusting for industry and year")
	dev.off()
No results found