Last active
October 23, 2025 05:40
-
-
Save SermetPekin/5814725112e09df1388e3bc2851de379 to your computer and use it in GitHub Desktop.
Fully reproducible R and Stan script for Andrew Gelman’s “Reanalysis of that Nobel prize–winning study of patents and innovation”
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Reproduction of Andrew Gelman's blog post: | |
| # "Reanalysis of that Nobel prizewinning study of patents and innovation (with R and Stan code)" | |
| # https://statmodeling.stat.columbia.edu/2025/10/21/reanalysis-of-that-nobel-prizewinning-study-of-patents-and-innovation/ | |
| # This script installs packages, downloads data, compiles Stan models, and reproduces plots in one run. | |
| # ---- install-and-load ------------------------------------------------------ | |
| # Packages you need | |
| pkgs <- c("haven", "arm", "rstanarm", "MASS", "cmdstanr") | |
| # Ensure pak is available (fast, respects binaries) | |
| if (!requireNamespace("pak", quietly = TRUE)) { | |
| install.packages("pak") | |
| } | |
| # Install any missing packages | |
| missing <- setdiff(pkgs, rownames(installed.packages())) | |
| if (length(missing)) { | |
| pak::pak(missing) | |
| } | |
| # Load packages quietly (stop with a clear message if something fails) | |
| for (p in pkgs) { | |
| suppressPackageStartupMessages( | |
| library(p, character.only = TRUE) | |
| ) | |
| } | |
| # OPTIONAL: One-time CmdStan toolchain install (comment out if not desired) | |
| if (isTRUE(requireNamespace("cmdstanr", quietly = TRUE))) { | |
| cmdstan_path <- tryCatch(cmdstanr::cmdstan_path(), error = function(e) NA) | |
| if (is.na(cmdstan_path) || !dir.exists(cmdstan_path)) { | |
| message("Installing CmdStan (one-time build)…") | |
| cmdstanr::install_cmdstan() # may take several minutes the first time | |
| } | |
| } | |
| hinge_stan <- " | |
| functions { | |
| vector logistic_hinge(vector x, real x0, real a, real b0, real b1, real delta) { | |
| vector[size(x)] xdiff = x - x0; | |
| return a + b0 * xdiff + (b1 - b0) * delta * log1p_exp(xdiff / delta); | |
| } | |
| } | |
| data { | |
| real<lower=0> delta; | |
| int N; | |
| vector[N] x, y; | |
| int J1, J2; | |
| array[N] int<lower=1,upper=J1> group1; | |
| array[N] int<lower=1,upper=J2> group2; | |
| } | |
| parameters { | |
| real x0; | |
| real a, b0, b1; | |
| real<lower=0> sigma, sigma1, sigma2; | |
| vector<offset=0, multiplier=sigma1>[J1] a1; | |
| vector<offset=0, multiplier=sigma2>[J2] a2; | |
| } | |
| model { | |
| x0 ~ normal(1, 1); | |
| a ~ normal(0, 100); | |
| b0 ~ normal(0, 100); | |
| b1 ~ normal(0, 100); | |
| a1 ~ normal(0, sigma1); | |
| a2 ~ normal(0, sigma2); | |
| y ~ normal(logistic_hinge(x, x0, a, b0, b1, delta) + a1[group1] + a2[group2], sigma); | |
| } | |
| " | |
| quadratic_stan <- " | |
| data { | |
| int N; | |
| vector[N] x, y; | |
| int J1, J2; | |
| array[N] int<lower=1,upper=J1> group1; | |
| array[N] int<lower=1,upper=J2> group2; | |
| } | |
| parameters { | |
| real b0, b1, b2; | |
| real<lower=0> sigma, sigma1, sigma2; | |
| vector<offset=0, multiplier=sigma1>[J1] a1; | |
| vector<offset=0, multiplier=sigma2>[J2] a2; | |
| } | |
| model { | |
| b0 ~ normal(0, 100); | |
| b1 ~ normal(0, 100); | |
| b2 ~ normal(0, 100); | |
| a1 ~ normal(0, sigma1); | |
| a2 ~ normal(0, sigma2); | |
| y ~ normal(b0 + b1*x + b2*x^2 + a1[group1] + a2[group2], sigma); | |
| } | |
| " | |
| # ---- 1) Helper: write a .stan file from a string, idempotently ------------ | |
| write_stan <- function(path, code) { | |
| dir.create(dirname(path), recursive = TRUE, showWarnings = FALSE) | |
| if (!file.exists(path) || !identical(readLines(path, warn = FALSE), strsplit(code, "\n", fixed = TRUE)[[1]])) { | |
| writeLines(code, path, useBytes = TRUE) | |
| } | |
| normalizePath(path, winslash = "/", mustWork = TRUE) | |
| } | |
| stan_dir <- "stan" | |
| hinge_file <- write_stan(file.path(stan_dir, "hinge_multilevel.stan"), hinge_stan) | |
| quad_file <- write_stan(file.path(stan_dir, "quadratic_multilevel.stan"), quadratic_stan) | |
| hinge_multilevel <- cmdstan_model(hinge_file) | |
| quadratic_multilevel <- cmdstan_model(quad_file) | |
| set.seed(123) | |
| # ---- Data download & read ---- | |
| url <- "https://sites.stat.columbia.edu/gelman/stuff_for_blog/abbgh_data.dta" | |
| local_path <- "data/abbgh_data.dta" | |
| # Create data folder if missing | |
| if (!dir.exists("data")) dir.create("data", recursive = TRUE) | |
| # Download only if not already saved | |
| if (!file.exists(local_path)) { | |
| download.file(url, destfile = local_path, mode = "wb") | |
| } | |
| # Read | |
| abbgh <- haven::read_dta(local_path) | |
| abbgh$patcw_rounded <- round(abbgh$patcw) | |
| industry_names <- c("Textiles", "Apparel", "Lumber", "Furniture", | |
| "Leather", "Stone, glass, concrete", "Primary metal", "Fabricated medal", "Machinery and computers", "Electric, not computers", "Transport equipment", "Passenger transit", "Freight transport", "Postal service", "Transport services", "Communications", "Electric, gas services") | |
| add_curve <- function(fit, multilevel=FALSE, logscale=FALSE, shift=0, ...) { | |
| if (multilevel) { | |
| beta_hat <- fixef(fit)[1:3] | |
| } | |
| else { | |
| beta_hat <- coef(fit)[1:3] | |
| } | |
| if (logscale) | |
| curve(shift + beta_hat[1] + beta_hat[2]*x + beta_hat[3]*x^2, ..., add=TRUE) | |
| else | |
| curve(exp(shift + beta_hat[1] + beta_hat[2]*x + beta_hat[3]*x^2), ..., add=TRUE) | |
| } | |
| pdf("invertedu_1.pdf", height=4, width=4) | |
| par(mfrow=c(1,1), oma=c(0,0,2,0), mar=c(3,3,1,1), mgp=c(1.7,.5,0), tck=-.01) | |
| plot(abbgh$Lc, abbgh$patcw, xlab='"Competitiveness"', ylab="Weighted Patents", xlim=c(0.85, 1), pch=20, cex=.5, bty="l") | |
| fit_1 <- glm(patcw_rounded ~ Lc + I(Lc^2), family=poisson(link="log"), data=abbgh) | |
| display(fit_1) | |
| add_curve(fit_1, col="green") | |
| fit_2 <- glm(patcw_rounded ~ Lc + I(Lc^2), family=quasipoisson(link="log"), data=abbgh) | |
| display(fit_2) | |
| add_curve(fit_2, col="red") | |
| fit_3 <- glm(patcw ~ Lc + I(Lc^2), family=quasipoisson(link="log"), data=abbgh) | |
| display(fit_3) | |
| add_curve(fit_3, col="blue") | |
| fit_4 <- glm.nb(patcw_rounded ~ Lc + I(Lc^2), data=abbgh) | |
| summary(fit_4) | |
| add_curve(fit_4, col="pink") | |
| fit_5 <- stan_glm(patcw_rounded ~ Lc + I(Lc^2), family=neg_binomial_2(link="log"), data=abbgh, refresh=0) | |
| print(fit_5) | |
| add_curve(fit_5, col="orange") | |
| mtext("Quadratic regressions on original scale", side=3, line=0, outer=TRUE, cex=.9) | |
| dev.off() | |
| # The orange curve looks different from the others. | |
| # Check by re-running the Bayesian negative binomial regression on optimize setting | |
| fit_5_opt <- stan_glm(patcw_rounded ~ Lc + I(Lc^2), family=neg_binomial_2(link="log"), data=abbgh, algorithm="optimizing") | |
| print(fit_5_opt) | |
| adjust <- function(fit, data=abbgh, multilevel=FALSE) { | |
| if (multilevel) { | |
| sic2_effects <- ranef(fit)$sic2[,1] | |
| year_effects <- ranef(fit)$year[,1] | |
| } | |
| else { | |
| sic2_effects <- c(0, coef(fit)[grep("sic2", names(coef(fit)))]) | |
| year_effects <- c(0, coef(fit)[grep("year", names(coef(fit)))]) | |
| } | |
| sic2_effects + mean(year_effects) | |
| } | |
| fit_3a <- glm(patcw ~ Lc + I(Lc^2) + factor(sic2) + factor(year), family=quasipoisson(link="log"), data=abbgh) | |
| display(fit_3a) | |
| fit_4a <- glm.nb(patcw_rounded ~ Lc + I(Lc^2) + factor(sic2) + factor(year), data=abbgh) | |
| summary(fit_4a) | |
| fit_5a <- stan_glm(patcw_rounded ~ Lc + I(Lc^2) + factor(sic2) + factor(year), family=neg_binomial_2(link="log"), data=abbgh, refresh=0) | |
| print(fit_5a) | |
| fit_6a <- stan_glmer(patcw_rounded ~ Lc + I(Lc^2) + (1 | sic2) + (1 | year), family=neg_binomial_2(link="log"), data=abbgh, refresh=0) | |
| print(fit_6a) | |
| industries <- unique(abbgh$sic2) | |
| n_industries <- length(industries) | |
| avg_patcw <- rep(NA, n_industries) | |
| avg_Lc <- rep(NA, n_industries) | |
| count <- 0 | |
| for (j in industries) { | |
| count <- count + 1 | |
| cond <- abbgh$sic2==j | |
| avg_patcw[count] <- mean(abbgh$patcw[cond]) | |
| avg_Lc[count] <- mean(abbgh$Lc[cond]) | |
| } | |
| industries_sort <- rev(order(avg_patcw)) | |
| plot_grid <- function(fits, multilevel, logscale, colors, n_rows=4, n_cols=5, title){ | |
| par(mfrow=c(n_rows, n_cols), oma=c(0,0,3,0), mar=c(2.5,2,1,1), mgp=c(1.2,.2,0), tck=-.02) | |
| x <- abbgh$Lc | |
| y <- if (logscale) log(abbgh$patcw + 1) else abbgh$patcw | |
| y_label <- if (logscale) "log (Wt Patents + 1)" else "Weighted Patents" | |
| for (i in 1:n_industries) { | |
| j <- industries_sort[i] | |
| grid_left <- i%%n_cols == 1 | |
| grid_bottom <- i > n_industries - n_cols | |
| plot(x, y, xlab = if (grid_bottom) '"Competitiveness"' else "", ylab = if (grid_left) y_label else "", xlim=c(0.85, 1), pch=20, cex=.2, bty="l", col="gray", main=industry_names[j], cex.main=.8, cex.lab=.8, cex.axis=.8) | |
| for (k in 1:length(fits)) { | |
| add_curve(fits[[k]], multilevel=multilevel[k], logscale=logscale, | |
| shift=adjust(fits[[k]],multilevel=multilevel[k])[j], col=colors[k]) | |
| } | |
| cond <- abbgh$sic2==industries[j] | |
| points(x[cond], y[cond], pch=20, cex=.5) | |
| lines(x[cond], y[cond], lwd=.5) | |
| points(x[cond][1], y[cond][1], pch=20, cex=1.5) | |
| } | |
| mtext(title, side=3, line=1, outer=TRUE, cex=.8) | |
| } | |
| pdf("invertedu_2.pdf", height=5, width=7) | |
| plot_grid(list(fit_3a, fit_4a, fit_5a, fit_6a), multilevel=c(FALSE, FALSE, FALSE, TRUE), logscale=FALSE, colors=c("blue","pink","orange","purple"), title="Quadratic regressions on original scale, adjusting for industry and year") | |
| dev.off() | |
| abbgh$log_patcw <- log(abbgh$patcw + 1) | |
| pdf("invertedu_3.pdf", height=4, width=4) | |
| par(mfrow=c(1,1), oma=c(0,0,2,0), mar=c(3,3,1,1), mgp=c(1.7,.5,0), tck=-.01) | |
| plot(abbgh$Lc, abbgh$log_patcw, xlab='"Competitiveness"', ylab="log (Wt Patents + 1)", xlim=c(0.85, 1), pch=20, cex=.5, bty="l") | |
| fit_7 <- lm(log_patcw ~ Lc + I(Lc^2), data=abbgh) | |
| display(fit_7) | |
| add_curve(fit_7, logscale="TRUE", col="green") | |
| mtext("Quadratic regressions on log scale", side=3, line=0, outer=TRUE, cex=.9) | |
| dev.off() | |
| fit_8 <- lm(log_patcw ~ Lc + I(Lc^2) + factor(sic2) + factor(year), data=abbgh) | |
| summary(fit_8) | |
| fit_9 <- stan_glmer(log_patcw ~ Lc + I(Lc^2) + (1 | sic2) + (1 | year), data=abbgh, refresh=0) | |
| print(fit_9) | |
| pdf("invertedu_4.pdf", height=5, width=7) | |
| plot_grid(list(fit_8, fit_9), multilevel=c(FALSE, TRUE), logscale=TRUE, colors=c("orange","purple"), | |
| title="Quadratic regressions on log scale, adjusting for industry and year") | |
| dev.off() | |
| industries_in_order <- rep(NA, n_industries) | |
| for (j in 1:n_industries) { | |
| industries_in_order[abbgh$sic2==industries[j]] <- j | |
| } | |
| abbgh$avg_Lc <- avg_Lc[industries_in_order] | |
| years <- unique(abbgh$year) | |
| n_years <- length(years) | |
| years_in_order <- rep(NA, n_years) | |
| for (j in 1:n_years) { | |
| years_in_order[abbgh$year==years[j]] <- j | |
| } | |
| fit_10 <- stan_glmer(log_patcw ~ Lc + I(Lc^2) + avg_Lc + (1 | sic2) + (1 | year), data=abbgh, refresh=0) | |
| print(fit_10) | |
| logistic_hinge <- function(x, x0, a, b0, b1, delta) { | |
| return(a + b0*(x-x0) + (b1-b0)*delta * log(1 + exp((x-x0)/delta))) | |
| } | |
| plot_grid_hinge <- function(fit, n_rows=4, n_cols=5, title){ | |
| par(mfrow=c(n_rows, n_cols), oma=c(0,0,3,0), mar=c(2.5,2,1,1), mgp=c(1.2,.2,0), tck=-.02) | |
| x <- abbgh$Lc | |
| y <- log(abbgh$patcw + 1) | |
| sims <- fit$draws(format="df") | |
| x0_hat <- median(sims$x0) | |
| a_hat <- median(sims$a) | |
| b0_hat <- median(sims$b0) | |
| b1_hat <- median(sims$b1) | |
| n_sims <- nrow(sims) | |
| for (i in 1:n_industries) { | |
| j <- industries_sort[i] | |
| grid_left <- i%%n_cols == 1 | |
| grid_bottom <- i > n_industries - n_cols | |
| plot(x, y, xlab = if (grid_bottom) '"Competitiveness"' else "", ylab = if (grid_left) "log (Wt. Patents + 1)" else "", xlim=c(0.85, 1), pch=20, cex=.2, bty="l", col="gray", main=industry_names[j], cex.main=.8, cex.lab=.8, cex.axis=.8) | |
| a1_sims <- as.matrix(fit$draws("a1", format="df"))[,1:n_industries] | |
| a2_sims <- as.matrix(fit$draws("a2", format="df"))[,1:n_years] | |
| for (s in sample(n_sims, 20)) { | |
| curve(logistic_hinge(x, sims$x0[s], sims$a[s], sims$b0[s], sims$b1[s], delta_true) + a1_sims[s,j] + mean(a2_sims[s,]), col="red", lwd=0.5, add=TRUE) | |
| } | |
| a1_hat <- apply(a1_sims, 2, median) | |
| a2_hat <- apply(a1_sims, 2, median) | |
| curve(logistic_hinge(x, x0_hat, a_hat, b0_hat, b1_hat, delta_true) + a1_hat[j] + mean(a2_hat), col="blue", lwd=1.5, add=TRUE) | |
| cond <- abbgh$sic2==industries[j] | |
| points(x[cond], y[cond], pch=20, cex=.5) | |
| lines(x[cond], y[cond], lwd=.5) | |
| points(x[cond][1], y[cond][1], pch=20, cex=1.5) | |
| } | |
| mtext(title, side=3, line=1, outer=TRUE, cex=.8) | |
| } | |
| delta_true <- 0.05 | |
| stan_data <- list(delta=delta_true, N=nrow(abbgh), x=abbgh$Lc, y=log(abbgh$patcw + 1), J1=n_industries, J2=n_years, group1=industries_in_order, group2=years_in_order) | |
| fit_11 <- hinge_multilevel$sample(stan_data, parallel_chains=4, refresh=0) | |
| print(fit_11) | |
| sims_11 <- fit_11$draws(format="df") | |
| print(mean(sims_11$b1 < 0)) | |
| pdf("invertedu_5.pdf", height=5, width=7) | |
| plot_grid_hinge(fit_11, title="Hinge regression with uncertainty on log scale, adjusting for industry and year") | |
| dev.off() | |
| plot_grid_quadratic <- function(fit, n_rows=4, n_cols=5, title){ | |
| par(mfrow=c(n_rows, n_cols), oma=c(0,0,3,0), mar=c(2.5,2,1,1), mgp=c(1.2,.2,0), tck=-.02) | |
| x <- abbgh$Lc | |
| y <- log(abbgh$patcw + 1) | |
| sims <- fit$draws(format="df") | |
| b0_hat <- median(sims$b0) | |
| b1_hat <- median(sims$b1) | |
| b2_hat <- median(sims$b2) | |
| n_sims <- nrow(sims) | |
| for (i in 1:n_industries) { | |
| j <- industries_sort[i] | |
| grid_left <- i%%n_cols == 1 | |
| grid_bottom <- i > n_industries - n_cols | |
| plot(x, y, xlab = if (grid_bottom) '"Competitiveness"' else "", ylab = if (grid_left) "log (Wt. Patents + 1)" else "", xlim=c(0.85, 1), pch=20, cex=.2, bty="l", col="gray", main=industry_names[j], cex.main=.8, cex.lab=.8, cex.axis=.8) | |
| a1_sims <- as.matrix(fit$draws("a1", format="df"))[,1:n_industries] | |
| a2_sims <- as.matrix(fit$draws("a2", format="df"))[,1:n_years] | |
| for (s in sample(n_sims, 20)) { | |
| curve(sims$b0[s] + sims$b1[s]*x + sims$b2[s]*x^2 + a1_sims[s,j] + mean(a2_sims[s,]), col="red", lwd=0.5, add=TRUE) | |
| } | |
| a1_hat <- apply(a1_sims, 2, median) | |
| a2_hat <- apply(a1_sims, 2, median) | |
| curve(b0_hat + b1_hat*x + b2_hat*x^2 + a1_hat[j] + mean(a2_hat), col="blue", lwd=1.5, add=TRUE) | |
| cond <- abbgh$sic2==industries[j] | |
| points(x[cond], y[cond], pch=20, cex=.5) | |
| lines(x[cond], y[cond], lwd=.5) | |
| points(x[cond][1], y[cond][1], pch=20, cex=1.5) | |
| } | |
| mtext(title, side=3, line=1, outer=TRUE, cex=.8) | |
| } | |
| fit_12 <- quadratic_multilevel$sample(stan_data, parallel_chains=4, refresh=0) | |
| print(fit_12) | |
| sims_12 <- fit_12$draws(format="df") | |
| x_peak <- -sims_12$b1/(2*sims_12$b2) | |
| print(mean(x_peak > min(abbgh$Lc) & x_peak < max(abbgh$Lc))) | |
| pdf("invertedu_6.pdf", height=5, width=7) | |
| plot_grid_quadratic(fit_12, title="Quadratic regression with uncertainty on log scale, adjusting for industry and year") | |
| dev.off() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment