Skip to content

Instantly share code, notes, and snippets.

@SermetPekin
Last active October 23, 2025 05:40
Show Gist options
  • Select an option

  • Save SermetPekin/5814725112e09df1388e3bc2851de379 to your computer and use it in GitHub Desktop.

Select an option

Save SermetPekin/5814725112e09df1388e3bc2851de379 to your computer and use it in GitHub Desktop.
Fully reproducible R and Stan script for Andrew Gelman’s “Reanalysis of that Nobel prize–winning study of patents and innovation”
# Reproduction of Andrew Gelman's blog post:
# "Reanalysis of that Nobel prizewinning study of patents and innovation (with R and Stan code)"
# https://statmodeling.stat.columbia.edu/2025/10/21/reanalysis-of-that-nobel-prizewinning-study-of-patents-and-innovation/
# This script installs packages, downloads data, compiles Stan models, and reproduces plots in one run.
# ---- install-and-load ------------------------------------------------------
# Packages you need
pkgs <- c("haven", "arm", "rstanarm", "MASS", "cmdstanr")
# Ensure pak is available (fast, respects binaries)
if (!requireNamespace("pak", quietly = TRUE)) {
install.packages("pak")
}
# Install any missing packages
missing <- setdiff(pkgs, rownames(installed.packages()))
if (length(missing)) {
pak::pak(missing)
}
# Load packages quietly (stop with a clear message if something fails)
for (p in pkgs) {
suppressPackageStartupMessages(
library(p, character.only = TRUE)
)
}
# OPTIONAL: One-time CmdStan toolchain install (comment out if not desired)
if (isTRUE(requireNamespace("cmdstanr", quietly = TRUE))) {
cmdstan_path <- tryCatch(cmdstanr::cmdstan_path(), error = function(e) NA)
if (is.na(cmdstan_path) || !dir.exists(cmdstan_path)) {
message("Installing CmdStan (one-time build)…")
cmdstanr::install_cmdstan() # may take several minutes the first time
}
}
hinge_stan <- "
functions {
vector logistic_hinge(vector x, real x0, real a, real b0, real b1, real delta) {
vector[size(x)] xdiff = x - x0;
return a + b0 * xdiff + (b1 - b0) * delta * log1p_exp(xdiff / delta);
}
}
data {
real<lower=0> delta;
int N;
vector[N] x, y;
int J1, J2;
array[N] int<lower=1,upper=J1> group1;
array[N] int<lower=1,upper=J2> group2;
}
parameters {
real x0;
real a, b0, b1;
real<lower=0> sigma, sigma1, sigma2;
vector<offset=0, multiplier=sigma1>[J1] a1;
vector<offset=0, multiplier=sigma2>[J2] a2;
}
model {
x0 ~ normal(1, 1);
a ~ normal(0, 100);
b0 ~ normal(0, 100);
b1 ~ normal(0, 100);
a1 ~ normal(0, sigma1);
a2 ~ normal(0, sigma2);
y ~ normal(logistic_hinge(x, x0, a, b0, b1, delta) + a1[group1] + a2[group2], sigma);
}
"
quadratic_stan <- "
data {
int N;
vector[N] x, y;
int J1, J2;
array[N] int<lower=1,upper=J1> group1;
array[N] int<lower=1,upper=J2> group2;
}
parameters {
real b0, b1, b2;
real<lower=0> sigma, sigma1, sigma2;
vector<offset=0, multiplier=sigma1>[J1] a1;
vector<offset=0, multiplier=sigma2>[J2] a2;
}
model {
b0 ~ normal(0, 100);
b1 ~ normal(0, 100);
b2 ~ normal(0, 100);
a1 ~ normal(0, sigma1);
a2 ~ normal(0, sigma2);
y ~ normal(b0 + b1*x + b2*x^2 + a1[group1] + a2[group2], sigma);
}
"
# ---- 1) Helper: write a .stan file from a string, idempotently ------------
write_stan <- function(path, code) {
dir.create(dirname(path), recursive = TRUE, showWarnings = FALSE)
if (!file.exists(path) || !identical(readLines(path, warn = FALSE), strsplit(code, "\n", fixed = TRUE)[[1]])) {
writeLines(code, path, useBytes = TRUE)
}
normalizePath(path, winslash = "/", mustWork = TRUE)
}
stan_dir <- "stan"
hinge_file <- write_stan(file.path(stan_dir, "hinge_multilevel.stan"), hinge_stan)
quad_file <- write_stan(file.path(stan_dir, "quadratic_multilevel.stan"), quadratic_stan)
hinge_multilevel <- cmdstan_model(hinge_file)
quadratic_multilevel <- cmdstan_model(quad_file)
set.seed(123)
# ---- Data download & read ----
url <- "https://sites.stat.columbia.edu/gelman/stuff_for_blog/abbgh_data.dta"
local_path <- "data/abbgh_data.dta"
# Create data folder if missing
if (!dir.exists("data")) dir.create("data", recursive = TRUE)
# Download only if not already saved
if (!file.exists(local_path)) {
download.file(url, destfile = local_path, mode = "wb")
}
# Read
abbgh <- haven::read_dta(local_path)
abbgh$patcw_rounded <- round(abbgh$patcw)
industry_names <- c("Textiles", "Apparel", "Lumber", "Furniture",
"Leather", "Stone, glass, concrete", "Primary metal", "Fabricated medal", "Machinery and computers", "Electric, not computers", "Transport equipment", "Passenger transit", "Freight transport", "Postal service", "Transport services", "Communications", "Electric, gas services")
add_curve <- function(fit, multilevel=FALSE, logscale=FALSE, shift=0, ...) {
if (multilevel) {
beta_hat <- fixef(fit)[1:3]
}
else {
beta_hat <- coef(fit)[1:3]
}
if (logscale)
curve(shift + beta_hat[1] + beta_hat[2]*x + beta_hat[3]*x^2, ..., add=TRUE)
else
curve(exp(shift + beta_hat[1] + beta_hat[2]*x + beta_hat[3]*x^2), ..., add=TRUE)
}
pdf("invertedu_1.pdf", height=4, width=4)
par(mfrow=c(1,1), oma=c(0,0,2,0), mar=c(3,3,1,1), mgp=c(1.7,.5,0), tck=-.01)
plot(abbgh$Lc, abbgh$patcw, xlab='"Competitiveness"', ylab="Weighted Patents", xlim=c(0.85, 1), pch=20, cex=.5, bty="l")
fit_1 <- glm(patcw_rounded ~ Lc + I(Lc^2), family=poisson(link="log"), data=abbgh)
display(fit_1)
add_curve(fit_1, col="green")
fit_2 <- glm(patcw_rounded ~ Lc + I(Lc^2), family=quasipoisson(link="log"), data=abbgh)
display(fit_2)
add_curve(fit_2, col="red")
fit_3 <- glm(patcw ~ Lc + I(Lc^2), family=quasipoisson(link="log"), data=abbgh)
display(fit_3)
add_curve(fit_3, col="blue")
fit_4 <- glm.nb(patcw_rounded ~ Lc + I(Lc^2), data=abbgh)
summary(fit_4)
add_curve(fit_4, col="pink")
fit_5 <- stan_glm(patcw_rounded ~ Lc + I(Lc^2), family=neg_binomial_2(link="log"), data=abbgh, refresh=0)
print(fit_5)
add_curve(fit_5, col="orange")
mtext("Quadratic regressions on original scale", side=3, line=0, outer=TRUE, cex=.9)
dev.off()
# The orange curve looks different from the others.
# Check by re-running the Bayesian negative binomial regression on optimize setting
fit_5_opt <- stan_glm(patcw_rounded ~ Lc + I(Lc^2), family=neg_binomial_2(link="log"), data=abbgh, algorithm="optimizing")
print(fit_5_opt)
adjust <- function(fit, data=abbgh, multilevel=FALSE) {
if (multilevel) {
sic2_effects <- ranef(fit)$sic2[,1]
year_effects <- ranef(fit)$year[,1]
}
else {
sic2_effects <- c(0, coef(fit)[grep("sic2", names(coef(fit)))])
year_effects <- c(0, coef(fit)[grep("year", names(coef(fit)))])
}
sic2_effects + mean(year_effects)
}
fit_3a <- glm(patcw ~ Lc + I(Lc^2) + factor(sic2) + factor(year), family=quasipoisson(link="log"), data=abbgh)
display(fit_3a)
fit_4a <- glm.nb(patcw_rounded ~ Lc + I(Lc^2) + factor(sic2) + factor(year), data=abbgh)
summary(fit_4a)
fit_5a <- stan_glm(patcw_rounded ~ Lc + I(Lc^2) + factor(sic2) + factor(year), family=neg_binomial_2(link="log"), data=abbgh, refresh=0)
print(fit_5a)
fit_6a <- stan_glmer(patcw_rounded ~ Lc + I(Lc^2) + (1 | sic2) + (1 | year), family=neg_binomial_2(link="log"), data=abbgh, refresh=0)
print(fit_6a)
industries <- unique(abbgh$sic2)
n_industries <- length(industries)
avg_patcw <- rep(NA, n_industries)
avg_Lc <- rep(NA, n_industries)
count <- 0
for (j in industries) {
count <- count + 1
cond <- abbgh$sic2==j
avg_patcw[count] <- mean(abbgh$patcw[cond])
avg_Lc[count] <- mean(abbgh$Lc[cond])
}
industries_sort <- rev(order(avg_patcw))
plot_grid <- function(fits, multilevel, logscale, colors, n_rows=4, n_cols=5, title){
par(mfrow=c(n_rows, n_cols), oma=c(0,0,3,0), mar=c(2.5,2,1,1), mgp=c(1.2,.2,0), tck=-.02)
x <- abbgh$Lc
y <- if (logscale) log(abbgh$patcw + 1) else abbgh$patcw
y_label <- if (logscale) "log (Wt Patents + 1)" else "Weighted Patents"
for (i in 1:n_industries) {
j <- industries_sort[i]
grid_left <- i%%n_cols == 1
grid_bottom <- i > n_industries - n_cols
plot(x, y, xlab = if (grid_bottom) '"Competitiveness"' else "", ylab = if (grid_left) y_label else "", xlim=c(0.85, 1), pch=20, cex=.2, bty="l", col="gray", main=industry_names[j], cex.main=.8, cex.lab=.8, cex.axis=.8)
for (k in 1:length(fits)) {
add_curve(fits[[k]], multilevel=multilevel[k], logscale=logscale,
shift=adjust(fits[[k]],multilevel=multilevel[k])[j], col=colors[k])
}
cond <- abbgh$sic2==industries[j]
points(x[cond], y[cond], pch=20, cex=.5)
lines(x[cond], y[cond], lwd=.5)
points(x[cond][1], y[cond][1], pch=20, cex=1.5)
}
mtext(title, side=3, line=1, outer=TRUE, cex=.8)
}
pdf("invertedu_2.pdf", height=5, width=7)
plot_grid(list(fit_3a, fit_4a, fit_5a, fit_6a), multilevel=c(FALSE, FALSE, FALSE, TRUE), logscale=FALSE, colors=c("blue","pink","orange","purple"), title="Quadratic regressions on original scale, adjusting for industry and year")
dev.off()
abbgh$log_patcw <- log(abbgh$patcw + 1)
pdf("invertedu_3.pdf", height=4, width=4)
par(mfrow=c(1,1), oma=c(0,0,2,0), mar=c(3,3,1,1), mgp=c(1.7,.5,0), tck=-.01)
plot(abbgh$Lc, abbgh$log_patcw, xlab='"Competitiveness"', ylab="log (Wt Patents + 1)", xlim=c(0.85, 1), pch=20, cex=.5, bty="l")
fit_7 <- lm(log_patcw ~ Lc + I(Lc^2), data=abbgh)
display(fit_7)
add_curve(fit_7, logscale="TRUE", col="green")
mtext("Quadratic regressions on log scale", side=3, line=0, outer=TRUE, cex=.9)
dev.off()
fit_8 <- lm(log_patcw ~ Lc + I(Lc^2) + factor(sic2) + factor(year), data=abbgh)
summary(fit_8)
fit_9 <- stan_glmer(log_patcw ~ Lc + I(Lc^2) + (1 | sic2) + (1 | year), data=abbgh, refresh=0)
print(fit_9)
pdf("invertedu_4.pdf", height=5, width=7)
plot_grid(list(fit_8, fit_9), multilevel=c(FALSE, TRUE), logscale=TRUE, colors=c("orange","purple"),
title="Quadratic regressions on log scale, adjusting for industry and year")
dev.off()
industries_in_order <- rep(NA, n_industries)
for (j in 1:n_industries) {
industries_in_order[abbgh$sic2==industries[j]] <- j
}
abbgh$avg_Lc <- avg_Lc[industries_in_order]
years <- unique(abbgh$year)
n_years <- length(years)
years_in_order <- rep(NA, n_years)
for (j in 1:n_years) {
years_in_order[abbgh$year==years[j]] <- j
}
fit_10 <- stan_glmer(log_patcw ~ Lc + I(Lc^2) + avg_Lc + (1 | sic2) + (1 | year), data=abbgh, refresh=0)
print(fit_10)
logistic_hinge <- function(x, x0, a, b0, b1, delta) {
return(a + b0*(x-x0) + (b1-b0)*delta * log(1 + exp((x-x0)/delta)))
}
plot_grid_hinge <- function(fit, n_rows=4, n_cols=5, title){
par(mfrow=c(n_rows, n_cols), oma=c(0,0,3,0), mar=c(2.5,2,1,1), mgp=c(1.2,.2,0), tck=-.02)
x <- abbgh$Lc
y <- log(abbgh$patcw + 1)
sims <- fit$draws(format="df")
x0_hat <- median(sims$x0)
a_hat <- median(sims$a)
b0_hat <- median(sims$b0)
b1_hat <- median(sims$b1)
n_sims <- nrow(sims)
for (i in 1:n_industries) {
j <- industries_sort[i]
grid_left <- i%%n_cols == 1
grid_bottom <- i > n_industries - n_cols
plot(x, y, xlab = if (grid_bottom) '"Competitiveness"' else "", ylab = if (grid_left) "log (Wt. Patents + 1)" else "", xlim=c(0.85, 1), pch=20, cex=.2, bty="l", col="gray", main=industry_names[j], cex.main=.8, cex.lab=.8, cex.axis=.8)
a1_sims <- as.matrix(fit$draws("a1", format="df"))[,1:n_industries]
a2_sims <- as.matrix(fit$draws("a2", format="df"))[,1:n_years]
for (s in sample(n_sims, 20)) {
curve(logistic_hinge(x, sims$x0[s], sims$a[s], sims$b0[s], sims$b1[s], delta_true) + a1_sims[s,j] + mean(a2_sims[s,]), col="red", lwd=0.5, add=TRUE)
}
a1_hat <- apply(a1_sims, 2, median)
a2_hat <- apply(a1_sims, 2, median)
curve(logistic_hinge(x, x0_hat, a_hat, b0_hat, b1_hat, delta_true) + a1_hat[j] + mean(a2_hat), col="blue", lwd=1.5, add=TRUE)
cond <- abbgh$sic2==industries[j]
points(x[cond], y[cond], pch=20, cex=.5)
lines(x[cond], y[cond], lwd=.5)
points(x[cond][1], y[cond][1], pch=20, cex=1.5)
}
mtext(title, side=3, line=1, outer=TRUE, cex=.8)
}
delta_true <- 0.05
stan_data <- list(delta=delta_true, N=nrow(abbgh), x=abbgh$Lc, y=log(abbgh$patcw + 1), J1=n_industries, J2=n_years, group1=industries_in_order, group2=years_in_order)
fit_11 <- hinge_multilevel$sample(stan_data, parallel_chains=4, refresh=0)
print(fit_11)
sims_11 <- fit_11$draws(format="df")
print(mean(sims_11$b1 < 0))
pdf("invertedu_5.pdf", height=5, width=7)
plot_grid_hinge(fit_11, title="Hinge regression with uncertainty on log scale, adjusting for industry and year")
dev.off()
plot_grid_quadratic <- function(fit, n_rows=4, n_cols=5, title){
par(mfrow=c(n_rows, n_cols), oma=c(0,0,3,0), mar=c(2.5,2,1,1), mgp=c(1.2,.2,0), tck=-.02)
x <- abbgh$Lc
y <- log(abbgh$patcw + 1)
sims <- fit$draws(format="df")
b0_hat <- median(sims$b0)
b1_hat <- median(sims$b1)
b2_hat <- median(sims$b2)
n_sims <- nrow(sims)
for (i in 1:n_industries) {
j <- industries_sort[i]
grid_left <- i%%n_cols == 1
grid_bottom <- i > n_industries - n_cols
plot(x, y, xlab = if (grid_bottom) '"Competitiveness"' else "", ylab = if (grid_left) "log (Wt. Patents + 1)" else "", xlim=c(0.85, 1), pch=20, cex=.2, bty="l", col="gray", main=industry_names[j], cex.main=.8, cex.lab=.8, cex.axis=.8)
a1_sims <- as.matrix(fit$draws("a1", format="df"))[,1:n_industries]
a2_sims <- as.matrix(fit$draws("a2", format="df"))[,1:n_years]
for (s in sample(n_sims, 20)) {
curve(sims$b0[s] + sims$b1[s]*x + sims$b2[s]*x^2 + a1_sims[s,j] + mean(a2_sims[s,]), col="red", lwd=0.5, add=TRUE)
}
a1_hat <- apply(a1_sims, 2, median)
a2_hat <- apply(a1_sims, 2, median)
curve(b0_hat + b1_hat*x + b2_hat*x^2 + a1_hat[j] + mean(a2_hat), col="blue", lwd=1.5, add=TRUE)
cond <- abbgh$sic2==industries[j]
points(x[cond], y[cond], pch=20, cex=.5)
lines(x[cond], y[cond], lwd=.5)
points(x[cond][1], y[cond][1], pch=20, cex=1.5)
}
mtext(title, side=3, line=1, outer=TRUE, cex=.8)
}
fit_12 <- quadratic_multilevel$sample(stan_data, parallel_chains=4, refresh=0)
print(fit_12)
sims_12 <- fit_12$draws(format="df")
x_peak <- -sims_12$b1/(2*sims_12$b2)
print(mean(x_peak > min(abbgh$Lc) & x_peak < max(abbgh$Lc)))
pdf("invertedu_6.pdf", height=5, width=7)
plot_grid_quadratic(fit_12, title="Quadratic regression with uncertainty on log scale, adjusting for industry and year")
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment