Skip to content

Instantly share code, notes, and snippets.

@explodecomputer
Created December 19, 2025 11:28
Show Gist options
  • Select an option

  • Save explodecomputer/06806d6d7eb10674ff57b0779a4f906c to your computer and use it in GitHub Desktop.

Select an option

Save explodecomputer/06806d6d7eb10674ff57b0779a4f906c to your computer and use it in GitHub Desktop.
Factoral MR survival power sim
library(ggplot2)
library(dplyr)
library(simsurv)
library(boot)
# Manually estimating power
# Suppose I want to know the power of a linear regression model through simulation:
set.seed(123)
# Generate simulated data to match what I see in my real analysis
# Here we have a variable x and y, both have variance of 1
# There is some sample size n
# But I don't know the effect size of x on y
# I will be varying the effect size (beta) to see how it affects power
dgm <- function(n = 30, beta = 0.2) {
x <- rnorm(n)
# We want the variance of y to be 1, so we just add enough noise on top of x*beta
y <- beta * x + rnorm(n, sd = sqrt(1 - beta^2))
tibble(x = x, y = y)
}
# Now that I've generated the data to mimic my real analysis, I need
# to estimate the effect of x on y in the same way that I would in the real analysis
estimation <- function(data) {
model <- lm(y ~ x, data = data)
summary(model)$coefficients[2, 4] # p-value for the slope
}
# Finally, I need to run this across lots of different potential values of
# beta and n, to see the average power
# Power depends on the significance threshold (alpha).
power_simulation <- function(nsim = 1000, n = c(30, 50, 70), beta = c(0, 0.2, 0.4), alpha = 0.05) {
params <- expand.grid(n = n, beta = beta, sim=1:nsim)
for(i in 1:nrow(params)) {
data <- dgm(n = params$n[i], beta = params$beta[i])
params$p_value[i] <- estimation(data)
}
# Count how many significant results we get for each combination of n and beta
# Divide by nsim to get the proportion of significant results (i.e., power)
params <- params %>%
group_by(n, beta) %>%
summarise(power = mean(p_value < alpha)) %>%
ungroup()
return(params)
}
# Try out the dgm function
data <- dgm(n = 5000, beta = 0.3)
# Try out the estimation function
estimation(data)
# Run the power simulation
results <- power_simulation(nsim = 1000, n = c(30, 50, 70), beta = c(0, 0.2, 0.4, 0.6), alpha = 0.05)
# plot the power results
ggplot(results, aes(x = beta, y = power, color = factor(n))) +
geom_line() +
geom_point() +
labs(title = "Power Simulation Results",
x = "Effect Size (Beta)",
y = "Power",
color = "Sample Size (n)")
# From this we can see the basic framework of a power simulation.
# Now we have to adapt it for factorial MR in cancer survival.
# That means the dgm function needs to generate
# - genotype
# - proteomic
# - time to event
# And the estimation function needs to run a factorial MR analysis on time to event
# For the time to event part, we can simulate the data using the simsurv package. First generate the protein levels and their interaction
n <- 10000
g1 <- rbinom(n, 2, 0.3) # genotype for protein 1
g2 <- rbinom(n, 2, 0.4) # genotype for protein 2
u <- rnorm(n) # unmeasured confounder
pred1 <- 0.5 * g1 + 0.3 * u
pred2 <- 0.5 * g2 + 0.3 * u
covars <- tibble(protein1 = pred1 + rnorm(n, sd = sqrt(1 - var(pred1))),
protein2 = pred2 + rnorm(n, sd = sqrt(1 - var(pred2))),
interaction = protein1 * protein2)
# we'll need the shape and scale parameters for the Weibull distribution - you can get these from fitting a Weibull model to real data. e.g.
lambdas <- 1.5
gammas <- 0.01
# And now we select some hypothesised betas for the proteins and their interaction
betas <- c(protein1 = 0.3, protein2 = 0.4, interaction = 0.2)
# Finally we need a max time for the simulation. This will introduce right-censoring e.g.
maxt <- 5
simdata <- simsurv(lambdas = lambdas,
gammas = gammas,
betas = betas,
x = covars,
maxt = maxt)
head(simdata)
# For estimation part, if this was just an observational analysis we could use the survival package to fit a Cox proportional hazards model. For example:
library(survival)
cox_model <- coxph(Surv(eventtime, status) ~ protein1 + protein2 + interaction, data = merge(covars, simdata, by = "row.names"))
summary(cox_model)
# But as this is a factorial MR, we will need to use the genetic predictors of protein1, protein2 and the interaction in a two-stage residual inclusion model (2SRI).
# First stage: regress protein1 and protein2 on their respective genotypes to get predicted values
# Second stage: fit a Cox model using the predicted values and their interaction
d <- cbind(covars, simdata, g1, g2)
head(d)
m1 <- lm(protein1 ~ g1 + g2 + g1:g2, data = d)
d$res1 <- residuals(m1)
m2 <- lm(protein2 ~ g1 + g2 + g1:g2, data = d)
d$res2 <- residuals(m2)
head(d)
fit_cox <- coxph(Surv(eventtime, status) ~ protein1 * protein2 + res1 + res2, data = d)
summary(fit_cox)
# Problem is that this doesn't give us correct standard errors, so the p-values will be wrong
# Need to bootstrap the whole procedure to get a distribtion of betas
# Then use the SD(betas) as the standard error to get the p-values
iv_coxph <- function(data, ind=1:nrow(data)) {
d <- data[ind, ]
m1 <- lm(protein1 ~ g1 + g2 + g1:g2, data = d)
d$res1 <- residuals(m1)
m2 <- lm(protein2 ~ g1 + g2 + g1:g2, data = d)
d$res2 <- residuals(m2)
fit_cox <- coxph(Surv(eventtime, status) ~ protein1 * protein2 + res1 + res2, data = d)
return(summary(fit_cox)$coefficients["protein1:protein2", 1]) # return the beta for the interaction term
}
iv_coxph(d)
estimation <- function(data) {
bootres <- boot(data = data, statistic = iv_coxph, R=100)
sd_boot <- sd(bootres$t)
pval <- 2 * (1 - pnorm(abs(bootres$t0 / sd_boot)))
return(pval)
}
# So overall the power simulation framework will be similar to the linear regression example, but with a more complex dgm and estimation function.
dgm <- function(n = 1000, freq1, freq2, rsq_g1p1, rsq_g2p2, beta_p1, beta_p2, beta_interaction, beta_u, lambdas, gammas, maxt) {
g1 <- rbinom(n, 2, freq1) # genotype for protein 1
g2 <- rbinom(n, 2, freq2) # genotype for protein 2
u <- rnorm(n) # unmeasured confounder that induces correlation between protein 1 and 2
pred1 <- scale(g1) * rsq_g1p1 + beta_u * u
pred2 <- scale(g2) * rsq_g2p2 + beta_u * u
covars <- tibble(protein1 = pred1 + rnorm(n, sd = sqrt(1 - var(pred1))),
protein2 = pred2 + rnorm(n, sd = sqrt(1 - var(pred2))),
interaction = protein1 * protein2)
betas <- c(protein1 = beta_p1, protein2 = beta_p2, interaction = beta_interaction)
simdata <- simsurv(lambdas = lambdas,
gammas = gammas,
betas = betas,
x = covars,
maxt = maxt)
d <- cbind(covars, simdata, g1, g2) %>% as_tibble()
return(d)
}
d <- dgm(n = 1000, freq1 = 0.3, freq2 = 0.4, rsq_g1p1 = 0.1, rsq_g2p2 = 0.1,
beta_p1 = 0.3, beta_p2 = 0.4, beta_interaction = 0.2, beta_u = 0.3,
lambdas = 1.5, gammas = 0.01, maxt = 5)
estimation(d)
# And wrap this into the power simulation function as before
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment