Created
December 19, 2025 11:28
-
-
Save explodecomputer/06806d6d7eb10674ff57b0779a4f906c to your computer and use it in GitHub Desktop.
Factoral MR survival power sim
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(ggplot2) | |
| library(dplyr) | |
| library(simsurv) | |
| library(boot) | |
| # Manually estimating power | |
| # Suppose I want to know the power of a linear regression model through simulation: | |
| set.seed(123) | |
| # Generate simulated data to match what I see in my real analysis | |
| # Here we have a variable x and y, both have variance of 1 | |
| # There is some sample size n | |
| # But I don't know the effect size of x on y | |
| # I will be varying the effect size (beta) to see how it affects power | |
| dgm <- function(n = 30, beta = 0.2) { | |
| x <- rnorm(n) | |
| # We want the variance of y to be 1, so we just add enough noise on top of x*beta | |
| y <- beta * x + rnorm(n, sd = sqrt(1 - beta^2)) | |
| tibble(x = x, y = y) | |
| } | |
| # Now that I've generated the data to mimic my real analysis, I need | |
| # to estimate the effect of x on y in the same way that I would in the real analysis | |
| estimation <- function(data) { | |
| model <- lm(y ~ x, data = data) | |
| summary(model)$coefficients[2, 4] # p-value for the slope | |
| } | |
| # Finally, I need to run this across lots of different potential values of | |
| # beta and n, to see the average power | |
| # Power depends on the significance threshold (alpha). | |
| power_simulation <- function(nsim = 1000, n = c(30, 50, 70), beta = c(0, 0.2, 0.4), alpha = 0.05) { | |
| params <- expand.grid(n = n, beta = beta, sim=1:nsim) | |
| for(i in 1:nrow(params)) { | |
| data <- dgm(n = params$n[i], beta = params$beta[i]) | |
| params$p_value[i] <- estimation(data) | |
| } | |
| # Count how many significant results we get for each combination of n and beta | |
| # Divide by nsim to get the proportion of significant results (i.e., power) | |
| params <- params %>% | |
| group_by(n, beta) %>% | |
| summarise(power = mean(p_value < alpha)) %>% | |
| ungroup() | |
| return(params) | |
| } | |
| # Try out the dgm function | |
| data <- dgm(n = 5000, beta = 0.3) | |
| # Try out the estimation function | |
| estimation(data) | |
| # Run the power simulation | |
| results <- power_simulation(nsim = 1000, n = c(30, 50, 70), beta = c(0, 0.2, 0.4, 0.6), alpha = 0.05) | |
| # plot the power results | |
| ggplot(results, aes(x = beta, y = power, color = factor(n))) + | |
| geom_line() + | |
| geom_point() + | |
| labs(title = "Power Simulation Results", | |
| x = "Effect Size (Beta)", | |
| y = "Power", | |
| color = "Sample Size (n)") | |
| # From this we can see the basic framework of a power simulation. | |
| # Now we have to adapt it for factorial MR in cancer survival. | |
| # That means the dgm function needs to generate | |
| # - genotype | |
| # - proteomic | |
| # - time to event | |
| # And the estimation function needs to run a factorial MR analysis on time to event | |
| # For the time to event part, we can simulate the data using the simsurv package. First generate the protein levels and their interaction | |
| n <- 10000 | |
| g1 <- rbinom(n, 2, 0.3) # genotype for protein 1 | |
| g2 <- rbinom(n, 2, 0.4) # genotype for protein 2 | |
| u <- rnorm(n) # unmeasured confounder | |
| pred1 <- 0.5 * g1 + 0.3 * u | |
| pred2 <- 0.5 * g2 + 0.3 * u | |
| covars <- tibble(protein1 = pred1 + rnorm(n, sd = sqrt(1 - var(pred1))), | |
| protein2 = pred2 + rnorm(n, sd = sqrt(1 - var(pred2))), | |
| interaction = protein1 * protein2) | |
| # we'll need the shape and scale parameters for the Weibull distribution - you can get these from fitting a Weibull model to real data. e.g. | |
| lambdas <- 1.5 | |
| gammas <- 0.01 | |
| # And now we select some hypothesised betas for the proteins and their interaction | |
| betas <- c(protein1 = 0.3, protein2 = 0.4, interaction = 0.2) | |
| # Finally we need a max time for the simulation. This will introduce right-censoring e.g. | |
| maxt <- 5 | |
| simdata <- simsurv(lambdas = lambdas, | |
| gammas = gammas, | |
| betas = betas, | |
| x = covars, | |
| maxt = maxt) | |
| head(simdata) | |
| # For estimation part, if this was just an observational analysis we could use the survival package to fit a Cox proportional hazards model. For example: | |
| library(survival) | |
| cox_model <- coxph(Surv(eventtime, status) ~ protein1 + protein2 + interaction, data = merge(covars, simdata, by = "row.names")) | |
| summary(cox_model) | |
| # But as this is a factorial MR, we will need to use the genetic predictors of protein1, protein2 and the interaction in a two-stage residual inclusion model (2SRI). | |
| # First stage: regress protein1 and protein2 on their respective genotypes to get predicted values | |
| # Second stage: fit a Cox model using the predicted values and their interaction | |
| d <- cbind(covars, simdata, g1, g2) | |
| head(d) | |
| m1 <- lm(protein1 ~ g1 + g2 + g1:g2, data = d) | |
| d$res1 <- residuals(m1) | |
| m2 <- lm(protein2 ~ g1 + g2 + g1:g2, data = d) | |
| d$res2 <- residuals(m2) | |
| head(d) | |
| fit_cox <- coxph(Surv(eventtime, status) ~ protein1 * protein2 + res1 + res2, data = d) | |
| summary(fit_cox) | |
| # Problem is that this doesn't give us correct standard errors, so the p-values will be wrong | |
| # Need to bootstrap the whole procedure to get a distribtion of betas | |
| # Then use the SD(betas) as the standard error to get the p-values | |
| iv_coxph <- function(data, ind=1:nrow(data)) { | |
| d <- data[ind, ] | |
| m1 <- lm(protein1 ~ g1 + g2 + g1:g2, data = d) | |
| d$res1 <- residuals(m1) | |
| m2 <- lm(protein2 ~ g1 + g2 + g1:g2, data = d) | |
| d$res2 <- residuals(m2) | |
| fit_cox <- coxph(Surv(eventtime, status) ~ protein1 * protein2 + res1 + res2, data = d) | |
| return(summary(fit_cox)$coefficients["protein1:protein2", 1]) # return the beta for the interaction term | |
| } | |
| iv_coxph(d) | |
| estimation <- function(data) { | |
| bootres <- boot(data = data, statistic = iv_coxph, R=100) | |
| sd_boot <- sd(bootres$t) | |
| pval <- 2 * (1 - pnorm(abs(bootres$t0 / sd_boot))) | |
| return(pval) | |
| } | |
| # So overall the power simulation framework will be similar to the linear regression example, but with a more complex dgm and estimation function. | |
| dgm <- function(n = 1000, freq1, freq2, rsq_g1p1, rsq_g2p2, beta_p1, beta_p2, beta_interaction, beta_u, lambdas, gammas, maxt) { | |
| g1 <- rbinom(n, 2, freq1) # genotype for protein 1 | |
| g2 <- rbinom(n, 2, freq2) # genotype for protein 2 | |
| u <- rnorm(n) # unmeasured confounder that induces correlation between protein 1 and 2 | |
| pred1 <- scale(g1) * rsq_g1p1 + beta_u * u | |
| pred2 <- scale(g2) * rsq_g2p2 + beta_u * u | |
| covars <- tibble(protein1 = pred1 + rnorm(n, sd = sqrt(1 - var(pred1))), | |
| protein2 = pred2 + rnorm(n, sd = sqrt(1 - var(pred2))), | |
| interaction = protein1 * protein2) | |
| betas <- c(protein1 = beta_p1, protein2 = beta_p2, interaction = beta_interaction) | |
| simdata <- simsurv(lambdas = lambdas, | |
| gammas = gammas, | |
| betas = betas, | |
| x = covars, | |
| maxt = maxt) | |
| d <- cbind(covars, simdata, g1, g2) %>% as_tibble() | |
| return(d) | |
| } | |
| d <- dgm(n = 1000, freq1 = 0.3, freq2 = 0.4, rsq_g1p1 = 0.1, rsq_g2p2 = 0.1, | |
| beta_p1 = 0.3, beta_p2 = 0.4, beta_interaction = 0.2, beta_u = 0.3, | |
| lambdas = 1.5, gammas = 0.01, maxt = 5) | |
| estimation(d) | |
| # And wrap this into the power simulation function as before |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment