diff --git a/.Rbuildignore b/.Rbuildignore index 91114bf..c988fde 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,2 +1,7 @@ ^.*\.Rproj$ ^\.Rproj\.user$ +^_pkgdown\.yml$ +^docs$ +^\.github$ +^pkgdown$ +^README\.Rmd$ diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml new file mode 100644 index 0000000..5a774a2 --- /dev/null +++ b/.github/workflows/R-CMD-check.yaml @@ -0,0 +1,51 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + pull_request: + +name: R-CMD-check.yaml + +permissions: read-all + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: macos-latest, r: 'release'} + - {os: windows-latest, r: 'release'} + - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} + - {os: ubuntu-latest, r: 'release'} + - {os: ubuntu-latest, r: 'oldrel-1'} + + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + R_KEEP_PKG_SOURCE: yes + + steps: + - uses: actions/checkout@v4 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + http-user-agent: ${{ matrix.config.http-user-agent }} + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck + needs: check + + - uses: r-lib/actions/check-r-package@v2 + with: + upload-snapshots: true + build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml new file mode 100644 index 0000000..2fde25e --- /dev/null +++ b/.github/workflows/pkgdown.yaml @@ -0,0 +1,49 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + pull_request: + release: + types: [published] + workflow_dispatch: + +name: pkgdown.yaml + +permissions: read-all + +jobs: + pkgdown: + runs-on: ubuntu-latest + # Only restrict concurrency for non-PR jobs + concurrency: + group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + permissions: + contents: write + steps: + - uses: actions/checkout@v4 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::pkgdown, local::. + needs: website + + - name: Build site + run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) + shell: Rscript {0} + + - name: Deploy to GitHub pages + if: github.event_name != 'pull_request' + uses: JamesIves/github-pages-deploy-action@v4.5.0 + with: + clean: false + branch: gh-pages + folder: docs diff --git a/.gitignore b/.gitignore index 5b6a065..d0af0bb 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,6 @@ .Rhistory .RData .Ruserdata +docs/ +inst/doc +.Renviron diff --git a/DESCRIPTION b/DESCRIPTION index 1430e1a..cf0119a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -13,6 +13,9 @@ Authors@R: c( Depends: R (>= 3.5.0) Description: This is an R package to generate data sets for benchmarking methods on heterogenous treatment effects. +URL: https://openpharma.github.io/benchtm, + https://github.com/openpharma/benchtm +BugReports: https://github.com/openpharma/benchtm/issues License: MIT + file LICENSE Encoding: UTF-8 LazyData: true diff --git a/README.Rmd b/README.Rmd index ca80482..8c65217 100644 --- a/README.Rmd +++ b/README.Rmd @@ -23,7 +23,7 @@ knitr::opts_chunk$set(
{benchtm} aims to generate simulated data sets of heterogeneous treatment effects in clinical trials for benchmarking quantitaive methods. {benchtm} aims to generate simulated data sets of heterogeneous treatment effects in clinical trials for benchmarking quantitative methods.
@@ -48,14 +48,10 @@ devtools::install_github("openpharma/benchtm")
```
## How to use the package
-An .html vignette with introduction to package can be previewed [here](https://raw.githack.com/openpharma/benchtm/master/docs/articles/index.html).
+An .html vignette with introduction to package can be previewed [here](https://openpharma.github.io/benchtm/articles/index.html).
## Contact us
- For any questions, please contact: [Sophie Sun](mailto:sophie.sun@novartis.com)
-
-
-
-
diff --git a/README.md b/README.md
index 81388c2..d3dc953 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@
{benchtm} aims to generate simulated data sets of
heterogeneous treatment effects in clinical trials for benchmarking
-quantitaive methods.
+quantitative methods.
YEAR: 2020 -COPYRIGHT HOLDER: Ardalan Mirshani -- -
Copyright (c) 2020
-Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-a01_User_guidance.RmdR package “benchtm” can be used to generate data for the comparison -of subgroup identification algorithms. To start with, we can directly -load the library using:
- -Consider a two-arm study where trt = 0 represents control arm and trt -= 1 represents treatment arm, the clinical outcome/response \(Y\) is generated from
-\[\begin{equation} -\mathbb{E} (Y|X) = g(f(X)) = g(f_{prog}(X) + trt*(\beta_0 + \beta_1 -f_{pred}(X))), -\end{equation}\]
-where \(g(\cdot)\) is the link
-function and \(f_{prog}(X)\) is the
-functional form of the prognostic variables and \(f_{pred}(X)\) is the functional form of the
-predictive variables; \(\beta_0\) and
-\(\beta_1\) are the coefficients of
-main treatment effect and the predictive effects respectively. From the
-data generation formula one can see that, the larger the \(\beta_0\), the more power there is to
-detect the overall treatment effect. The parameter \(\beta_1\) determines the treatment effect
-variability (increasing if \(|\beta_1|\) gets larger). Larger \(|\beta_1|\) implies that it is more likely
-to detect the subgroup. \(\beta_0\) and
-\(\beta_1\) together determine the
-overall treatment effect. The function generate_y can be
-used to generate the outcome values \(y\).
To generate the binary treatment variable, we consider \[P(trt = 1|X) = \frac{1}{1+\exp(-h(X))}\]
-where \(h(X)\) is a user specified
-form. When \(h(X) = 0\), \(P(trt = 1|X) = 0.5\) represents a setting
-for complete randomized design (have a look at the function
-generate_trt for details).
Depending on different types of responses, the link function \(g(\cdot)\) and how the clinical endpoints -are generated are given as:
-In this section we provide an example of using “benchtm” package to -generate a dataset for subgroup identification problem.
-For the covariates, one can either utilize a user-specified dataset
-or generate them from the function generate_X_dist
-(generate covariates from a pre-specified distribution) or
-generate_X_syn (generate data from synthetic data from a
-specific real trial). See each corresponding function for more
-details.
-set.seed(1111)
-X <- generate_X_dist(n=1000, p=10, rho=0.5)
-#observational data set
-trt <- generate_trt(n=nrow(X), p_trt = 0.5)
-dat <- generate_y(X, trt, prog = "0.5*((X1=='Y')+X3)",
- pred = "X3>0", b0 = 0, b1 = 1,
- type = "continuous", sigma_error = 3)
-
-head(dat)
-#> trt X1 X2 X3 X4 X5 X6 X7 X8
-#> 1 0 N 0.03125259 0.09421509 0.51514432 N M2 -0.98045591 0.1306782
-#> 2 1 N 0.83737617 -1.09363394 -0.55371671 N M3 1.99750426 0.2235778
-#> 3 0 Y 1.18467412 2.28562630 0.13729403 Y M1 1.59946889 -1.0619646
-#> 4 0 N 1.87707398 0.93545962 -0.52007261 N M2 0.12848242 0.8768221
-#> 5 1 Y 0.59686419 0.16525676 0.20060728 N M2 -0.08869092 0.9582106
-#> 6 1 Y 2.00749871 -0.08007514 0.08383461 Y M3 -0.75252169 1.6078133
-#> X9 X10 Y
-#> 1 -2.71313108 0.6942494 1.1067937
-#> 2 0.87700756 1.0092612 -2.6188408
-#> 3 -0.07447738 -0.1410665 3.6697095
-#> 4 -0.11605592 0.4896499 -0.8245078
-#> 5 -0.97397058 0.1617459 2.4050540
-#> 6 -0.30708657 0.1690040 4.9389464The treatment is generated based on randomized trial where \(P(trt = 1) = 0.5\). We could fit a -conditional tree using this generated data.
-
-cov.names <- dat %>% dplyr::select(starts_with("X")) %>% colnames()
-eqn <- paste0("Y ~ trt| ",
- paste0(cov.names, collapse = "+"))
-
-glmtr <- partykit::glmtree(as.formula(eqn), data = dat, family = "gaussian",
- minsize = 0.2*nrow(dat))
-plot(glmtr)
Based on the results, one can see that \(X_3\) is selected as the split variable. -The subgroup with \(X_3>0.061\) has -the largest treatment effect.
-a02_Examples_Real_Data.RmdIn this section we demonstrate how to use “benchtm” package for -reproducible data analysis using real clinical data. The package can -generate outcomes based on user-specific covariates. This would allow -users to perform simulation study using real data as inputs and validate -existing findings. For example it allows to simulate data for the -scenario under no treatment effect heterogeneity (no treatment by -covariate interaction). The simulated data can then be analysed in the -same way as the original data. This will allow to put potential finding -on the original data into context (e.g. how consistent are analysis -results with the hypothesis of no treatment by covariate -interactions).
-As demonstration, we will use data from Prostate -cancer patients. The treatment consisted of a placebo group and -three dose levels of diethyl stilbestrol. The placebo and the lowest -dose level of diethyl stilbestrol were combined to give the control arm, -and the higher doses were combined to give an active treatment arm. Of -506 patients randomized, only 475 with complete data are available on -the dataset. Based on the analysis provided in Exploratory -subgroup analysis in clinical trials by model selection, we only -consider Bone metastasis (BM), History of cardiovascular disease (HX), -and Age (AGE > 65) as covariates. The endpoint is survival time -(SURVTIME) with censoring (CENS = 1:death; CENS = 0 censor). Treatment -is recoded as RX.
-
-library(benchtm)
-library(dplyr)
-library(survival)
-library(haven)
-
-here::i_am("./vignettes/a02_Examples_Real_Data.Rmd")
-
-## BM:Bone metastasis,
-## HX:History of cardiovascular disease
-## AGE:Age, years based on model;
-## RX:treatment
-## SURVTIME: survival time,
-## CENS: 0 for censor and 1 for death
-## reference https://onlinelibrary.wiley.com/doi/10.1002/bimj.201500147
-dat <- read_sas(here::here("./vignettes/adv_prostate_ca.sas7bdat")) %>%
-  ## the SAS data is provided in https://onlinelibrary.wiley.com/action/downloadSupplement?doi=10.1002%2Fbimj.201500147&file=bimj1691-sup-0001-CodeData.zip
- select(BM, HX, AGE, SURVTIME, CENS, RX) %>%
- mutate(AGE_cat = 1*(AGE > 65))
-head(dat)
-#> # A tibble: 6 × 7
-#> BM HX AGE SURVTIME CENS RX AGE_cat
-#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-#> 1 0 0 75 72.5 0 0 1
-#> 2 0 1 69 40.5 1 1 1
-#> 3 0 1 75 20.5 1 0 1
-#> 4 0 0 67 65.5 0 0 1
-#> 5 0 0 71 24.5 1 0 1
-#> 6 0 0 75 46.5 1 0 1A survival model is fitted assuming exponential distribution. Model -with and without interactions are conducted respectively.
-
-## fit main model (just main/prognostic effects)
-model.main = survreg(Surv(SURVTIME, CENS) ~ RX + BM + HX + AGE_cat, dist = "exponential",
- data = dat)
-summary(model.main)
-#>
-#> Call:
-#> survreg(formula = Surv(SURVTIME, CENS) ~ RX + BM + HX + AGE_cat,
-#> data = dat, dist = "exponential")
-#> Value Std. Error z p
-#> (Intercept) 4.376 0.161 27.22 < 2e-16
-#> RX 0.193 0.112 1.73 0.084
-#> BM -0.719 0.140 -5.15 2.7e-07
-#> HX -0.465 0.110 -4.21 2.5e-05
-#> AGE_cat -0.269 0.152 -1.76 0.078
-#>
-#> Scale fixed at 1
-#>
-#> Exponential distribution
-#> Loglik(model)= -1644.3 Loglik(intercept only)= -1667
-#> Chisq= 45.35 on 4 degrees of freedom, p= 3.4e-09
-#> Number of Newton-Raphson Iterations: 4
-#> n= 475
-
-## model with interactions
-model.inter = survreg(Surv(SURVTIME, CENS) ~ RX * (BM + HX + AGE_cat),
- dist = "exponential",
- data = dat)
-summary(model.inter)
-#>
-#> Call:
-#> survreg(formula = Surv(SURVTIME, CENS) ~ RX * (BM + HX + AGE_cat),
-#> data = dat, dist = "exponential")
-#> Value Std. Error z p
-#> (Intercept) 4.030 0.189 21.29 < 2e-16
-#> RX 0.897 0.309 2.90 0.0037
-#> BM -1.083 0.199 -5.44 5.4e-08
-#> HX -0.347 0.151 -2.30 0.0217
-#> AGE_cat 0.130 0.193 0.67 0.5027
-#> RX:BM 0.603 0.276 2.19 0.0288
-#> RX:HX -0.268 0.220 -1.22 0.2242
-#> RX:AGE_cat -0.810 0.315 -2.57 0.0100
-#>
-#> Scale fixed at 1
-#>
-#> Exponential distribution
-#> Loglik(model)= -1636.8 Loglik(intercept only)= -1667
-#> Chisq= 60.29 on 7 degrees of freedom, p= 1.3e-10
-#> Number of Newton-Raphson Iterations: 4
-#> n= 475From the main model, we can see that Bone metastasis (BM), History of -cardiovascular disease (HX) are significant under exponential -assumption, with interaction included, we can see that Bone metastasis -(BM) and Age have significant interaction with treatment (P-value < -0.05). Based on the point estimates from the interaction model, we could -use “benchtm” package to generate new outcomes based on the existing -covariates in this dataset.
-
-library(ggplot2)
-# library(survminer)
-
-# coefficients obtained from model.inter
-prog_use <- " 1.0830 *BM + 0.347 *HX - 0.13*AGE_cat"
-pred_use <- "-0.603 * BM + 0.27 * HX + 0.81 * AGE_cat"
-b0_use <- -0.897
-b1_use <- 1
-lambda0 <- exp(-4.030)
-max_surv <- max(dat$SURVTIME)
-
-cens_time <- function(n){
- tmp <- rexp(n, rate = 0.01)
- sapply(tmp, function(x) min(x, max_surv))
-}
-set.seed(1)
-dat_generate <- generate_y(X = dat %>% select(BM, HX, AGE_cat), trt = dat$RX,
- prog = prog_use, pred = pred_use, b0 = b0_use, b1 = b1_use,
- type = "survival", cens_time = cens_time, lambda0 = lambda0)
-head(dat_generate)
-#> trt BM HX AGE_cat Y event
-#> 1 0 0 0 1 48.385650 1
-#> 2 1 0 1 1 44.563077 1
-#> 3 0 0 1 1 6.598488 1
-#> 4 0 0 0 1 8.956895 1
-#> 5 0 0 0 1 16.824969 0
-#> 6 0 0 0 1 43.418168 0To check if the generated data is close to the real data, a -Kaplan-Meier curve can be used to compare the survival -probabilities.
-
-dat_combine <- dat_generate %>% select(Y, event) %>%
- rename(SURVTIME = Y, CENS = event) %>%
- rbind(dat %>% select(SURVTIME, CENS)) %>%
- cbind(cat = c(rep("Generated",dim(dat)[1]),rep("Observed",dim(dat)[1])))
-
-## compare observed and generated survival time
-fit <- survfit(Surv(SURVTIME, CENS) ~ cat, data = dat_combine)
-survminer::ggsurvplot(fit, data = dat_combine, size = 1, palette = c("#E7B800", "#2E9FDF"),
- conf.int = TRUE, pval = TRUE, risk.table = TRUE, risk.table.col = "strata",
- legend.labs = c("Generated", "Observed"), risk.table.height = 0.25,
- ggtheme = theme_bw()
-)
We can see from the above plot that the generated survival times have -very similar distribution compared to the observed survival times.
-One can also conduct a parametric bootstrap test for existence of
-treatment effect heterogeneity by calculating the log-likelihood ratio
-under the model with and without interaction. The null distribution of
-this statistic can be simulated by simulating new data from the model
-without treatment by covariate interactions (model
-model.main above).
-# log-likelihood ratios from 1000 datasets generated from the null model
-loglikratio_null <- sapply(1:1000, function(seed){
- set.seed(seed)
- # prog and pred are based on main model without interaction
- dat_generate <- generate_y(X = dat %>% select(BM, HX, AGE_cat), trt = dat$RX,
- prog = "0.719 *BM + 0.465 *HX +0.269*AGE_cat",
- pred = "0", b0 = -0.193, b1 = 0,
- type = "survival", cens_time = cens_time, lambda0 = exp(-4.376))
- model.full <- coxph(Surv(Y, event) ~ trt * (BM + HX + AGE_cat), data = dat_generate)
- model.null <- coxph(Surv(Y, event) ~ trt + BM + HX + AGE_cat, data = dat_generate)
- (model.full$loglik - model.null$loglik)[2]*(2)
-})
-## log-likelihood ratio for real data using Cox model
-model.null.obs = coxph(Surv(SURVTIME, CENS) ~ RX + BM + HX + AGE_cat, data = dat)
-model.full.obs = coxph(Surv(SURVTIME, CENS) ~ RX * (BM + HX + AGE_cat), data = dat)
-loglikratio_obs <- (model.full.obs$loglik - model.null.obs$loglik)[2]*2
-result_anova <- anova(model.null.obs, model.full.obs)
-## plot to show the difference
-ggplot(data.frame(x = loglikratio_null), aes(x=x)) +
- geom_density(alpha=.4, fill = "blue") +
- stat_function(fun = dchisq, args = list(df = result_anova$Df[2]), color = "red") +
- geom_vline(xintercept = loglikratio_obs, color = 'black') +
- annotate(geom = 'text', x = loglikratio_obs, y = 0.025, color = 'blue',
- label = paste0('P(X>obs) =',
- sum(loglikratio_null > loglikratio_obs)/length(loglikratio_null)), hjust = -0.1) +
- annotate(geom = 'text', x = loglikratio_obs, y = 0.05, color = 'red',
- label = paste0('P(X>obs) =',
- 1-pchisq(loglikratio_obs, result_anova$Df[2])), hjust = -0.1) +
- theme_bw() + theme(legend.position="bottom")
The blue area is the density of the log-likelihood ratios for the -model with/without interaction under the data generated from main model. -The red line is the density of the chi-squared distribution for the test -of interaction v.s. main model from modeling observed data. It can be -seen that the empirical density matches the theoretical density well. -The black vertical line is the observed log-likelihood ratio for real -data. In response to the generated data test, P-value = 0.002, which is -similar to the chi-square test (P-value = 0.001), indicating the -treatment effect heterogeneity exists(there is interaction effect).
-While the procedure here illustrated the approach for a global test -for treatment effect heterogeneity, also other statistics could be -simulated, for example the treatment effect in an identified subgroup -after an algorithmic subgroup search. This simulation approach may be a -valuable tool for assessing the strength of evidence for a subgroup -finding: One could assess for example how likely it is to find a -subgroup with a particular treatment effect in the case of no treatment -by covariate interaction.
-a03_Examples_Simulation.RmdIn this section a simple simulation study is performed on subgroup -identification using ``benchtm’’ package. In this given example, the -interest is to compare the MSE of estimated individual treatment effect -from two methods: multivariate regression(Seber -and Lee 2012) and Model-based recursive partition (MOB)(Athey and Imbens 2016). For MOB, we build node -model \(Y = \beta_0 + \beta_1*trt + -\beta_2*X\), where \(X\) is the -union of selected variables from lasso model \(Y = \beta_0^* + \beta_1^*X\) on treatment -subset and control subset separately. Users could modify this part of -the code by implementing their own method and metrics for a -comparison.
-We generate the simulation data with \(f_{prog}(X) = 0.5*(X_3+X_7)\) and \(f_{pred}(X) = X_3\) with \(\beta_0 =0, \beta_1=2\). We fix the total -number of covariates as \(p=20\) and -vary the sample size from \(n = 200\) -to \(n = 1000\). The simulation is -replicated for 100 times in each scenario.
-
-library(benchtm)
-library(dplyr)
-library(glmnet)
-
-### generate simulation data
-get_data <- function(n, p, seed = 1){
- set.seed(seed)
- X <- generate_X_dist(n, p, rho=0.5)
- trt <- generate_trt(n, p_trt = 0.5)
- temp_dat <- generate_y(X, trt, prog = "0.5*(X3+X7)",
- pred = "X3", b0 = 0, b1 = 2,
- type = "continuous", include_truth = TRUE,sigma_error = 1)
- return(temp_dat)
-}
-
-multivariate <- function(X, Y, trt){
- fitdat <- cbind(Y, trt, X)
-
- all_vars <- paste0(colnames(X), collapse = "+")
- form1 <- paste0("Y~trt+", all_vars, "+ trt:(", all_vars,")")
-
- ## fit linear models and perform global interaction test
- fit1 <- lm(as.formula(form1), data=fitdat)
-
- ## predict individual treatment effects
- pred1 <- predict(fit1, newdata=cbind(trt=1,X))
- pred0 <- predict(fit1, newdata=cbind(trt=0,X))
- tau_hat <- pred1-pred0
-
- return(tau_hat)
-}
-
-var_select_lasso_tree <- function(X, Y, trt){
- colnames(X) <- paste0(colnames(X),"_")
- ind <- trt == 0
- X0 <- X[ind,]
- X1 <- X[!ind,]
- Y0 <- Y[ind]
- Y1 <- Y[!ind]
- X0 <- model.matrix(~., data=X0, split = "_")[,-1]
- X1 <- model.matrix(~., data=X1)[,-1]
- fit0 <- glmnet::cv.glmnet(X0, Y0, family = "gaussian")
- fit1 <- glmnet::cv.glmnet(X1, Y1, family = "gaussian")
-
- get_fit_vars <- function(fit){
- cf <- coef(fit, s="lambda.1se")
- ## selected variables
- vars <- rownames(cf)[abs(as.numeric(cf))>0]
- }
- vars <- setdiff(union(get_fit_vars(fit0), get_fit_vars(fit1)),"(Intercept)")
- unique(sapply(vars, function(xx) stringr::str_split(xx, "_")[[1]][1]))
-}
-
-
-mobl <- function(X, Y, trt){
- ## select union of variables that impact outcome on treatment and
- ## control (this will be a superset of the purely prognostic variables)
- var_prog <- var_select_lasso_tree(X, Y, trt)
-
- cov.names <- colnames(X)
- dat <- cbind(Y, trt, X)
- if(length(var_prog) == 0){
- eqn <- paste0("Y ~ trt| ",
- paste0(cov.names, collapse = "+"))
- }else{
- eqn <- paste0("Y ~ trt + ",paste0(var_prog, collapse = "+")," | ",
- paste0(cov.names, collapse = "+"))
- }
- glmtr <- partykit::glmtree(as.formula(eqn), data = dat, family = "gaussian",
- parm = 2, minsize = 0.2*nrow(dat), alpha=0.10, bonferroni=TRUE)
-
- tau.hat <- predict(glmtr, newdata = dat %>% mutate(trt = 1), type = "response") -
- predict(glmtr, newdata = dat %>% mutate(trt = 0), type = "response")
-
- return(tau.hat)
-}
-
-
-
-sim_run_cont <- function(n, p, seed){
- data <- get_data(n, p, seed)
- Y <- data$Y
- trt <- data$trt
- X <- data %>% dplyr::select(starts_with("X"))
-
- res_multivariate <- multivariate(X,Y,trt)
- res_mobl <- mobl(X, Y, trt)
-
- mse_multivariate <- mean((res_multivariate - data$trt_effect)^2)
- mse_mobl <- mean((res_mobl - data$trt_effect)^2)
-
- return(data.frame(mse = c(mse_multivariate, mse_mobl),
- method = c("Multivariate", "MOBl"),
- n = rep(n,2),
- seed = rep(seed, 2)))
-}
-
-set.seed(2222) #set seed for random algorithms
-var_change <- expand.grid(n = c(200, 500,800, 1000), seed = 1:100)
-sim_result <- apply(var_change, 1, function(vars){
- sim_run_cont(n = vars[1], p = 20, seed = vars[2])
-}) %>% bind_rows()
-saveRDS(sim_result, "sim_result.rds")We could make a plot to compare the MSE of the estimated treatment -effect vs. the true treatment effect.
-
-library(ggplot2)
-library(dplyr)
-
-sim_result %>% group_by(method, n) %>% summarize(mean_mse = mean(mse)) %>% ungroup() %>%
- ggplot(aes(n, mean_mse, group = method, color = method)) + geom_point() + geom_line() + theme_bw()
a04_synthetic_data.RmdAs described in Sun -et al. (2024) the data set is a synthetic data generated from a pool -of clinical trial data. The ‘’synthpop’’ function from the “synthpop” R -package with default options was used to generate a synthetic version of -the original data. For the sake of confidentiality we renamed the -covariates into \(X_1, X_2, \ldots, -X_{30}\). In addition the factor levels of the categorical -variables were renamed and the numeric variables were scaled into the -interval \([0,1]\) by subtracting the -smallest value and dividing by the range of values.
-In what follows we provide marginal summaries of the variables \(X_1, X_2, ... X_{30}\) using barplots and -density plots. In addition an assessment of the correlations among the -numeric variables is provided.
-



The Kendall’s correlation between the continuous covariates \(X_5,X_{10}\) - \(X_{30}\) is shown in the correlation plot -below.
-
-Correlation for continuous variables for synthetic data -
-calc_power.RdCalculate power for overall treatment effect (not utilizing any -covariates) and the power of the interaction test (under the true -model, which is Y=f_prog+(b0+b1*f_pred)*trt). The calculation is -performed conditional on a given data set of covariates (prognostic -effects specified via prog_vals; predictive effects specified via -pred_vals) and a treatment indicator. It is assumed that the -treatment indicator is independent of the columns in X). The power -of the overall test and the interaction test will depend on the -actual observed covariates. To make calculations independent of the -actual observed covariates (to focus on the super-population) it is -suggested to simulate a large number of covariates (e.g. 100 times -larger sample size than one would be interested in), based on this -one can determine the covariance matrix of the estimates. To obtain -a covariance matrix estimate for the sample size one is interested -in one can then scale up the covariance matrix estimate (e.g. by -100-fold in the example above).
-calc_power(
- b,
- scal,
- prog_vals,
- trt,
- pred_vals,
- alpha,
- type,
- theta,
- sign_better,
- sigma_error,
- lambda0,
- cens_time,
- t_mile
-)Vector of length 2: Coefficient for treatment indicator -and predictive term in the true model
Scaling parameter for the covariance matrix
Vector of main effects (one value per patient)
Vector of treatment indicators
Vector of predictive effects (one value per -patient)
Vector of length 2, specifying the desired type 1 -error for the overall and the interaction test. The overall test -is performed as a one-sided test (see also argument -sign_better). The interaction test is performed as a two-sided -test.
type of data "continuous", "binary", "count" and -"survival" are allowed here, but calculations are only -implemented for "continuous" and "binary".
overdispersion parameter for type "count"
-1 if larger response is better, -1 if smaller -response is better (used in power calculation for the overall -test only)
Residual variance assumed for type = "continuous"
Intercept of exponential regression (on non-log scale)
function that generates random censoring times
time-point for comparing the survival probabilities -for test of overall treatment effect
Vector of two power values
-###### generate a matrix of covariates
-scal <- 100
-X <- generate_X_dist(n = 500 * scal, p = 30, rho = 0.5)
-trt <- generate_trt(n = 500 * scal, p_trt = 0.5)
-prog <- "0.5*((X1=='Y')+X11)"
-pred <- "X11>0.5"
-pred_vals <- with(X, eval(parse(text = pred)))
-prog_vals <- with(X, eval(parse(text = prog)))
-alpha <- c(0.025, 0.1)
-calc_power(
- b = c(0, 0), scal, prog_vals, trt, pred_vals, alpha,
- type = "continuous", sigma_error = 1, sign_better = 1
-)
-#> [1] 0.025 0.100
-calc_power(
- b = c(0.2, 0.3), scal, prog_vals, trt, pred_vals, alpha,
- type = "continuous", sigma_error = 1, sign_better = 1
-)
-#> [1] 0.7969131 0.6069333
-calc_power(
- b = c(0, 0), scal, prog_vals, trt, pred_vals, alpha,
- type = "binary", sign_better = 1
-)
-#> [1] 0.025 0.100
-calc_power(
- b = c(0.5, 0.8), scal, prog_vals, trt, pred_vals, alpha,
- type = "binary", sign_better = 1
-)
-#> [1] 0.9161490 0.5881828
-generate_X_dist.RdGenerate X from pre-specified distribution. X1 is a Bernoulli distributed rv with probability 0.5. X2 follows an exponential distribution with lambda=1. X3,X4,Z are standard normal variates with correlation rho and X5=(Z>1), X6 is from a multinomial distribution with 4 equally probable categories. X7-Xp are independent standard normal distributions.
-generate_X_dist(n, p, rho = 0.5)Number of observations
Number of predictors requested
Correlation between variable 3-5
Data frame of predictors
-X <- generate_X_dist(n=100, p=10, rho=0.5)
-generate_X_syn.RdGenerate data from synthetic clinical trial data with dimension n -and 30 variables.
-generate_X_syn(n)Number of observations
A data frame of predictors
-The data are generated based on the data-set data_syn which is not exported from the package namespace. This data-set was generated from a pooled set of clinical trial data using the synthpop function from the synthpop R package. data_syn has 1934 rows and 30 variables. Variable names have been removed. Continuous variables have been scaled to the interval [0,1], levels of the categorical variables have been blinded. For information on the original data from which the synthetic data were generated, see the supplementary material of the XYZ reference.
-X <- generate_X_syn(n=100)
-#> Warning: In your synthesis there are numeric variables with 5 or fewer levels: X10.
-#> Consider changing them to factors. You can do it using parameter 'minnumlevels'.
-
-generate_scen_data.RdGenerate simulation data from simulation scenarios used in XYZ. The scenarios were derived so that the prognostic part of the model on control has an R^2 of 0.32 for continuous data and an AUC of 0.66 for binary data. The coefficient b0 was calculated in each scenario so that the overall test for a treatment effect has a power of 50% (at a one-sided 2.5% alpha level). There are 4 main scenarios (defined by the expressions for prognostic and predictive part) replicated across continuous and binary endpoints. Within each scenario there are 5 sub-scenarios corresponding to different selections of b1. The third scenario in each of the sub-scenarios corresponds to the scenario where the interaction test (under the true model) has 80% power (for a 20% alpha level; only b1 > 0 is considered). The other 4 scenarios correspond to 0, 0.5, 1.5, 2 times the b1 value that provides 80% power. In total there are hence 4x2x5=40 scenarios.
-generate_scen_data(scen, include_truth = TRUE, type = c("sample", "resample"))A row from the scen_param data set or scen_param_TTE data set for time-to-event data
Boolean, will the true treatment effect be included in the outcome data-set?
For type == "sample" (default) X is generated using R package synthpop (using function -generate_X_syn). For type == "resample" data are resampled from a large saved data-set generated - from generate_X_syn (this option is considerably faster).
A data frame
-data(scen_param) ## scenarios used in XYZ
-dat <- generate_scen_data(scen = scen_param[1, ])
-#> Warning: In your synthesis there are numeric variables with 5 or fewer levels: X10.
-#> Consider changing them to factors. You can do it using parameter 'minnumlevels'.
-generate_trt.RdThe proportion of assignment to treatment is p_trt. Generation can be done fixing the proportion of patients in the treatment group to be as close as possible to p_trt and then permuting (type = "exact") or random sampling from a binomial distribution with probability p_trt (type = "random"). For type = "random" one can also specify a formula for logit(P(trt|X)) via "prop". The probability of obtaining treatment is then binomial with P(trt|X) = 1/(1+exp(-prop)). The variable names specified in "prop" need to be handed over in a data frame "X".
-generate_trt(
- n,
- p_trt = 0.5,
- type = c("exact", "random"),
- X = NULL,
- prop = NULL
-)Total sample size
Proportion of treatment assignment (default = 0.5)
Either "exact" (the observed allocation is matched to the proportion p_trt as closely as possible), or "random" (treatment assignment is generated from a binomial distribution).
matrix/dataframe of predictor variables
A formula used to generate logit(P(trt|X)). Probability of -obtaining treatment is then Bin(1, P(trt|X)) with -P(trt|X) = 1/(1+exp(-prop)). If type = "random" and prop is not specified -p_trt is used in the binomial distribution
Vector of treatment indicators
-## by default try to match target trt proportion in observed trt proportion
-trt <- generate_trt(10) # p_trt = 0.5 is default
-table(trt)
-#> trt
-#> 0 1
-#> 5 5
-trt <- generate_trt(10, p_trt = 0.2)
-trt <- generate_trt(11, p_trt = 0.5)
-table(trt) # remaining patient randomly allocated
-#> trt
-#> 0 1
-#> 6 5
-## example for random allocation
-trt <- generate_trt(10, p_trt = 0.5, type = "random") # samples from binomial with p_trt = 0.5
-table(trt) # observed allocation may deviate from p_trt=0.5 for small samples
-#> trt
-#> 0 1
-#> 6 4
-
-## example, where treatment allocation depends on propensity model
-X <- generate_X_dist(n=10000, p=10, rho=0.5)
-## Use propensity score for treatment as 1/(1+exp(-X2))
-trt <- generate_trt(n=nrow(X), type = "random", X=X, prop = "X2")
-## fit propensity model
-dat <- cbind(trt, X)
-fit <- glm(trt~., data=dat, family=binomial)
-summary(fit)
-#>
-#> Call:
-#> glm(formula = trt ~ ., family = binomial, data = dat)
-#>
-#> Coefficients:
-#> Estimate Std. Error z value Pr(>|z|)
-#> (Intercept) -0.070667 0.061744 -1.145 0.2524
-#> X1Y -0.011955 0.045543 -0.262 0.7929
-#> X2 1.038493 0.037639 27.591 <2e-16 ***
-#> X3 -0.015390 0.027208 -0.566 0.5716
-#> X4 0.018967 0.027019 0.702 0.4827
-#> X5Y 0.050035 0.051081 0.980 0.3273
-#> X6M2 -0.024738 0.063729 -0.388 0.6979
-#> X6M3 0.023012 0.064293 0.358 0.7204
-#> X6M4 -0.015852 0.064401 -0.246 0.8056
-#> X7 -0.015343 0.022743 -0.675 0.4999
-#> X8 0.001252 0.022563 0.055 0.9557
-#> X9 0.001686 0.022533 0.075 0.9404
-#> X10 0.049464 0.022766 2.173 0.0298 *
-#> ---
-#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
-#>
-#> (Dispersion parameter for binomial family taken to be 1)
-#>
-#> Null deviance: 12406 on 9999 degrees of freedom
-#> Residual deviance: 11270 on 9987 degrees of freedom
-#> AIC: 11296
-#>
-#> Number of Fisher Scoring iterations: 5
-#>
-generate_y.RdGenerate simulated data according to the model f(X) = f_prog(X) + trt*(b0 + b1*f_pred(X)) for different outcome distributions. For continuous data f(X) is the conditional mean, for binary data the logit response probability, for count data the log-rate and for survival data the log-hazard rate.
-generate_y(
- X,
- trt,
- prog,
- pred,
- b0,
- b1 = NULL,
- sd_te = NULL,
- type = c("binary", "continuous", "count", "survival"),
- sigma_error = 1,
- theta = 1,
- cens_time = NULL,
- lambda0 = NULL,
- include_truth = FALSE,
- sign_better = 1
-)matrix/dataframe of predictor variables
Binary treatment indicator variable
Character variable giving expression for prognostic -effects (defined in terms of names in the X matrix)
Character variable giving expression for predictive -effects (defined in terms of names in the X matrix)
Treatment (main effect)
Coefficient of the predictive effects defined in pred
Standard deviation of the treatment effects defined -via pred. If given b1 is ignored. For binary data this is assumed -to be on the log-odds ratio scale. For survival and count data -this is assumed to be on the log scale.
Outcome data type to generate ("continuous", "binary", -"count" and "survival" are allowed here)
Residual error, only needed for type = -"continuous"
Overdispersion parameter, only needed for data_type = -"count" (variance of neg bin in this parameterization is mu + -mu^2/theta)
Function to generate the censoring time, only -needed for data_type = "survival"
Intercept of exponential regression (on non-log scale)
boolean, will the true prognostic and -predictive effects be included in the outcome data-set?
whether larger response is better (used to -determine whether b1 is negative or positive if not given)
Data set
-X <- generate_X_dist(n = 10000, p = 10, rho = 0.5)
-## observational data set
-trt <- generate_trt(n = nrow(X), type = "random", X = X, prop = "X2")
-dat <- generate_y(X, trt,
- prog = "0.5*((X1=='Y')+X3)",
- pred = "X3>0", b0 = 0, b1 = 1,
- type = "continuous", sigma_error = 3
-)
-
-
-#### generate data from user specified covariate X
-X <- sapply(1:10, function(ii) {
- rnorm(500)
-})
-X <- as.data.frame(X)
-colnames(X) <- paste0("Z", 1:10)
-trt <- generate_trt(nrow(X), p_trt = 0.5)
-dat <- generate_y(X, trt,
- prog = "0.5*Z1",
- pred = "(Z5>0)", b0 = 0, b1 = 1,
- type = "binary"
-)
-glm(Y ~ trt * ., data = dat, family = binomial)
-#>
-#> Call: glm(formula = Y ~ trt * ., family = binomial, data = dat)
-#>
-#> Coefficients:
-#> (Intercept) trt Z1 Z2 Z3 Z4
-#> 0.04097 0.66630 0.48682 -0.27904 -0.10889 -0.18297
-#> Z5 Z6 Z7 Z8 Z9 Z10
-#> -0.13666 0.06119 -0.01345 -0.21496 0.14267 -0.11297
-#> trt:Z1 trt:Z2 trt:Z3 trt:Z4 trt:Z5 trt:Z6
-#> -0.06650 0.09557 -0.01194 0.06109 0.48794 -0.10708
-#> trt:Z7 trt:Z8 trt:Z9 trt:Z10
-#> -0.06243 0.24723 -0.09472 -0.07955
-#>
-#> Degrees of Freedom: 499 Total (i.e. Null); 478 Residual
-#> Null Deviance: 679.6
-#> Residual Deviance: 625.7 AIC: 669.7
-get_b.RdCalculate the model parameters b0 and b1 that provide a given target power for testing for an overall treatment effect and an interaction effect (under the true model). The calculation is performed conditional on a given data set of covariates and treatment indicator. It is assumed that the treatment indicator is independent of the columns in X. The power of the overall test and the interaction test will depend on the actual observed covariates. To make calculations independent of the actual observed covariates (to focus on the super-population) it is suggested to simulate a large number of covariates (e.g. 100 times larger sample size than one would be interested in), based on this one can calculate an estimate of the covariance matrix of the estimates. To obtain a covariance matrix estimate for the sample size one is interested in one can then scale up the covariance matrix estimate (e.g. by 100-fold in the example above). Note that there may not be any solution or a unique solution, in case of non-convergence (which particularly happens for the binary endpoint) it sometimes helps to modify start and optim_method arguments.
-Matrix with all covariates
Scaling parameter for the covariance matrix
Character variable giving expression for prognostic -effects (defined in terms of names in the X matrix)
Character variable giving expression for predictive -effects (defined in terms of names in the X matrix)
Treatment effect indicator (same length as number of rows in X).
type of data "continuous", "binary", "count" and -"survival" are allowed here, but calculations are only -implemented for "continuous" and "binary".
Vector of length 2, specifying the target power for the -overall and the interaction test
Vector of length 2, specifying the desired type 1 -error for the overall and the interaction test. The overall test -is performed as a one-sided test (see also argument -sign_better). The interaction test is performed as a two-sided -test.
Vector of length 2: Starting values for the parameter b.
1 if larger response is better, -1 if smaller response is better (used in power calculation for the overall test only)
Residual variance assumed for type = "continuous"
overdispersion parameter for type = "count". It is consistent with MASS::glm.nb and the (dprq)nbinom functions in R with the "size" parameter equal to theta. In this parameterization the negative binomial distribution converges to the Poisson distribution as theta goes to infinity.
Intercept of exponential regression (on non-log scale)
Function to generate the censoring time, only -needed for data_type = "survival"
Time point for comparing survival probabilities for overall -test for treatment effect
method argument in the optimization see ?optim
Vector of two model parameters b0 and b1
-scal <- 100
-X <- generate_X_dist(n = 500 * scal, p = 30, rho = 0.5)
-trt <- generate_trt(n = 500 * scal, p_trt = 0.5)
-prog <- "0.5*((X1=='Y')+X11)"
-pred <- "X11>0.5"
-get_b(X, scal, prog, pred, trt,
- type = "continuous",
- power = c(0.9, 0.9), alpha = c(0.025, 0.1),
- start = c(0, 0), sigma_error = 1
-)
-#> [1] 0.2038864 0.4571681
-#> attr(,"power_results")
-#> [1] 0.9 0.9
-get_b(X, scal, prog, pred, trt,
- type = "binary",
- power = c(0.9, 0.9), alpha = c(0.025, 0.1),
- start = c(0, 0)
-)
-#> [1] 0.3866068 1.4969530
-#> attr(,"power_results")
-#> [1] 0.9 0.9
-get_b0.RdCalculate the model parameter b0 (for a given b1) that provides a given target power for testing for an overall treatment effect. The calculation is performed conditional on a given data set of covariates and treatment indicator. The power of the overall test depends on the actual observed covariates. To make calculations independent of the actual observed covariates (to focus on the super-population) it is suggested to simulate a large number of covariates (e.g. 100 times larger sample size than one would be interested in), based on this one can determine the covariance matrix of the estimates. To obtain a covariance matrix estimate for the sample size one is interested in one can then scale up the covariance matrix estimate (e.g. by 100-fold in the example above).
-get_b0(
- X,
- scal,
- prog,
- pred,
- trt,
- b1,
- type,
- power = 0.9,
- alpha = 0.025,
- sign_better = 1,
- sigma_error,
- theta,
- lambda0,
- cens_time,
- t_mile,
- interval = c(-5, 5)
-)Matrix with all covariates
Scaling parameter for the covariance matrix
Character variable giving expression for prognostic -effects (defined in terms of names in the X matrix)
Character variable giving expression for predictive -effects (defined in terms of names in the X matrix)
Treatment effect indicator (same length as number of rows in X).
The parameter b1 (calculation of b0 is for a given b1)
type of data "continuous", "binary", "count" and -"survival" are allowed here, but calculations are only -implemented for "continuous" and "binary" currently.
Target power for the overall test for a treatment effect (get_b0 calibrates b0 for the overall test only)
Desired type 1 error for the overall test. The overall test is performed as a one-sided test (see also argument sign_better).
1 if larger response is better, -1 if smaller response is better (used in power calculation for the overall test only)
Residual variance assumed for type = "continuous"
overdispersion parameter for type = "count". It is consistent with MASS::glm.nb and the (dprq)nbinom functions in R with the "size" parameter equal to theta. In this parameterization the negative binomial distribution converges to the Poisson distribution as theta goes to infinity.
Intercept of exponential regression (on non-log scale)
Function to generate the censoring time, only -needed for data_type = "survival"
Time point for comparing survival probabilities for overall -test for treatment effect
Interval to search for b0 handed over to uniroot
Vector of model parameter b0
-
-scal <- 100
-X <- generate_X_dist(n = 500 * scal, p = 30, rho = 0.5)
-trt <- generate_trt(n = 500 * scal, p_trt = 0.5)
-prog <- "0.5*((X1=='Y')+X11)"
-pred <- "X11>0.5"
-get_b0(X, scal, prog, pred, trt,
- b1 = 0.5, type = "continuous",
- power = 0.9, alpha = 0.025, interval = c(-2, 2), sigma_error = 1
-)
-#> [1] 0.1913504
-#> attr(,"power_results")
-#> [1] 0.899987
-get_b0(X, scal, prog, pred, trt,
- b1 = 0.5, type = "binary",
- power = 0.9, alpha = 0.025, interval = c(-2, 2)
-)
-#> [1] 0.5320286
-#> attr(,"power_results")
-#> [1] 0.9000235
-
- All functions- - |
- |
|---|---|
| - - | -Power calculation to detect overall treatment effect and interaction effect |
-
| - - | -Generate X from pre-specified distribution. |
-
| - - | -Generate X from data "syn" |
-
| - - | -Generate simulation data from pre-specified scenarios |
-
| - - | -Generate binary treatment variable |
-
| - - | -Generate outcome variable y and merge with predictor data matrix X. |
-
| - - | -Calculate b parameter to achieve a given power to detect an overall -effect and the interaction effect |
-
| - - | -Calculate b0 parameter to achieve a given power to detect an overall -effect and the interaction effect |
-
| - - | -Pre-specified simulation scenarios from XYZ |
-
scen_param.RdThe scenarios were derived so that the prognostic part of the model on the control arm has an R^2 of 0.32 for continuous data and an AUC of 0.66 for binary data. The coefficient b0 was calculated in each case so that the overall test for a treatment effect has a power of 50%. There are 4 main scenarios (defined by the expressions for prognostic and predictive part) replicated across continuous and binary endpoints. Within each scenario there are 5 sub-scenarios corresponding to different selections of b1. The third case in each of the sub-scenarios corresponds to the case where the interaction test (under the true model) has 80% power (only b1 > 0 is considered). The other 4 cases correspond to 0, 0.5, 1.5, 2 times the b1 value that provides 80% power for the interaction test. The column b1_rel gives the size of b1 relative to the b1 value that provides 80% power. In total there are hence 4x2x5=40 scenarios.
-scen_paramSee more details on the choice of parameters considered in -https://pubmed.ncbi.nlm.nih.gov/36437036/
-See more details on the data cases in https://pubmed.ncbi.nlm.nih.gov/36437036/
-