---
title: 'Analysis'
output:
  html_document: default
editor_options:
  chunk_output_type: console
header-includes:
  - \usepackage{amsmath}
  - \usepackage{array}
  - \usepackage{natbib}
  - \usepackage{placeins}
  - \usepackage{orcidlink}
  - \usepackage{caption}
  - \captionsetup[table]{skip=5pt}
  - \usepackage{ulem}
  - \usepackage{enumitem}
  - \usepackage{rotating}
vignette: >
  %\VignetteIndexEntry{analysis}
  %\VignetteEncoding{UTF-8}
  %\VignetteEngine{knitr::knitr}
---
**This file contains the code for reproducing the results shown in the related manuscript (numbers, tables, figures). We obtained our results on 2023-09-24 using R version 4.3.0 (2023-04-21) running natively on a MacBook Pro (2020) with an Apple M1 chip, 16 GB memory, and the operating system macOS Ventura (version 13.5.2). The total computation time was around 11 hours.**
# Initialisation
Choose the directory containing the sub-directories "data", "results" and "manuscript".
- The directory "data" may be empty for the first external application (*script loads data - which are also available [here](https://github.com/markvdwiel/GRridge/blob/master/data/dataVerlaat.rda) - from the R package [`GRridge`](https://doi.org/10.18129/B9.bioc.GRridge)*) but must contain the file "pone.0181468.s001.csv" for the second external application (*available from [Erez et al. 2017](https://doi.org/10.1371/journal.pone.0181468) in the supporting file [journal.pone.0181468.s001.csv](https://doi.org/10.1371/journal.pone.0181468.s001)*) and the files "vcf_with_pvalue.tab" and "LuxPark_pheno.txt" for the internal application (*available upon request to [request.ncer-pd@uni.lu](mailto:request.ncer-pd@uni.lu?cc=armin.rauschenberger@uni.lu,enrico.glaab@uni.lu&subject=data request Rauschenberger et al. ("transreg"))*). For the internal application, this directory will also contain the intermediate file "app_int_data.RData".
- The directory "results" will contain the files "sim_int.RData" and "sim_ext.RData" for the external and internal simulation, the file "app_grridge.RData" for the first external application, the file "app_fwelnet.RData" for the second external application, and the file "app_int.RData" for the internal application.
- The directory "manuscript" will contain the files "table_int.tex" and "table_ext.tex" for the internal and external simulations, the file "figure_example.pdf" for the methods section, the file "figure_ext.pdf" for the external applications, and the file "figure_int.pdf" for the internal application.
```{r knitr,echo=FALSE}
knitr::opts_chunk$set(eval=FALSE,echo=TRUE)
```
```{r init,echo=TRUE}
rm(list=ls())
dir <- "~/Desktop/transreg" # physical machine
#dir <- "/home/armin.rauschenberger/transreg" # virtual machine
setwd(dir)
if(!all(c("data","results","manuscript") %in% dir())){
stop("Missing folders!")
}
```
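If any of these sub-directories are missing (for example, on a fresh machine), they can be created before re-running the previous chunk. The following is a minimal sketch, assuming `dir` is set as above; it is not part of the original analysis.
```{r folders,eval=FALSE,echo=TRUE}
# Create the expected sub-directories if they do not exist yet (sketch).
for(sub in c("data","results","manuscript")){
path <- file.path(dir,sub)
if(!dir.exists(path)){dir.create(path,recursive=TRUE)}
}
```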
Install missing R packages from CRAN and GitHub. Note that `ecpc` is already available on CRAN and that `transreg` will also become available on CRAN. For package versions, see the session information at the end of this document and in the text files associated with each R data file.
```{r pkgs,eval=FALSE,echo=TRUE}
pkgs <- c("devtools","palasso","glmtrans","xtable","ecpc","xrnet")
utils::install.packages(setdiff(pkgs,rownames(installed.packages())))
remotes::install_github("markvdwiel/GRridge") # ref="ef706afe", version 1.7.5
remotes::install_github("kjytay/fwelnet") # ref="5515fd2e", version 0.1
remotes::install_github("LCSB-BDS/transreg") # ref="82757441", version 1.0.0
rm(pkgs)
```
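To reproduce the exact package states noted in the comments above, the GitHub installations can also be pinned to those commits. This is an optional sketch; the commit hashes are the ones quoted in the previous chunk.
```{r pkgs_pinned,eval=FALSE,echo=TRUE}
# Optional: pin the GitHub packages to the commits quoted above.
remotes::install_github("markvdwiel/GRridge",ref="ef706afe")
remotes::install_github("kjytay/fwelnet",ref="5515fd2e")
remotes::install_github("LCSB-BDS/transreg",ref="82757441")
```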
# Methods
The following chunk generates the figure for the methods section.
- input: none (*simulating data*)
- execution time: some seconds
- output: manuscript/figure_example.pdf
```{r calibration,echo=TRUE}
#<>
#grDevices::pdf(file=paste0(dir,"/manuscript/figure_example.pdf"),width=8,height=5,pointsize=15)
#grDevices::png(file=paste0(dir,"/manuscript/figure_example.png"),width=8,height=5,units="in",pointsize=15,res=1200)
grDevices::postscript(file=paste0(dir,"/manuscript/figure_example.eps"),width=8,height=5,pointsize=15,horizontal=FALSE,paper="special")
set.seed(1)
n <- 200; p <- 500
X <- matrix(stats::rnorm(n*p),nrow=n,ncol=p)
temp <- stats::rnorm(p)
range <- stats::qnorm(p=c(0.01,0.99))
temp[temp<range[1]] <- range[1]
temp[temp>range[2]] <- range[2]
beta <- list()
beta$ident <- temp
beta$sqrt <- sign(temp)*sqrt(abs(temp))
beta$quad <- sign(temp)*abs(temp)^2
beta$trunc <- ifelse(temp<=0,0,temp)
beta$step <- ifelse(temp<=1,0,1)
beta$combn <- ifelse(temp<0,sign(temp)*sqrt(abs(temp)),sign(temp)*abs(temp)^2)
graphics::par(mfrow=c(2,3),mar=c(3,3,0.5,0.5))
for(i in seq_along(beta)){
prior <- matrix(temp,ncol=1)
eta <- X %*% beta[[i]]
y <- stats::rnorm(n=n,mean=eta,sd=sd(eta))
a <- transreg:::.exp.multiple(y=y,X=X,prior=prior,family="gaussian",select=FALSE)
b <- transreg:::.iso.fast.single(y=y,X=X,prior=prior,family="gaussian")
graphics::plot.new()
graphics::plot.window(xlim=range(prior,-prior),ylim=range(a$beta,b$beta))
graphics::axis(side=1)
graphics::axis(side=2)
graphics::abline(h=0,lty=2,col="grey")
graphics::abline(v=0,lty=2,col="grey")
graphics::box()
graphics::title(xlab=expression(z),ylab=expression(gamma),line=2)
graphics::points(x=prior,y=a$beta,col="red",cex=0.7)
graphics::points(x=prior,y=b$beta,col="blue",cex=0.7)
graphics::lines(x=prior[order(prior)],y=beta[[i]][order(prior)],lwd=1.2)
graphics::legend(x="topleft",legend=paste0("(",i,")"),bty="n",x.intersp=0)
}
grDevices::dev.off()
```
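The figure above is generated with internal helper functions (`transreg:::.exp.multiple` for exponential calibration, `transreg:::.iso.fast.single` for isotonic calibration). To fit a single model through the exported interface instead, a minimal sketch looks as follows; the simulated data, the noise level of the prior effects, and the ridge choice (`alpha=0`) are illustrative assumptions, while the argument names match the calls used later in this document.
```{r example_transreg,eval=FALSE,echo=TRUE}
# Minimal sketch (illustration only): one target data set and one set of prior effects.
set.seed(1)
n <- 100; p <- 500
X <- matrix(stats::rnorm(n*p),nrow=n,ncol=p)
beta <- stats::rnorm(p)
y <- stats::rnorm(n=n,mean=X %*% beta)
prior <- beta + stats::rnorm(p,sd=0.5) # noisy prior effects (assumption)
fit <- transreg::transreg(y=y,X=X,prior=prior,family="gaussian",alpha=0)
y_hat <- predict(fit,newx=X)
```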
# Simulations
The following chunk performs the external (`glmtrans`) and the internal simulation.
- input: none (*simulating data*)
- execution time: **2 + 1 = 3 hours**
- output: results/sim_ext.RData, results/sim_int.RData, results/info_sim.txt
```{r sim_script,eval=FALSE,echo=TRUE}
#<>
# - - - modify glmtrans::models function - - -
glmtrans.models <- glmtrans::models
string <- base::deparse(glmtrans.models)
# return target beta
string <- gsub(pattern="list\\(x \\= NULL, y \\= NULL\\)",
replacement="list(x = NULL, y = NULL, beta = wk)",
x=string)
# return source beta
string <- gsub(pattern="list\\(x \\= x, y \\= y\\)",
replacement="list(x = x, y = y, beta = wk)",
x=string)
glmtrans.models <- eval(parse(text=string))
rm(string)
# - - - - - - - - - - - - - - - - - - - - - -
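# Illustrative check (not part of the original analysis; assumes that
# glmtrans::models accepts these small parameter values): after the patch,
# the generator also returns the true coefficients of the target and the
# source data sets.
# chk <- glmtrans.models(family="gaussian",type="all",p=50,n.target=30,s=5,Ka=1,K=2,h=5)
# str(chk$target$beta); str(chk$source[[1]]$beta)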
for(mode in c("ext","int")){
# simulation setting
if(mode=="ext"){
frame <- expand.grid(seed=1:10,
Ka=as.integer(c(1,3,5)),
K=as.integer(5),
h=as.integer(c(5,250)),
alpha=as.integer(c(0,1)),
family=c("gaussian","binomial")
)
} else if(mode=="int"){
frame <- expand.grid(seed=1:10,
rho.x=c(0.95,0.99),
rho.beta=c(0.6,0.8,0.99),
alpha=as.integer(c(0,1)),
family=c("gaussian","binomial")
)
}
frame$family <- as.character(frame$family)
frame[,c("cor.x","cor.beta","mean","glmnet","glmtrans","transreg")] <- NA
n0 <- 100; n1 <- 10000
n.target <- n0+n1
foldid.ext <- rep(c(0,1),times=c(n0,n1))
for(iter in seq_len(nrow(frame))){
cat(paste0(mode,": ",iter,"/",nrow(frame),"\n"))
if(!is.na(frame$seed[iter])){set.seed(frame$seed[iter])}
# data simulation
if(mode=="ext"){
message("Using external simulation study!")
s <- ifelse(frame$alpha[iter]==0,50,15)
data <- glmtrans.models(family=frame$family[iter],type="all",
p=1000,n.target=n.target,s=s,
Ka=frame$Ka[iter],K=frame$K[iter],h=frame$h[iter])
target <- data$target
source <- data$source
beta <- cbind(sapply(data$source,function(x) x$beta),data$target$beta)
} else if(mode=="int"){
message("Using internal simulation study!")
prop <- ifelse(frame$alpha[iter]==0,0.3,0.05)
data <- transreg:::simulate(p=500,n.target=n.target,family=frame$family[iter],
prop=prop,rho.beta=frame$rho.beta[iter],w=0.8,
rho.x=frame$rho.x[iter],k=3,exp=c(1,2,0.5),
trans=c(FALSE,TRUE,TRUE))
target <- data$target
source <- data$source
beta <- data$beta
}
# correlation
temp <- abs(stats::cor(data$target$x,method="pearson"))
temp[lower.tri(temp,diag=TRUE)] <- NA
frame$cor.x[iter] <- mean(temp,na.rm=TRUE)
temp <- abs(stats::cor(beta,method="pearson"))
temp[lower.tri(temp,diag=TRUE)] <- NA
frame$cor.beta[iter] <- max(temp[,ncol(temp)],na.rm=TRUE)
# predictive performance
loss <- transreg:::compare(target=target,source=source,
family=frame$family[iter],alpha=frame$alpha[iter],
foldid.ext=foldid.ext,nfolds.ext=1,
scale=c("exp","iso"),
sign=FALSE,switch=FALSE,select=TRUE,alpha.prior=NULL,
seed=frame$seed[iter],xrnet=TRUE)
frame[iter,names(loss$deviance)] <- loss$deviance
}
save(frame,file=paste0(dir,"/results/sim_",mode,".RData"))
}
writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),
sessioninfo::session_info()),con=paste0(dir,"/results/info_sim.txt"))
```
The following chunk generates the tables for the external (`glmtrans`) and the internal simulation. **Requires execution of previous chunk.**
- input: results/sim_ext.RData, results/sim_int.RData
- execution time: some seconds
- output: manuscript/table_ext.tex, manuscript/table_int.tex
```{r sim_table,echo=TRUE}
#<>
for(mode in c("ext","int")){
file <- paste0(dir,"/results/sim_",mode,".RData")
#if(mode=="int"){file <- "~/Desktop/transreg/results/sim_int_fast.RData"}
if(!file.exists(file)){warning("Missing file ",file,".");next}
load(file)
colnames(frame) <- gsub(pattern="transreg.",replacement="",x=colnames(frame))
info.var <- c("Ka","h","rho.x","rho.beta","alpha","family")
info <- frame[,colnames(frame) %in% info.var]
info.name <- gsub(pattern=" ",replacement="",x=apply(info,1,function(x) paste0(x,collapse=".")))
number <- as.numeric(factor(info.name,levels=unique(info.name)))
info <- unique(info)
info$cor.beta <- info$cor.x <- NA
loss.var <- c("glmnet","glmtrans","xrnet","exp.sta","exp.sim","iso.sta","iso.sim")
loss <- frame[,c(loss.var,"mean")]
# as percentage of empty model
loss <- 100*loss/loss$mean
# average over 10 different seeds
both <- mean <- sd <- p.better <- p.worse <- matrix(NA,nrow=max(number),ncol=length(loss.var),dimnames=list(1:max(number),loss.var))
for(i in 1:max(number)){
for(j in seq_along(loss.var)){
cond <- number==i
x <- loss$glmnet[cond]
y <- loss[[loss.var[j]]][cond]
p.better[i,j] <- stats::wilcox.test(x=x,y=y,paired=TRUE,alternative="greater",exact=FALSE)$p.value
p.worse[i,j] <- stats::wilcox.test(x=x,y=y,paired=TRUE,alternative="less",exact=FALSE)$p.value
mean[i,j] <- mean(y)
sd[i,j] <- sd(y)
info$cor.x[i] <- mean(frame$cor.x[cond])
info$cor.beta[i] <- mean(frame$cor.beta[cond])
}
}
front <- format(round(mean,digits=1),trim=TRUE)
front.nchar <- nchar(front)
back <- format(round(sd,digits=1),trim=TRUE)
back.nchar <- nchar(back)
both[] <- paste0(front,"$\\pm$",back)
grey <- mean>=mean[,"glmnet"]
both[grey] <- paste0("\\textcolor{gray}{",both[grey],"}")
min <- cbind(seq_len(max(number)),apply(mean,1,which.min))
both[min] <- paste0("\\underline{",both[min],"}")
star <- p.better<=0.05
both[star] <- paste0(both[star],"*")
#both[!star] <- paste0(both[!star],"\\phantom{*}")
dagger <- p.worse<=0.05
both[dagger] <- paste0(both[dagger],"$\\dagger$")
both[!star & !dagger] <- paste0(both[!star & !dagger],"\\phantom{*}")
both[front.nchar==3] <- paste0("$~$",both[front.nchar==3])
both[back.nchar==3] <- paste0(both[back.nchar==3],"$~$")
external <- "number of transferable source data sets ($K_a$), differences between source and target coefficients ($h$), dense setting with ridge regularisation ($s=50$, $\\alpha=0$) or sparse setting with lasso regularisation ($s=15$, $\\alpha=1$), family of distribution (`gaussian' or `binomial')."
internal <- "correlation parameter for features ($\\rho_x$), correlation parameter for coefficients ($\\rho_{\\beta}$), dense setting with ridge regularisation ($\\pi=30\\%$, $\\alpha=0$) or sparse setting with lasso regularisation ($\\pi=5\\%$, $\\alpha=1$), family of distribution (`gaussian' or `binomial')."
caption <- paste0("Predictive performance in ",mode,"ernal simulation. In each setting (row), we simulate $10$ data sets, calculate the performance metric (mean-squared error for numerical prediction, logistic deviance for binary classification) for the test sets, express these metrics as percentages of those from prediction by the mean, and show the mean and standard deviation of these percentages. Settings: ",ifelse(mode=="ext",external,ifelse(mode=="int",internal,NULL))," These parameters determine (i) the mean Pearson correlation among the features in the target data set ($\\bar{\\rho}_x$) and (ii) the maximum Pearson correlation between the coefficients in the target data set and the coefficients in the source data sets ($\\max(\\hat{\\rho}_{\\beta})$). Methods: regularised regression (\\texttt{glmnet}), competing transfer learning methods (\\texttt{glmtrans} , \\texttt{xrnet}), proposed transfer learning method (\\texttt{transreg}) with exponential/isotonic calibration and standard/simultaneous stacking. In each setting, the colour black (grey) highlights methods that are more (less) predictive than regularised regression without transfer learning (\\texttt{glmnet}), asterisks (daggers) indicate methods that are \\textit{significantly} more (less) predictive at the 5\\% level (one-sided Wilcoxon signed-rank test), and an underline highlights the most predictive method. \\label{sim_",mode,"}")
colnames(info) <- sapply(colnames(info),function(x) switch(x,"Ka"="$K_a$","K"="$K$","h"="$h$","alpha"="$\\alpha$","rho.x"="$\\rho_x$","rho.beta"="$\\rho_\\beta$","cor.x"="$\\bar{\\rho}_{x}$","cor.beta"="$\\max(\\hat{\\rho}_{\\beta})$","cor.t"="$\\bar{\\rho}_{a}$",x))
colnames(both) <- paste0("\\texttt{",colnames(both),"}")
align <- paste0("|r|",paste0(rep("c",times=ncol(info)),collapse=""),"|",paste0(rep("c",times=ncol(both)),collapse=""),"|")
add.to.row <- list()
add.to.row$pos <- list(-1)
add.to.row$command <- "\\multicolumn{9}{c}{~} & \\multicolumn{4}{|c|}{\\texttt{transreg}}\\\\"
xtable <- xtable::xtable(x=cbind(info,both),align=align,caption=caption)
xtable::print.xtable(x=xtable,
include.rownames=FALSE,
floating=TRUE,
sanitize.text.function=identity,
comment=FALSE,
caption.placement="top",
floating.environment="table*",
#size="\\small \\setlength{\\tabcolsep}{3pt}",
file=paste0(dir,"/manuscript/table_",mode,".tex"),
add.to.row=add.to.row,
hline.after=c(-1,0,nrow(xtable)))
}
```
# External applications
The following chunk performs the first external application (`GRridge`).
- input: none (*script loads data - which are also available [here](https://github.com/markvdwiel/GRridge/blob/master/data/dataVerlaat.rda) - from the R package [`GRridge`](https://doi.org/10.18129/B9.bioc.GRridge)*)
- execution time: **2.5 hours**
- output: results/app_grridge.RData, results/info_app_grridge.txt
```{r grridge_script,eval=FALSE,echo=TRUE}
#<>
data(dataVerlaat,package="GRridge")
target <- list()
target$y <- as.numeric(as.factor(respVerlaat))-1
target$x <- t(datcenVerlaat)
z <- -log10(pvalFarkas) # ecpc and fwelnet
prior <- sign(diffmeanFarkas)*(-log10(pvalFarkas)) # transreg
loss <- list()
for(i in 1:10){
cat("---",i,"---\n")
loss[[i]] <- transreg:::compare(target=target,prior=prior,z=as.matrix(z,ncol=1),
family="binomial",alpha=0,scale=c("exp","iso"),sign=FALSE,switch=FALSE,select=FALSE,type.measure=c("deviance","class","auc"),seed=i,xrnet=TRUE)
}
save(loss,file=paste0(dir,"/results/app_grridge.RData"))
writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),
sessioninfo::session_info()),con=paste0(dir,"/results/info_app_grridge.txt"))
load(paste0(dir,"/results/app_grridge.RData"),verbose=TRUE)
table <- as.data.frame(t(sapply(loss,function(x) x$deviance)))
table <- (table-table$glmnet)/table$glmnet
table <- table[,c("glmnet","transreg.exp.sta","transreg.exp.sim","transreg.iso.sta","transreg.iso.sim","fwelnet","ecpc","xrnet")]
sapply(table[,-1],function(x) sum(x>0)) # repetitions with larger deviance than glmnet
```
The following chunk performs the second external application (`fwelnet`).
- input: data/pone.0181468.s001.csv
- output: results/app_fwelnet.RData, results/info_app_fwelnet.txt
```{r fwelnet_script,eval=FALSE,echo=TRUE}
table <- utils::read.csv("data/pone.0181468.s001.csv",header=TRUE,skip=3)
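# extract(): keep the rows selected by 'cond', weight each subject's rows by
# one over its number of retained rows, and assign ten cross-validation folds
# at the subject level, stratified by the binary outcome.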
extract <- function(cond,y,X,id){
if(length(unique(c(length(cond),length(y),nrow(X),length(id))))!=1){stop("Invalid input.")}
n <- table(id,cond)[,"TRUE"]
y <- y[cond]
X <- X[cond,]
id <- id[cond]
weights <- rep(1/n,times=n)
ids <- unique(id)
ys <- sapply(ids,function(x) unique(y[id==x]))
foldid <- rep(NA,length=length(ids))
foldid[ys==0] <- sample(rep(1:10,length.out=sum(ys==0)))
foldid[ys==1] <- sample(rep(1:10,length.out=sum(ys==1)))
foldid <- rep(foldid,times=n[n!=0])
if(length(unique(c(length(y),nrow(X),length(weights),length(foldid))))!=1){
stop("Invalid output.")
}
return(list(y=y,x=X,weights=weights,foldid=foldid))
}
loss <- ridge <- lasso <- list()
for(i in 1:10){
cat("---",i,"---\n")
set.seed(i)
y <- table$LatePE
X <- as.matrix(table[,grepl(pattern="SL",x=colnames(table))])
X <- scale(X)
min <- sapply(unique(table$ID),function(i) min(table$GA[table$ID==i]))
max <- sapply(unique(table$ID),function(i) max(table$GA[table$ID==i]))
limit <- 20
group <- stats::rbinom(n=max(table$ID),size=1,prob=0.5)
source.id <- which(group==0 | min > limit)
target.id <- which(group==1 & min <= limit)
if(any(!table$ID %in% c(source.id,target.id))){stop()}
if(any(!c(source.id,target.id) %in% table$ID)){stop()}
if(any(duplicated(c(source.id,target.id)))){stop()}
source <- list()
source[[1]] <- extract(cond=(table$ID %in% source.id) & (table$GA<=limit),y=y,X=X,id=table$ID)
source[[2]] <- extract(cond=(table$ID %in% source.id),y=y,X=X,id=table$ID)
prior <- z <- matrix(NA,nrow=ncol(X),ncol=length(source))
for(j in seq_along(source)){
base <- glmnet::cv.glmnet(y=source[[j]]$y,x=source[[j]]$x,
family="binomial",alpha=0,
weights=source[[j]]$weights,
foldid=source[[j]]$foldid)
prior[,j] <- coef(base,s="lambda.min")[-1]
z[,j] <- abs(coef(base,s="lambda.min")[-1])
}
target <- list()
target$y <- sapply(target.id,function(i) unique(y[table$ID==i]))
target$x <- t(sapply(target.id,function(i) X[table$ID==i & table$GA==min(table$GA[table$ID==i]),]))
loss[[i]] <- transreg:::compare(target=target,prior=prior,family="binomial",alpha=0,scale=c("exp","iso"),sign=FALSE,switch=FALSE,select=FALSE,z=z,type.measure=c("deviance","class","auc"),seed=i,xrnet=TRUE)
}
save(loss,file=paste0(dir,"/results/app_fwelnet.RData"))
writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),
sessioninfo::session_info()),con=paste0(dir,"/results/info_app_fwelnet.txt"))
load(paste0(dir,"/results/app_fwelnet.RData"))
table <- as.data.frame(t(sapply(loss,function(x) x$deviance)))
table <- (table-table$glmnet)/table$glmnet
table <- table[,c("glmnet","transreg.exp.sta","transreg.exp.sim","transreg.iso.sta","transreg.iso.sim","fwelnet","ecpc","xrnet")]
sapply(table[,-1],function(x) sum(x>0)) # repetitions with larger deviance than glmnet
```
# Internal application
The following chunk performs the internal application.
- input: data/vcf_with_pvalue.tab, data/LuxPark_pheno.txt
- output: data/app_int_data.RData, results/app_int.RData, results/info_app_int.txt
```{r ncerpd_script,eval=FALSE,echo=TRUE}
geno <- read.table(paste0(dir,"/data/vcf_with_pvalue.tab"),header=TRUE)
switch <- ifelse(geno$REF==geno$A1_gwas & geno$ALT==geno$A2_gwas,1,
ifelse(geno$REF==geno$A2_gwas & geno$ALT==geno$A1_gwas,-1,0))
#prior <- -geno$beta*switch # original effect sizes
prior <- log10(geno$p_value)*sign(geno$beta)*switch # pseudo effect sizes
pvalue <- geno$p_value
# Note: Why are pseudo-effect sizes more suitable as prior effects as compared to original effect sizes?
# graphics::plot(x=geno$beta,y=-log10(geno$p_value),xlim=c(-1,1),col=ifelse(stats::p.adjust(geno$p_value)<=0.05,"red","black"))
X <- geno[,grepl(pattern="ND",colnames(geno))]
X[X=="0/0"] <- 0
X[X=="0/1"] <- 1
X[X=="1/1"] <- 1
X <- sapply(X,as.numeric)
X <- t(X)
pheno <- read.delim(paste0(dir,"/data/LuxPark_pheno.txt"),sep=" ",header=FALSE)
y <- ifelse(pheno$V2==1,0,ifelse(pheno$V2==2,1,NA)); names(y) <- pheno$V1
names <- intersect(names(y[!is.na(y)]),rownames(X))
X <- X[names,]; y <- y[names]
# Note: Are prior effects positively correlated with correlation between outcome and features?
# cor <- as.numeric(stats::cor(y,X,method="spearman"))
# graphics::plot(x=prior,y=cor,col=ifelse(stats::p.adjust(geno$p_value)<=0.05,"red","black"))
# graphics::abline(h=0,lty=2,col="grey")
# descriptive statistics
sum(p.adjust(pvalue,method="BH")<=0.05)
sum(p.adjust(pvalue,method="holm")<=0.05)
mean(pvalue<=0.05)
dim(X)
table(y)
# memory reduction
cond <- pvalue <= 0.05
X <- X[,cond]
prior <- prior[cond]
pvalue <- pvalue[cond]
switch <- switch[cond]
save(y,X,prior,pvalue,switch,file=paste0(dir,"/data/app_int_data.RData"))
load(paste0(dir,"/data/app_int_data.RData"))
power <- seq(from=-2,to=-10,by=-1)
cutoff <- 5*10^power
frame <- expand.grid(cutoff=cutoff,alpha=0:1,seed=1:10,count=NA)
#- - - sequential (start) - - -
#loss <- list()
#for(i in seq_len(nrow(frame))){
# cat("--- i =",i,"---","\n")
# set.seed(frame$seed[i])
# foldid <- transreg:::.folds(y=y,nfolds.ext=10,nfolds.int=10)
# cond <- switch!=0 & pvalue < frame$cutoff[i]
# loss[[i]] <- transreg:::compare(target=list(y=y,x=X[,cond]),prior=prior[cond],family="binomial",alpha=frame$alpha[i],scale=c("exp","iso"),sign=FALSE,switch=FALSE,select=FALSE,foldid.ext=foldid$foldid.ext,foldid.int=foldid$foldid.int,type.measure=c("deviance","class","auc"),seed=frame$seed[i])
# frame$count[i] <- sum(cond)
#}
# - - - sequential (end) - - -
#- - - parallel (start) - - -
frame <- expand.grid(cutoff=cutoff,alpha=0:1,seed=1:10,count=NA)
cluster <- snow::makeCluster(8)
evaluate <- function(frame,i,switch,pvalue,y,X,prior){
set.seed(frame$seed[i])
foldid <- transreg:::.folds(y=y,nfolds.ext=10,nfolds.int=10)
cond <- switch!=0 & pvalue < frame$cutoff[i]
transreg:::compare(target=list(y=y,x=X[,cond]),prior=prior[cond],family="binomial",alpha=frame$alpha[i],scale=c("exp","iso"),sign=FALSE,switch=FALSE,select=FALSE,foldid.ext=foldid$foldid.ext,foldid.int=foldid$foldid.int,type.measure=c("deviance","class","auc"),seed=frame$seed[i])
}
snow::clusterExport(cl=cluster,list=c("evaluate","frame","switch","pvalue","y","X","prior"))
loss <- snow::parSapply(cl=cluster,X=seq_len(nrow(frame)),FUN=function(i) evaluate(frame=frame,i=i,switch=switch,pvalue=pvalue,y=y,X=X,prior=prior),simplify=FALSE)
snow::stopCluster(cluster)
#- - - parallel (end) - - -
save(frame,loss,file=paste0(dir,"/results/app_int.RData"))
writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),
sessioninfo::session_info()),con=paste0(dir,"/results/info_app_int.txt"))
```
The following chunk generates the figures for the internal application. **Requires execution of previous chunk.**
- input: results/app_int.RData
- execution time: some seconds
- output: manuscript/figure_int.pdf
```{r ncerpd_plot,echo=TRUE}
#<>
plotter <- function(table,cutoff,number,horizontal=FALSE){
graphics::par(mfrow=c(2,2),mar=c(3,1.8,1.0,0.9))
for(scale in c("exp","iso")){
for(alpha in c("0","1")){
graphics::plot.new()
graphics::plot.window(xlim=range(log(cutoff)),ylim=range(table))
graphics::box()
graphics::title(main=paste(ifelse(alpha==0,"ridge",ifelse(alpha==1,"lasso",NA)),"-",scale),cex.main=1,line=0.2)
on <- rep(c(TRUE,FALSE),length.out=length(cutoff))
graphics::axis(side=1,at=log(cutoff),labels=rep("",times=length(on)),cex.axis=0.7)
graphics::axis(side=1,at=log(cutoff)[on],labels=paste0(cutoff[on],"\n","(",number[on],")"),cex.axis=0.7)
graphics::axis(side=2,cex.axis=0.7)
if(horizontal){
graphics::abline(h=0.5,col="grey",lty=2)
#graphics::abline(h=unique(table[["mean"]][,alpha]),col="grey",lty=2)
}
for(i in 1:3){
for(method in c("glmnet",paste0("transreg.",scale,c(".sta",".sim")),"naive")){
lty <- switch(method,"mean"=1,"glmnet"=1,"transreg.exp.sta"=2,"transreg.exp.sim"=2,"transreg.iso.sta"=2,"transreg.iso.sim"=2,"naive"=3)
col <- switch(method,"mean"="grey","glmnet"="black","transreg.exp.sta"=rgb(0.2,0.2,1),"transreg.iso.sta"=rgb(0.2,0.2,1),"transreg.exp.sim"=rgb(0,0,0.6),"transreg.iso.sim"=rgb(0,0,0.6),"naive"="red")
y <- table[[method]][,alpha]
x <- log(as.numeric(names(y)))
if(i==1){graphics::lines(x=x,y=y,col=col,lty=lty)}
if(i==2){graphics::points(x=x,y=y,col="white",pch=16)}
if(i==3){graphics::points(x=x,y=y,col=col)}
}
}
}
}
}
load(paste0(dir,"/data/app_int_data.RData"))
load(paste0(dir,"/results/app_int.RData"))
frame <- frame[seq_along(loss),colnames(frame)!="seed"]
cutoff <- unique(frame$cutoff)
number <- unique(sapply(loss,function(x) x$p))
auc <- as.data.frame(t(sapply(loss,function(x) x$auc)))
table <- lapply(auc,function(x) tapply(X=x,INDEX=list(frame$cutoff,frame$alpha),FUN=function(x) mean(x)))
#grDevices::pdf(file=paste0(dir,"/manuscript/figure_int.pdf"),width=8,height=6,pointsize=15)
#grDevices::png(file=paste0(dir,"/manuscript/figure_int.png"),width=8,height=6,units="in",pointsize=15,res=1200)
grDevices::postscript(file=paste0(dir,"/manuscript/figure_int.eps"),width=8,height=6,pointsize=15,horizontal=FALSE,paper="special")
plotter(table,cutoff,number,horizontal=TRUE)
grDevices::dev.off()
```
This chunk obtains the upper bound of the one-sided $95\%$ confidence interval for the AUC of a random classifier with $766$ controls and $790$ cases (result: $0.524$).
- input: -
- execution time: some seconds
- output: not saved
```{r ncerpd_auc,eval=FALSE,echo=TRUE,fig.cap="Box plots of $10\\,000$ \\textsc{auc}s ($y$-axis) at different sample sizes ($x$-axis), namely either $50+50$ ('small') or $766+790$ ('large'). Each \\textsc{auc} is calculated from a binary outcome and random predicted probabilities (standard uniform distribution). The red lines separate the top 5\\% of the \\textsc{auc}s from the bottom 95\\% of the \\textsc{auc}s."}
#--- empirical computation of confidence interval ---
set.seed(1)
auc <- list()
n <- c("small","large")
for(i in seq_along(n)){
auc[[i]] <- numeric()
for(j in 1:10000){
if(n[i]=="small"){
y <- rep(c(0,1),times=c(50,50))
}
if(n[i]=="large"){
y <- rep(c(0,1),times=c(766,790))
}
x <- stats::runif(n=length(y),min=0,max=1)
auc[[i]][j] <- pROC::auc(response=y,predictor=x,direction="<",levels=c(0,1))
}
}
q <- sapply(auc,function(x) quantile(x,probs=0.95))
graphics::par(mar=c(3.5,3.5,1,1))
graphics::plot.new()
graphics::plot.window(xlim=c(0.5,length(n)+0.5),ylim=range(auc))
graphics::box()
graphics::axis(side=1,at=seq_along(n),labels=n)
graphics::axis(side=2)
for(i in seq_along(n)){
graphics::boxplot(x=auc[[i]],at=i,add=TRUE)
}
graphics::abline(h=0.5)
graphics::title(xlab="sample size",ylab="AUC",line=2.5)
graphics::segments(x0=seq_along(n)-0.2,x1=seq_along(n)+0.2,y0=q,col="red",lwd=2)
graphics::text(x=seq_along(n)-0.2,y=q,labels=round(q,digits=3),pos=2,cex=0.5,col="red")
q
#--- analytical calculation of confidence interval ---
# This code is based on the website "Real Statistics using Excel" by Charles Zaiontz (https://real-statistics.com/descriptive-statistics/roc-curve-classification-table/auc-confidence-interval/).
# Variance of an AUC estimate (Hanley & McNeil, 1982) with n1 and n2 observations per class.
var_AUC <- function(x,n1,n2){
  q1 <- x/(2-x)
  q2 <- 2*x^2/(1+x)
  (x*(1-x)+(n1-1)*(q1-x^2)+(n2-1)*(q2-x^2))/(n1*n2)
}
round(0.5 + stats::qnorm(p=0.95)*sqrt(var_AUC(0.5,n1=50,n2=50)),digits=3)
round(0.5 + stats::qnorm(p=0.95)*sqrt(var_AUC(0.5,n1=766,n2=790)),digits=3)
```
# Further research
Additional code for further research (not included in the manuscript): [see source].
```{r extra_time,eval=FALSE,echo=FALSE}
#--- This code chunk is not included in the manuscript! ---
# Synthetic example with measurement of computation time.
set.seed(1)
n0 <- 100
n1 <- 5
n <- n0+n1
p <- 1000
m <- 3
data <- list()
for(i in 1:(m+1)){
data[[i]] <- list()
data[[i]]$x <- matrix(stats::rnorm(n*p),nrow=n,ncol=p)
data[[i]]$y <- stats::rbinom(n=n,size=1,prob=0.5)
}
source <- data[1:m]
names(source) <- paste0("s",1:m)
target <- data[[m+1]]
prior <- matrix(stats::rnorm(p*m),nrow=p,ncol=m)
alpha <- 1
foldid.ext <- rep(c(0,1),times=c(n0,n1))
nfolds.ext <- 1
time <- transreg:::compare(target=target,source=source,z=abs(prior),family="binomial",alpha=alpha,xrnet=TRUE,foldid.ext=foldid.ext,nfolds.ext=nfolds.ext,alpha.prior=alpha)$time
# As transreg::compare performs exponential and isotonic calibration, we also train transreg with the default parameters.
start <- Sys.time()
test <- transreg::transreg(y=target$y[seq_len(n0)],X=target$x[seq_len(n0),],prior=prior,family="binomial",alpha=alpha)
end <- Sys.time()
time["transreg.single"] <- difftime(time1=end,time2=start,units="secs")
paste0(paste0(names(time),": ",round(time/time["glmnet"],digits=2)),collapse=", ")
```
```{r extra_xrnet,echo=TRUE,eval=FALSE}
#--- This code chunk is not included in the manuscript! ---
# The following chunk performs the additional simulation study with either linearly or non-linearly related prior effects for the comparison of transreg and xrnet.
set.seed(1)
temp <- matrix(data=NA,nrow=10,ncol=2,dimnames=list(NULL,c("transreg","xrnet")))
mse <- list(linear=temp,nonlinear=temp)
for(i in c("linear","nonlinear")){
for(j in 1:10){
# simulate data
n0 <- 100; n1 <- 10000; n <- n0 + n1; p <- 2000
X <- matrix(stats::rnorm(n*p),nrow=n,ncol=p)
beta <- stats::rnorm(n=p)*stats::rbinom(n=p,size=1,prob=0.05)
y <- stats::rnorm(n=n,mean=X %*% beta,sd=1) #sd(X %*% beta)
# relation between prior effects and true effects
temp <- beta + stats::rnorm(n=p)*stats::rbinom(n=p,size=1,prob=0.01)
# temp <- beta + stats::rnorm(p,sd=0.1) # for comparison
if(i=="linear"){
prior <- temp
} else if(i=="nonlinear"){
prior <- sign(temp)*abs(temp)^2
}
# hold-out
y_hat <- list()
foldid <- rep(c(0,1),times=c(n0,n1))
# transfer learning with transreg
model <- transreg::transreg(y=y[foldid==0],X=X[foldid==0,],prior=prior)
y_hat$transreg <- predict(model,newx=X[foldid==1,])
# transfer learning with xrnet
model <- xrnet::tune_xrnet(x=X[foldid==0,],y=y[foldid==0],external=as.matrix(prior,ncol=1))
y_hat$xrnet <- stats::predict(model,newdata=X[foldid==1,])
# mean squared error (MSE)
mse[[i]][j,] <- sapply(y_hat,function(x) mean((x-y[foldid==1])^2))
}
}
# linear scenario
sum(mse$linear[,"xrnet"] < mse$linear[,"transreg"])
stats::wilcox.test(x=mse$linear[,"xrnet"],y=mse$linear[,"transreg"],paired=TRUE)$p.value
# non-linear scenario
sum(mse$nonlinear[,"transreg"] < mse$nonlinear[,"xrnet"])
stats::wilcox.test(x=mse$nonlinear[,"transreg"],y=mse$nonlinear[,"xrnet"],paired=TRUE)$p.value
```
```{r extra_devel,eval=FALSE,echo=FALSE}
#--- This code chunk is under development! ---
# - - - multi-split test - - -
tester <- function(y,x){
foldid <- x$foldid
pred <- x$pred
limit <- 1e-06
pred[pred < limit] <- limit
pred[pred > 1 - limit] <- 1 - limit
res <- -2 * (y * log(pred) + (1 - y) * log(1 - pred))
method <- paste0("transreg.",c("exp.sta","exp.sim","iso.sta","iso.sim"))
pvalue <- matrix(NA,nrow=10,ncol=length(method),dimnames=list(NULL,method))
for(i in seq_len(10)){
for(j in seq_along(method)){
pvalue[i,j] <- stats::wilcox.test(x=res[foldid==i,"glmnet"],y=res[foldid==i,method[j]],paired=TRUE,alternative="greater")$p.value
}
}
return(pvalue)
}
pvalue <- lapply(loss,function(x) tester(y=y,x=x))
alpha <- c(0,1)
method <- paste0("transreg.",c("exp.sta","exp.sim","iso.sta","iso.sim"))
median <- array(NA,dim=c(length(alpha),length(cutoff),length(method)),dimnames=list(alpha,cutoff,method))
for(i in seq_along(alpha)){
for(j in seq_along(cutoff)){
median[i,j,] <- apply(do.call(what="rbind",args=pvalue[frame$alpha==alpha[i] & frame$cutoff==cutoff[j]]),2,median)
}
}
1*(median<=0.05)
#- - - estimation stability - - -
stability <- function(x,mode="cor"){
if(mode=="cor"){
if(all(is.na(x))){
return(1)
}
if(sd(x,na.rm=TRUE)==0){
return(1)
}
cor <- stats::cor(x,method="spearman")
cor[is.na(cor)] <- 0
diag(cor) <- NA
index <- median(cor,na.rm=TRUE)
} else if(mode=="rank"){
top_sep <- apply(x,2,function(x) order(abs(x),decreasing=TRUE)[1:10])
sign_sep <- matrix(sign(x)[cbind(as.numeric(top_sep),rep(1:ncol(x),each=10))],nrow=10,ncol=10)
top_all <- order(abs(rowMeans(x)),decreasing=TRUE)[1:10]
sign_all <- sign(rowMeans(x))[top_all]
sep <- top_sep * sign_sep
all <- top_all * sign_all
index <- mean(apply(sep,2,function(x) mean(x %in% all)))
}
return(index)
}
value <- as.data.frame(t(sapply(loss,function(x) sapply(x$coef,function(x) stability(x,mode="rank")))))
table <- lapply(value,function(x) tapply(X=x,INDEX=list(frame$cutoff,frame$alpha),FUN=function(x) mean(x)))
plotter(table,cutoff,number)
```
# Acknowledgements
Reformat list of consortium members for acknowledgements: [see source].
```{r names,eval=FALSE,echo=FALSE}
# code for reformatting list of consortium members
list <- "Alexander HUNDT 2, Alexandre BISDORFF 5, Amir SHARIFY 2, Anne GRÜNEWALD 1, Anne-Marie HANFF 2, Armin RAUSCHENBERGER 1, Beatrice NICOLAI 3, Brit MOLLENHAUER 12, Camille BELLORA 2, Carlos MORENO 1, Chouaib MEDIOUNI 2, Christophe TREFOIS 1, Claire PAULY 1,3, Clare MACKAY 10, Clarissa GOMES 1, Daniela BERG 11, Daniela ESTEVES 2, Deborah MCINTYRE 2, Dheeraj REDDY BOBBILI 1, Eduardo ROSALES 2, Ekaterina SOBOLEVA 1, Elisa GÓMEZ DE LOPE 1, Elodie THIRY 3, Enrico GLAAB 1, Estelle HENRY 2, Estelle SANDT 2, Evi WOLLSCHEID-LENGELING 1, Francoise MEISCH 1, Friedrich MÜHLSCHLEGEL 4, Gaël HAMMOT 2, Geeta ACHARYA 2, Gelani ZELIMKHANOV 3, Gessica CONTESOTTO 2, Giuseppe ARENA 1, Gloria AGUAYO 2, Guilherme MARQUES 2, Guy BERCHEM 3, Guy FAGHERAZZI 2, Hermann THIEN 2, Ibrahim BOUSSAAD 1, Inga LIEPELT 11, Isabel ROSETY 1, Jacek JAROSLAW LEBIODA 1, Jean-Edouard SCHWEITZER 1, Jean-Paul NICOLAY 19, Jean-Yves FERRAND 2, Jens SCHWAMBORN 1, Jérôme GRAAS 2, Jessica CALMES 2, Jochen KLUCKEN 1,2,3, Johanna TROUET 2, Kate SOKOLOWSKA 2, Kathrin BROCKMANN 11, Katrin MARCUS 13, Katy BEAUMONT 2, Kirsten RUMP 1, Laura LONGHINO 3, Laure PAULY 1, Liliana VILAS BOAS 3, Linda HANSEN 1,3, Lorieza CASTILLO 2, Lukas PAVELKA 1,3, Magali PERQUIN 2, Maharshi VYAS 1, Manon GANTENBEIN 2, Marek OSTASZEWSKI 1, Margaux SCHMITT 2, Mariella GRAZIANO 17, Marijus GIRAITIS 2,3, Maura MINELLI 2, Maxime HANSEN 1,3, Mesele VALENTI 2, Michael HENEKA 1, Michael HEYMANN 2, Michel MITTELBRONN 1,4, Michel VAILLANT 2, Michele BASSIS 1, Michele HU 8, Muhammad ALI 1, Myriam ALEXANDRE 2, Myriam MENSTER 2, Nadine JACOBY 18, Nico DIEDERICH 3, Olena TSURKALENKO 2, Olivier TERWINDT 1,3, Patricia MARTINS CONDE 1, Patrick MAY 1, Paul WILMES 1, Paula Cristina LUPU 2, Pauline LAMBERT 2, Piotr GAWRON 1, Quentin KLOPFENSTEIN 1, Rajesh RAWAL 1, Rebecca TING JIIN LOO 1, Regina BECKER 1, Reinhard SCHNEIDER 1, Rejko KRÜGER 1,2,3, Rene DONDELINGER 5, Richard WADE-MARTINS 9, Robert LISZKA 14, Romain NATI 3, Rosalina RAMOS LIMA 2, Roseline LENTZ 7, Rudi BALLING 1, Sabine SCHMITZ 1, Sarah NICKELS 1, Sascha HERZINGER 1, Sinthuja PACHCHEK 1, Soumyabrata GHOSH 1, Stefano SAPIENZA 1, Sylvia HERBRINK 6, Tainá MARQUES 1, Thomas GASSER 11, Ulf NEHRBASS 2, Valentin GROUES 1, Venkata SATAGOPAM 1, Victoria LORENTZ 2, Walter MAETZLER 15, Wei GU 1, Wim AMMERLANN 2, Yohan JAROZ 1, Zied LANDOULSI 1"
list <- strsplit(list,split=", ")[[1]]
number <- gsub(pattern="[^0-9,]",replacement="",x=list)
names <- gsub(pattern="[0-9,]",replacement="",x=list)
for(i in seq_along(names)){
if(substring(text=names[i],first=1,last=1)==" "){
names[i] <- substring(text=names[i],first=2,last=nchar(names[i]))
}
if(substring(text=names[i],first=nchar(names[i]),last=nchar(names[i]))==" "){
names[i] <- substring(text=names[i],first=1,last=nchar(names[i])-1)
}
}
names <- strsplit(names,split=" ")
for(i in seq_along(names)){
cond <- grepl(pattern="[a-z]",x=names[[i]])
first <- paste0(names[[i]][cond],collapse=" ")
last <- paste0(tolower(names[[i]][!cond]),collapse=" ")
names[[i]] <- paste0(first," \\textsc{",last,"}","$^{",number[i],"}$")
}
paste0(names,collapse=", ")
inst <- "1 Luxembourg Centre for Systems Biomedicine, University of Luxembourg, Esch-sur-Alzette, Luxembourg; 2 Luxembourg Institute of Health, Strassen, Luxembourg; 3 Centre Hospitalier de Luxembourg, Strassen, Luxembourg; 4 Laboratoire National de Santé, Dudelange, Luxembourg; 5 Centre Hospitalier Emile Mayrisch, Esch-sur-Alzette, Luxembourg; 6 Centre Hospitalier du Nord, Ettelbrück, Luxembourg; 7 Parkinson Luxembourg Association, Leudelange, Luxembourg; 8 Oxford Parkinson's Disease Centre, Nuffield Department of Clinical Neurosciences, University of Oxford, Oxford, UK; 9 Oxford Parkinson's Disease Centre, Department of Physiology, Anatomy and Genetics, University of Oxford, Oxford, UK; 10 Oxford Centre for Human Brain Activity, Wellcome Centre for Integrative Neuroimaging, Department of Psychiatry, University of Oxford, Oxford, UK; 11 Center of Neurology and Hertie Institute for Clinical Brain Research, Department of Neurodegenerative Diseases, University Hospital Tübingen, Tübingen, Germany; 12 Paracelsus-Elena-Klinik, Kassel, Germany; 13 Ruhr-University of Bochum, Bochum, Germany; 14 Westpfalz-Klinikum GmbH, Kaiserslautern, Germany; 15 Department of Neurology, University Medical Center Schleswig-Holstein, Kiel, Germany; 16 Department of Neurology Philipps, University Marburg, Marburg, Germany; 17 Association of Physiotherapists in Parkinson's Disease Europe, Esch-sur-Alzette, Luxembourg; 18 Private practice, Ettelbruck, Luxembourg; 19 Private practice, Luxembourg, Luxembourg"
list <- strsplit(inst,split="; ")[[1]]
number <- gsub(pattern="[^0-9]",replacement="",x=list)
name <- gsub(pattern="[0-9]",replacement="",x=list)
name <- substring(name,first=2,last=nchar(name))
paste(paste0("$^{",number,"}$",name),collapse=", ")
```
# Session information
Print session information.
```{r sessionInfo,echo=FALSE,results="asis"}
info <- devtools::session_info()
knitr::kable(data.frame(setting=names(info[[1]]),value=unlist(info[[1]]),row.names=NULL))
knitr::kable(info[[2]][,c("package","loadedversion","date","source")])
```