## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----echo=TRUE, include=TRUE, warning=FALSE, message=FALSE--------------------
library(RegrCoeffsExplorer)
library(gridExtra)
library(glmnet)
library(selectiveInference)
library(dplyr)
library(faraway)
library(MASS)
library(psych)
library(ggplot2)
library(rlang)

## -----------------------------------------------------------------------------
# help(mtcars)
df_mtcars=as.data.frame(mtcars)
df_mtcars[c("cyl","vs","am","gear")] = lapply(df_mtcars[c("cyl","vs","am","gear")],
                                              factor) # convert to factor
head(df_mtcars)

lm_object=lm(mpg~cyl+hp+wt+disp+vs+am+carb, data=df_mtcars)
summary(lm_object)

## ----fig.height=6,fig.width=12, warning=F--------------------------------------
grid.arrange(vis_reg(lm_object)$"SidebySide")

## ----fig.height=6,fig.width=7, warning=F---------------------------------------
vis_reg(lm_object, intercept=T)$"PerUnitVis"

## ----fig.height=6,fig.width=7, warning=F---------------------------------------
vis_reg(lm_object, intercept=T, CI=T)$"PerUnitVis"

## ----fig.height=6,fig.width=12, warning=F--------------------------------------
grid.arrange(vis_reg(
  lm_object,
  CI=T,
  palette=c("palegreen4","tomato1"),
  eff_size_diff=c(1,5),
  title=c(
    "Regression - Unit Change",
    "Regression - Effective Change (min --> max)"
  )
)$"SidebySide"
)

## ----fig.height=6,fig.width=7, warning=F---------------------------------------
# obtain coefficients for vs and wt
vline1=lm_object$coefficients['vs1'][[1]]
vline2=lm_object$coefficients['wt'][[1]]

vis_reg(lm_object)$"PerUnitVis"+
  geom_hline(yintercept=vline1, linetype="dashed", color = "blue", size=1)+  # add a vertical line
  geom_hline(yintercept=vline2, linetype="dashed", color = "orange", size=1)+
  ggtitle("Visualization of Regression Results (per unit change)")+
  ylim(-5,5)+                                     # note the coordinate flip
  xlab("aspects")+
  ylab("coefficients")+
  theme_bw()+
  scale_fill_manual(values = c("black","pink"))+  # change mappings
  theme(plot.title = element_text(hjust = 0.5))   # place the title in the center

## -----------------------------------------------------------------------------
# ?hsb
glm_object=glm(
  I(prog == "academic") ~ gender + math + read + write + science + socst,
  family = binomial(link="logit"),
  data = faraway::hsb)

summary(glm_object)

## ----fig.height=6,fig.width=12, warning=F--------------------------------------
grid.arrange(vis_reg(
  glm_object,
  CI=T,
  alpha=0.01
)$"SidebySide"
)

## -----------------------------------------------------------------------------
df_glmnet=data.frame(Cars93)
df_glmnet[sample(dim(df_glmnet)[1], 5), ] # examine 5 randomly selected rows
levels(df_glmnet$Origin)                  # check level attributes
df_glmnet=df_glmnet %>% mutate(MPG.avg = (MPG.city + MPG.highway) / 2) # calculate average MPG

## ----fig.height=6,fig.width=7, warning=F---------------------------------------
y_lasso=df_glmnet$Price

x_lasso=model.matrix(
  as.formula(paste("~",
                   paste(c("MPG.avg","Horsepower","RPM","Wheelbase",
                           "Passengers","Length", "Width", "Weight",
                           "Origin","Man.trans.avail"),
                         collapse = "+"),
                   sep = "")),
  data=df_glmnet
)

x_lasso = x_lasso[, -1] # remove intercept

ndim_lasso=dim(x_lasso)[1]

cv_model_lasso = cv.glmnet(x_lasso, y_lasso, family="gaussian", alpha=1) # LASSO regression

# extract the value of lambda that gives the minimum mean cross-validated error
best_lambda_lasso = cv_model_lasso$lambda.min

plot(cv_model_lasso)

best_model_lasso = glmnet(x_lasso, y_lasso, family="gaussian", alpha=1,
                          lambda=best_lambda_lasso)

coefficients(best_model_lasso)
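# A brief aside (a hedged sketch, not part of the original workflow): cv.glmnet
# also stores 'lambda.1se', the largest lambda whose cross-validated error lies
# within one standard error of the minimum; refitting with it typically yields a
# sparser model. 'model_lasso_1se' is an illustrative name introduced here.
model_lasso_1se = glmnet(x_lasso, y_lasso, family="gaussian", alpha=1,
                         lambda=cv_model_lasso$lambda.1se)
coefficients(model_lasso_1se)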
## ----fig.height=8,fig.width=8, warning=F---------------------------------------
df_glmnet_num=df_glmnet %>% select_if(function(x) is.numeric(x))
cols_to_select = c("MPG.avg","Horsepower","RPM","Wheelbase","Passengers",
                   "Length", "Width", "Weight")
# use dplyr::select explicitly, since MASS (loaded above) masks dplyr's select()
df_glmnet_num=df_glmnet_num %>% dplyr::select(all_of(cols_to_select))

corPlot(df_glmnet_num, xlas=2)

## ----fig.height=6,fig.width=12, warning=F--------------------------------------
grid.arrange(vis_reg(best_model_lasso, eff_size_diff=c(1,3),        # Q2 - minimum
                     glmnet_fct_var="Originnon-USA")$"SidebySide")  # note the naming pattern for categorical variables

## ----fig.height=6,fig.width=12, warning=F--------------------------------------
plt_1=vis_reg(best_model_lasso, eff_size_diff=c(1,3),
              glmnet_fct_var="Originnon-USA")$"PerUnitVis"+
  ggtitle("Visualization of CV.GLMNET Results (per unit change)")+
  ylim(-4,4)+
  xlab("Car characteristics")+
  ylab("LASSO coefficients")+
  theme_bw()+
  scale_fill_manual(values = c("red","whitesmoke"))+
  theme(plot.title = element_text(hjust = 0.5))

plt_2=vis_reg(best_model_lasso, eff_size_diff=c(1,3),
              glmnet_fct_var="Originnon-USA")$"RealizedEffectVis"+
  ggtitle("Visualization of CV.GLMNET Results (effective: min --> Q2)")+
  ylim(-15,15)+
  xlab("Car characteristics")+
  ylab("LASSO coefficients")+
  theme_bw()+
  scale_fill_manual(values = c("maroon1","palegreen1"))+
  theme(plot.title = element_text(hjust = 0.5))

plt_3=arrangeGrob(plt_1, plt_2, nrow=1, widths = c(1,1))
grid.arrange(plt_3)

## -----------------------------------------------------------------------------
# ?jasa
heart_df=as.data.frame(survival::jasa)
heart_df_filtered = heart_df %>% filter(!rowSums(is.na(.))) # remove rows containing NA values

# check the last 6 rows of the data frame
tail(heart_df_filtered)

## ----fig.height=6,fig.width=12, warning=F--------------------------------------
# the filtered data contains only patients who received a transplant
sum(heart_df_filtered$transplant!=1)

# mismatch scores are weakly correlated
print('Correlation between mismatch scores:')
cor(heart_df_filtered$mscore, heart_df_filtered$mismatch)

# if rejection occurs, death is certain, at least in this data set
heart_cont_table=table(heart_df_filtered$reject, heart_df_filtered$fustat)
dimnames(heart_cont_table) = list(
  Reject = c("No", "Yes"),
  Status = c("Alive", "Deceased")
)
heart_cont_table

# 'age' is a skewed variable with a wide range
paste("Range of 'age' variable is:", diff(range(heart_df_filtered$age)))

old_par = par(no.readonly = TRUE) # save only the graphical parameters that can be restored
par(mfrow=c(2,2))
hist(heart_df_filtered$age, main="Histogram of Age", xlab="age")
boxplot(heart_df_filtered$age, main="Boxplot of Age", ylab="age")
hist(sqrt(heart_df_filtered$age), main="Histogram of transformed data", xlab="Sqrt(age)")
boxplot(sqrt(heart_df_filtered$age), main="Boxplot of transformed data", ylab="Sqrt(age)")
par(old_par)

## -----------------------------------------------------------------------------
# observe that the 'age' variable is not rounded;
# it is calculated in the following manner
age_calc_example=difftime(heart_df_filtered$accept.dt,
                          heart_df_filtered$birth.dt, units = "days")/365.25

# check the first calculated value
age_calc_example[1]==heart_df_filtered[1,]$age

# check a randomly selected value
n_samp=sample(dim(heart_df_filtered)[1], 1)
age_calc_example[n_samp]==heart_df_filtered[n_samp,]$age

# check the summary statistics
heart_df_filtered$age %>% summary()

# check the summary statistics for data rounded down to the nearest integer
heart_df_filtered$age %>% floor() %>% summary()
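# A quick numeric companion to the plots above (a hedged sketch, not in the
# original workflow): 'skew()' from the already-loaded 'psych' package
# quantifies how much the square-root transform, applied in the next chunk,
# changes the skewness of 'age'.
skew(heart_df_filtered$age)
skew(sqrt(heart_df_filtered$age))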
## -----------------------------------------------------------------------------
# the 'reject' categorical variable is not included, for the reason stated previously
heart_df_filtered = heart_df_filtered %>%
  mutate(across(all_of(c("surgery")), as.factor))

# apply the 'sqrt()' transformation to the 'age' variable
heart_df_filtered$sqrt.age=sqrt(heart_df_filtered$age)

y_heart=heart_df_filtered$fustat
x_heart=model.matrix(as.formula(paste("~",
                                      paste(c("sqrt.age","mismatch","mscore","surgery"),
                                            collapse = "+"),
                                      sep = "")),
                     data=heart_df_filtered)
x_heart=x_heart[, -1]
x_heart_orig=x_heart # save the original data set
x_heart=scale(x_heart,T,T)

gfit_heart = cv.glmnet(x_heart, y_heart, standardize=F, family="binomial")
lambda_heart=gfit_heart$lambda.min
n_heart=dim(x_heart)[1]

beta_hat_heart=coef(gfit_heart, x=x_heart, y=y_heart, s=lambda_heart, exact=T)

# note that lambda should be multiplied by the number of rows
out_heart = fixedLassoInf(x_heart, y_heart, beta_hat_heart, lambda_heart*n_heart,
                          family="binomial")

# check the output
out_heart

# note the class
class(out_heart)

## -----------------------------------------------------------------------------
# back-transformation logic
x_heart_reconstructed = t(apply(x_heart, 1,
                                function(x) x*attr(x_heart,'scaled:scale') +
                                  attr(x_heart, 'scaled:center')))
# check
all.equal(x_heart_orig, x_heart_reconstructed)

# same via a function
x_heart_reconstructed.2=detransform(x_heart)
all.equal(x_heart_orig, x_heart_reconstructed.2)

## ----fig.height=6,fig.width=12, warning=F--------------------------------------
grid.arrange(vis_reg(out_heart, CI=T, glmnet_fct_var=c("surgery1"),
                     round_func="none", eff_size_diff=c(1,3))$"SidebySide"
)
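# A short aside (a hedged sketch, not required by vis_reg): because 'x_heart'
# was standardized before fitting, the selective confidence intervals in
# 'out_heart$ci' are on the standardized scale; dividing each interval by the
# corresponding column standard deviation (the same rescaling idiom used for
# penalty factors below) expresses it per original unit of the predictor.
sd_heart = attr(x_heart, 'scaled:scale')
t(scale(t(out_heart$ci), FALSE, sd_heart[out_heart$vars]))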
## -----------------------------------------------------------------------------
set.seed(43)
n = 50
p = 10
sigma = 1

x = matrix(rnorm(n*p), n, p)
x = scale(x, TRUE, TRUE)

beta = c(3, 2, rep(0, p-2))
y = x%*%beta + sigma*rnorm(n)

pf = c(rep(1,7), rep(.1,3)) # define penalty factors
pf = p*pf/sum(pf)           # penalty factors should be rescaled so they sum to p
xs = scale(x, FALSE, pf)    # scale cols of x by penalty factors

# first run glmnet
gfit = glmnet(xs, y, standardize=FALSE)

# extract coef for a given lambda; note the 1/n factor!
# (and we don't save the intercept term)
lambda = .8
beta_hat = coef(gfit, x=xs, y=y, s=lambda/n, exact=TRUE)[-1]

# compute fixed lambda p-values and selection intervals
out = fixedLassoInf(xs, y, beta_hat, lambda, sigma=sigma)

# rescale conf points to undo the penalty factor
out$ci = t(scale(t(out$ci), FALSE, pf[out$vars]))
out

## -----------------------------------------------------------------------------
pf_heart=c(0.3, 0.1, 0.1, 0.1)
p_l=length(pf_heart)
pf_heart=p_l*pf_heart/sum(pf_heart)
xs_heart_res=scale(x_heart, FALSE, pf_heart) # note that the data is being scaled again

gfit_heart_pef_fac_res = cv.glmnet(xs_heart_res, y_heart, standardize=FALSE,
                                   family="binomial")
lambda_heart_pef_fac_res=gfit_heart_pef_fac_res$lambda.min

beta_hat_heart_res=coef(gfit_heart_pef_fac_res, x=xs_heart_res, y=y_heart,
                        s=lambda_heart_pef_fac_res, exact=F)

out_heart_res = fixedLassoInf(xs_heart_res, y_heart, beta_hat_heart_res,
                              lambda_heart_pef_fac_res*n_heart, family="binomial")

out_heart_res$ci=t(scale(t(out_heart_res$ci), FALSE, pf_heart[out_heart_res$vars]))
out_heart_res

## ----fig.height=6,fig.width=12, warning=F--------------------------------------
x_heart_test_3=detransform(xs_heart_res, attr_center=NULL)
x_heart_test_3=detransform(x_heart_test_3,
                           attr_scale=attr(x_heart, 'scaled:scale'),
                           attr_center=attr(x_heart, 'scaled:center')
)
# check
all.equal(x_heart_test_3, x_heart_orig)

## ----fig.height=6,fig.width=12,warning=F---------------------------------------
# note that case_penalty=T and x_data_orig must be specified
# effective change between Q1(2) and max(5)
grid.arrange(vis_reg(out_heart_res, CI=T, glmnet_fct_var=c("surgery1"),
                     case_penalty=T, x_data_orig=x_heart_orig,
                     eff_size_diff=c(2,5))$"SidebySide")
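# A closing illustration (a hedged sketch): assuming eff_size_diff indexes the
# five-number summary of each predictor (1=min, 2=Q1, 3=median, 4=Q3, 5=max),
# as the comments above suggest, the realized change plotted for 'sqrt.age'
# corresponds approximately to the gap between its Q1 and maximum on the
# original (unscaled) data.
quantile(x_heart_orig[, "sqrt.age"], probs = c(0.25, 1))
diff(quantile(x_heart_orig[, "sqrt.age"], probs = c(0.25, 1)))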