1 Goals

Lay out some reproducible code for doing lasso, group lasso, and calculate permuation p-values for coefficient estimates, using tidy functions and code throughout. Also, some pretty visualizations.

# load relevant libraries for this tutorial
library(tidyverse)
library(tidymodels)
library(glmnet)
library(reshape2)
library(gt)

theme_set(theme_bw()) #setting ggplot theme 

2 Data

Load and prepare your data, do some basic summary like table 1 and maybe heatmap of values and/or correlations.

This data, included in the glmnet package, has binary outcome and continuous variables.

# load example data from glmnet package
data("BinomialExample")

# making variable names for this example data since it didn't come with them
test_list <- c()
for (i in 1:30){
  test_list[i] <- paste0("var", i)
}

colnames(BinomialExample$x) <- test_list

# reducing number of variables for demonstration purposes
#BinomialExample$x <- BinomialExample$x[,c(1:20)]

head(BinomialExample$x)
##            var1       var2        var3       var4       var5       var6
## [1,] -0.6192614 0.01624409 -0.62606831  0.4126846  0.4944374 -0.4493269
## [2,]  1.0942728 0.47257285 -1.33714704 -0.6405813  0.2823199 -0.6093321
## [3,] -0.3567040 0.30121334  0.19056192  0.2340268  0.1698086  1.2291427
## [4,] -2.4690701 2.84771447  1.66024352  1.5688130 -0.8330570 -0.5620088
## [5,]  0.5672885 0.88888747 -0.01158671  0.5762753 -0.8689453 -0.3132571
## [6,]  0.9129254 0.77350086  0.55836355 -0.5350992  0.3507093 -0.5763021
##            var7        var8       var9      var10      var11      var12
## [1,]  0.6760053 -0.06771419 -1.4747031  1.0759178  0.5706039 -0.0986473
## [2,]  0.3547232 -0.62686515 -0.1157266 -0.3127071  0.4138668 -1.8716997
## [3,]  1.1628095  0.88024242  0.7616431 -1.1808041 -0.4876853  0.4565005
## [4,] -0.6142455 -1.76529838  0.3902346  1.7940062  0.4644753  0.5168103
## [5,]  0.6902907 -1.29961200 -0.9673640 -0.2732322 -0.5239595 -0.8215918
## [6,] -0.3882672  0.55518663  0.3569562  1.3629640 -0.7819931  0.6053378
##           var13      var14      var15       var16      var17      var18
## [1,]  1.2099853 -1.0672363  0.3432912 -0.02255467 -0.1020671  0.1413962
## [2,]  0.1640714  0.7721060 -0.8625086  1.46205409 -0.4369797 -1.0405764
## [3,]  0.6450245  1.7527617  0.4900347  2.27150850 -0.7340858  0.9766313
## [4,] -1.0808010 -0.2980940 -0.5849256 -1.35886288 -0.6594930 -1.1992441
## [5,]  0.5460573 -0.5151076 -0.1362184 -1.11128459 -0.9784008 -1.3778476
## [6,] -0.4721640  1.2556097 -1.1859365 -2.60167966 -0.3492289  0.1621008
##           var19       var20      var21      var22       var23      var24
## [1,] -0.5493577 -0.20168028  0.4566007  0.8072802  0.01473546 -1.1160968
## [2,] -0.6027103 -0.32056948  0.2124757  0.6129102 -0.81898245  2.9037142
## [3,] -0.3584639 -0.08005161  1.2452957  1.0118766  0.03927982  1.0172751
## [4,]  0.2016217  0.50707911  0.6727465 -0.4251484 -0.98728378 -0.4012739
## [5,] -1.3516700  0.95926166  0.4919155 -1.3400220 -0.13288456  0.2197023
## [6,] -0.8703520 -1.74848438 -0.2026351  1.7068849 -1.51250049  1.1769298
##            var25       var26      var27       var28      var29      var30
## [1,] -0.01587617  1.14549082  0.9417529  1.43715303 -0.9278330  0.9080937
## [2,] -0.15783099 -2.10469356  0.6408442  0.01236235  0.4811105 -2.0492817
## [3,]  0.29705289  0.40180292  1.9246469  0.31108274  0.8983029 -1.2089006
## [4,] -2.06615311 -0.55248745 -1.3790266  0.09518939 -0.3897106  1.7796744
## [5,]  0.78974655 -0.14745205 -0.8018895  0.77559612  1.1184595 -1.1970731
## [6,]  1.66830193  0.06319776  0.2907151 -0.73832848 -0.4088768 -1.0545311
head(BinomialExample$y)
## [1] 0 1 1 0 1 0

3 Fit Observed Data

Use usual cross-validation procedure to find best lambda for lasso model.

cvfit <- cv.glmnet(BinomialExample$x, BinomialExample$y, 
                 family = "binomial",
                 type.measure = "class")

plot(cvfit)

coef(cvfit, s="lambda.min")
## 31 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept)  0.228147565
## var1         .          
## var2         0.435137211
## var3        -0.350315533
## var4        -0.884181276
## var5        -0.103249655
## var6        -0.660381219
## var7         .          
## var8        -0.364938882
## var9         0.480723929
## var10       -1.028074339
## var11       -0.011111029
## var12        .          
## var13        .          
## var14        .          
## var15        .          
## var16        0.117021589
## var17        .          
## var18        .          
## var19        .          
## var20        .          
## var21        .          
## var22        0.153471053
## var23        0.214900782
## var24        .          
## var25        0.440629631
## var26       -0.251871772
## var27       -0.003211245
## var28        0.112238670
## var29       -0.128779957
## var30        .

If you want, use some tidyverse functions to extract and present the results.

# get some tidy results from cv.glmnet
tidied_cv <- tidy(cvfit)
glance_cv <- glance(cvfit)

tidied <- tidy(cvfit$glmnet.fit)

# and plot them (but not the intercept)
tidied %>%
  filter(term != "(Intercept)") %>%
ggplot( aes(lambda, estimate, group = term, color = term)) +
  scale_x_log10() +
  geom_line() +
  geom_vline(xintercept = glance_cv$lambda.min)  # vertical line for min lambda

4 Permutation p-value calculation for estimates

Since cv.glmnet does not provide p-values for estimates, you can do a permutation test to get empiric p-values.

In this procedure, we will scramble the y vector a set number of times (say, 1000), do the cv.glmnet() function with the scrambled outcome, and extract the estimates from the model.

Then, we will compare those estimates and calculate a p-value for each variable by summing the number of permuted estimates which are as or more extreme as the observed estimate, and dived by number of permutations.

Then we will adjust for multiple comparisons and present a final table of results.

# this function will take your input of x and y and return tidy results from the minimum lambda cv result
get_cv_result <- function(x, y){
  cvfit <- cv.glmnet(x, y, 
                 family = "binomial",
                 type.measure = "class")
  tidied_cv <- tidy(cvfit)
  glance_cv <- glance(cvfit)
  
  tidied <- tidy(cvfit$glmnet.fit)
  
  allvars <- data.frame(term = colnames(x))
  
  tidied_min <- tidied %>%
    filter(lambda == cvfit$lambda.min) %>%
    right_join(allvars, by = "term") %>% #join to make sure you don't drop vars that went to zero in lasso estimation
    select(term, estimate) %>%
    mutate(term = factor(term, levels= str_sort(term, numeric=TRUE))) %>%
    arrange(term) %>%
    replace(is.na(.), 0)
  return(tidied_min)
}
# get tidy result from observed data
observed_result <- get_cv_result(BinomialExample$x, BinomialExample$y)

# specify variable names and number of permutations
variable_names <- colnames(BinomialExample$x)
num_permutations = 100

# set seed for reproducibility
set.seed(1219)

#set up loop for permutation results
perm_results<-vector('list',num_permutations)
perm_results_bigger <- vector('list',num_permutations)
for(i in 1:num_permutations){
  perm_y <- sample(BinomialExample$y)
  res <- get_cv_result(BinomialExample$x, perm_y)
  test <- left_join(observed_result, res, 
                  by = "term",
                  suffix = c(".obs", ".perm")) %>%
    # calculating if permuation estimate is greater than or equal to observed estimate
    mutate(bigger = as.numeric(abs(estimate.perm) >= abs(estimate.obs)))
  perm_results[[i]]<-res$estimate 
  perm_results_bigger[[i]] <- test$bigger
}
#make nice dataframe from results to present them
final_results <- bind_cols(perm_results_bigger)  %>%
  mutate(sum = rowSums(across(where(is.numeric))),
         # calculate p-value which is sum of times that permuted val is >= observed value, divided by number of permutations
         perm_pval = sum/num_permutations,
         term = observed_result$term,
         estimate = round(observed_result$estimate, 3)) %>%
  select(term, estimate, perm_pval) %>%
  # and if you want to adjust for multiple testing you can do FDR correction like this, just specify the method. Here I'm using Benjamini Hochberg 
  mutate(
    qval = round(p.adjust(perm_pval, method = "BH"),2)
  )

# use gt to display results in a table
gt::gt(final_results)
term estimate perm_pval qval
var1 0.000 1.00 1.00
var2 0.496 0.01 0.04
var3 -0.411 0.02 0.07
var4 -0.953 0.00 0.00
var5 -0.123 0.14 0.30
var6 -0.739 0.00 0.00
var7 0.000 1.00 1.00
var8 -0.412 0.01 0.04
var9 0.566 0.00 0.00
var10 -1.137 0.00 0.00
var11 -0.017 0.18 0.36
var12 -0.006 0.24 0.40
var13 0.000 1.00 1.00
var14 0.000 1.00 1.00
var15 0.000 1.00 1.00
var16 0.185 0.08 0.22
var17 0.000 1.00 1.00
var18 0.000 1.00 1.00
var19 0.000 1.00 1.00
var20 0.000 1.00 1.00
var21 0.000 1.00 1.00
var22 0.168 0.13 0.30
var23 0.255 0.06 0.18
var24 0.000 1.00 1.00
var25 0.507 0.01 0.04
var26 -0.276 0.04 0.13
var27 -0.038 0.21 0.37
var28 0.150 0.19 0.36
var29 -0.158 0.13 0.30
var30 0.000 1.00 1.00

5 Histogram of results

df <- data.frame(t(bind_cols(perm_results)))
colnames(df) <- variable_names

Here we can display the permutation results from one of the variables that did not have a significant lasso coefficient. The green bars are a histogram of the estimates that came from the permutations. The vertical line is the observed coefficient estimate.

observed_val <- final_results %>% filter(term == "var1") %>% select(estimate)

ggplot(df, aes(x=var1)) +
  geom_histogram(color="black", fill="green", alpha=.4) +
  geom_vline(color="navy",lwd=1,lty=2,xintercept = observed_val$estimate) +
  theme_classic()+
  ggtitle("Lasso Coefficient Estimates for Var1 from \n 100 Permutations")

Here we can display the permutation results from one of the variables that DID have a significant lasso coefficient. We see the vertical line of observed value is far from the permuted results.

observed_val <- final_results %>% filter(term == "var4") %>% select(estimate)

ggplot(df, aes(x=var4)) +
  geom_histogram(color="black", fill="green", alpha=.4) +
  geom_vline(color="navy",lwd=1,lty=2,xintercept = observed_val$estimate) +
  theme_classic()+
  ggtitle("Lasso Coefficient Estimates for Var4 from \n 100 Permutations")

6 Correlation matrix heatmap

We discussed there was interest in plotting correlation matrix as a heatmap, here’s how you would do that with the data.

# following exmaple from http://www.sthda.com/english/wiki/ggplot2-quick-correlation-matrix-heatmap-r-software-and-data-visualization

library(reshape2)
cormat <- round(cor(BinomialExample$x),2)
head(cormat)
##       var1  var2  var3  var4  var5  var6  var7  var8  var9 var10 var11 var12
## var1  1.00 -0.02 -0.13  0.11 -0.04 -0.11  0.11  0.04 -0.04  0.01 -0.09 -0.01
## var2 -0.02  1.00 -0.01 -0.16 -0.22 -0.02 -0.21 -0.14 -0.04  0.10 -0.04  0.21
## var3 -0.13 -0.01  1.00  0.00  0.19 -0.08 -0.12  0.06  0.16  0.09 -0.11  0.04
## var4  0.11 -0.16  0.00  1.00  0.19  0.15 -0.04  0.07  0.15  0.12  0.01 -0.10
## var5 -0.04 -0.22  0.19  0.19  1.00  0.14 -0.03  0.06 -0.01  0.13 -0.01 -0.06
## var6 -0.11 -0.02 -0.08  0.15  0.14  1.00  0.17 -0.08  0.13 -0.10  0.08  0.09
##      var13 var14 var15 var16 var17 var18 var19 var20 var21 var22 var23 var24
## var1  0.02  0.21 -0.06  0.00  0.00  0.08  0.06 -0.17 -0.05  0.03 -0.02  0.10
## var2 -0.01  0.03 -0.01 -0.15  0.00  0.06  0.10 -0.07  0.02  0.05 -0.09 -0.11
## var3  0.17  0.03 -0.12  0.03  0.13 -0.05 -0.13 -0.16  0.08 -0.01 -0.21  0.06
## var4 -0.16  0.05 -0.02  0.13  0.00 -0.08 -0.13  0.19  0.03 -0.18  0.00 -0.17
## var5  0.21  0.17  0.10  0.20  0.03  0.00 -0.09  0.01  0.08 -0.19  0.01 -0.10
## var6 -0.02  0.15  0.01  0.01 -0.03  0.12 -0.06  0.17  0.00  0.07  0.01  0.00
##      var25 var26 var27 var28 var29 var30
## var1  0.08  0.02  0.08  0.10 -0.09 -0.14
## var2  0.17 -0.21 -0.10  0.18 -0.15  0.02
## var3 -0.02 -0.03  0.09 -0.09  0.12 -0.13
## var4 -0.04  0.10  0.10 -0.08 -0.01  0.08
## var5  0.00  0.16  0.08  0.01  0.11  0.13
## var6 -0.02  0.15  0.12 -0.06  0.07  0.11
get_upper_tri <- function(cormat){
    cormat[lower.tri(cormat)]<- NA
    return(cormat)
  }
upper_tri <- get_upper_tri(cormat)
head(upper_tri)
##      var1  var2  var3  var4  var5  var6  var7  var8  var9 var10 var11 var12
## var1    1 -0.02 -0.13  0.11 -0.04 -0.11  0.11  0.04 -0.04  0.01 -0.09 -0.01
## var2   NA  1.00 -0.01 -0.16 -0.22 -0.02 -0.21 -0.14 -0.04  0.10 -0.04  0.21
## var3   NA    NA  1.00  0.00  0.19 -0.08 -0.12  0.06  0.16  0.09 -0.11  0.04
## var4   NA    NA    NA  1.00  0.19  0.15 -0.04  0.07  0.15  0.12  0.01 -0.10
## var5   NA    NA    NA    NA  1.00  0.14 -0.03  0.06 -0.01  0.13 -0.01 -0.06
## var6   NA    NA    NA    NA    NA  1.00  0.17 -0.08  0.13 -0.10  0.08  0.09
##      var13 var14 var15 var16 var17 var18 var19 var20 var21 var22 var23 var24
## var1  0.02  0.21 -0.06  0.00  0.00  0.08  0.06 -0.17 -0.05  0.03 -0.02  0.10
## var2 -0.01  0.03 -0.01 -0.15  0.00  0.06  0.10 -0.07  0.02  0.05 -0.09 -0.11
## var3  0.17  0.03 -0.12  0.03  0.13 -0.05 -0.13 -0.16  0.08 -0.01 -0.21  0.06
## var4 -0.16  0.05 -0.02  0.13  0.00 -0.08 -0.13  0.19  0.03 -0.18  0.00 -0.17
## var5  0.21  0.17  0.10  0.20  0.03  0.00 -0.09  0.01  0.08 -0.19  0.01 -0.10
## var6 -0.02  0.15  0.01  0.01 -0.03  0.12 -0.06  0.17  0.00  0.07  0.01  0.00
##      var25 var26 var27 var28 var29 var30
## var1  0.08  0.02  0.08  0.10 -0.09 -0.14
## var2  0.17 -0.21 -0.10  0.18 -0.15  0.02
## var3 -0.02 -0.03  0.09 -0.09  0.12 -0.13
## var4 -0.04  0.10  0.10 -0.08 -0.01  0.08
## var5  0.00  0.16  0.08  0.01  0.11  0.13
## var6 -0.02  0.15  0.12 -0.06  0.07  0.11
melted_cormat <- melt(upper_tri, na.rm = TRUE)
head(melted_cormat)
##    Var1 Var2 value
## 1  var1 var1  1.00
## 31 var1 var2 -0.02
## 32 var2 var2  1.00
## 61 var1 var3 -0.13
## 62 var2 var3 -0.01
## 63 var3 var3  1.00
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
 geom_tile(color = "white")+
 scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
   name="Pearson\nCorrelation") +
  theme_minimal()+ 
 theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 12, hjust = 1))+
 coord_fixed()