Lay out some reproducible code for doing lasso, group lasso, and calculate permuation p-values for coefficient estimates, using tidy functions and code throughout. Also, some pretty visualizations.
# load relevant libraries for this tutorial
library(tidyverse)
library(tidymodels)
library(glmnet)
library(reshape2)
library(gt)
theme_set(theme_bw()) #setting ggplot theme
Load and prepare your data, do some basic summary like table 1 and maybe heatmap of values and/or correlations.
This data, included in the glmnet package, has binary outcome and continuous variables.
# load example data from glmnet package
data("BinomialExample")
# making variable names for this example data since it didn't come with them
test_list <- c()
for (i in 1:30){
test_list[i] <- paste0("var", i)
}
colnames(BinomialExample$x) <- test_list
# reducing number of variables for demonstration purposes
#BinomialExample$x <- BinomialExample$x[,c(1:20)]
head(BinomialExample$x)
## var1 var2 var3 var4 var5 var6
## [1,] -0.6192614 0.01624409 -0.62606831 0.4126846 0.4944374 -0.4493269
## [2,] 1.0942728 0.47257285 -1.33714704 -0.6405813 0.2823199 -0.6093321
## [3,] -0.3567040 0.30121334 0.19056192 0.2340268 0.1698086 1.2291427
## [4,] -2.4690701 2.84771447 1.66024352 1.5688130 -0.8330570 -0.5620088
## [5,] 0.5672885 0.88888747 -0.01158671 0.5762753 -0.8689453 -0.3132571
## [6,] 0.9129254 0.77350086 0.55836355 -0.5350992 0.3507093 -0.5763021
## var7 var8 var9 var10 var11 var12
## [1,] 0.6760053 -0.06771419 -1.4747031 1.0759178 0.5706039 -0.0986473
## [2,] 0.3547232 -0.62686515 -0.1157266 -0.3127071 0.4138668 -1.8716997
## [3,] 1.1628095 0.88024242 0.7616431 -1.1808041 -0.4876853 0.4565005
## [4,] -0.6142455 -1.76529838 0.3902346 1.7940062 0.4644753 0.5168103
## [5,] 0.6902907 -1.29961200 -0.9673640 -0.2732322 -0.5239595 -0.8215918
## [6,] -0.3882672 0.55518663 0.3569562 1.3629640 -0.7819931 0.6053378
## var13 var14 var15 var16 var17 var18
## [1,] 1.2099853 -1.0672363 0.3432912 -0.02255467 -0.1020671 0.1413962
## [2,] 0.1640714 0.7721060 -0.8625086 1.46205409 -0.4369797 -1.0405764
## [3,] 0.6450245 1.7527617 0.4900347 2.27150850 -0.7340858 0.9766313
## [4,] -1.0808010 -0.2980940 -0.5849256 -1.35886288 -0.6594930 -1.1992441
## [5,] 0.5460573 -0.5151076 -0.1362184 -1.11128459 -0.9784008 -1.3778476
## [6,] -0.4721640 1.2556097 -1.1859365 -2.60167966 -0.3492289 0.1621008
## var19 var20 var21 var22 var23 var24
## [1,] -0.5493577 -0.20168028 0.4566007 0.8072802 0.01473546 -1.1160968
## [2,] -0.6027103 -0.32056948 0.2124757 0.6129102 -0.81898245 2.9037142
## [3,] -0.3584639 -0.08005161 1.2452957 1.0118766 0.03927982 1.0172751
## [4,] 0.2016217 0.50707911 0.6727465 -0.4251484 -0.98728378 -0.4012739
## [5,] -1.3516700 0.95926166 0.4919155 -1.3400220 -0.13288456 0.2197023
## [6,] -0.8703520 -1.74848438 -0.2026351 1.7068849 -1.51250049 1.1769298
## var25 var26 var27 var28 var29 var30
## [1,] -0.01587617 1.14549082 0.9417529 1.43715303 -0.9278330 0.9080937
## [2,] -0.15783099 -2.10469356 0.6408442 0.01236235 0.4811105 -2.0492817
## [3,] 0.29705289 0.40180292 1.9246469 0.31108274 0.8983029 -1.2089006
## [4,] -2.06615311 -0.55248745 -1.3790266 0.09518939 -0.3897106 1.7796744
## [5,] 0.78974655 -0.14745205 -0.8018895 0.77559612 1.1184595 -1.1970731
## [6,] 1.66830193 0.06319776 0.2907151 -0.73832848 -0.4088768 -1.0545311
head(BinomialExample$y)
## [1] 0 1 1 0 1 0
Use usual cross-validation procedure to find best lambda for lasso model.
cvfit <- cv.glmnet(BinomialExample$x, BinomialExample$y,
family = "binomial",
type.measure = "class")
plot(cvfit)
coef(cvfit, s="lambda.min")
## 31 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 0.228147565
## var1 .
## var2 0.435137211
## var3 -0.350315533
## var4 -0.884181276
## var5 -0.103249655
## var6 -0.660381219
## var7 .
## var8 -0.364938882
## var9 0.480723929
## var10 -1.028074339
## var11 -0.011111029
## var12 .
## var13 .
## var14 .
## var15 .
## var16 0.117021589
## var17 .
## var18 .
## var19 .
## var20 .
## var21 .
## var22 0.153471053
## var23 0.214900782
## var24 .
## var25 0.440629631
## var26 -0.251871772
## var27 -0.003211245
## var28 0.112238670
## var29 -0.128779957
## var30 .
If you want, use some tidyverse functions to extract and present the results.
# get some tidy results from cv.glmnet
tidied_cv <- tidy(cvfit)
glance_cv <- glance(cvfit)
tidied <- tidy(cvfit$glmnet.fit)
# and plot them (but not the intercept)
tidied %>%
filter(term != "(Intercept)") %>%
ggplot( aes(lambda, estimate, group = term, color = term)) +
scale_x_log10() +
geom_line() +
geom_vline(xintercept = glance_cv$lambda.min) # vertical line for min lambda
Since cv.glmnet
does not provide p-values for estimates, you can do a permutation test to get empiric p-values.
In this procedure, we will scramble the y vector a set number of times (say, 1000), do the cv.glmnet()
function with the scrambled outcome, and extract the estimates from the model.
Then, we will compare those estimates and calculate a p-value for each variable by summing the number of permuted estimates which are as or more extreme as the observed estimate, and dived by number of permutations.
Then we will adjust for multiple comparisons and present a final table of results.
# this function will take your input of x and y and return tidy results from the minimum lambda cv result
get_cv_result <- function(x, y){
cvfit <- cv.glmnet(x, y,
family = "binomial",
type.measure = "class")
tidied_cv <- tidy(cvfit)
glance_cv <- glance(cvfit)
tidied <- tidy(cvfit$glmnet.fit)
allvars <- data.frame(term = colnames(x))
tidied_min <- tidied %>%
filter(lambda == cvfit$lambda.min) %>%
right_join(allvars, by = "term") %>% #join to make sure you don't drop vars that went to zero in lasso estimation
select(term, estimate) %>%
mutate(term = factor(term, levels= str_sort(term, numeric=TRUE))) %>%
arrange(term) %>%
replace(is.na(.), 0)
return(tidied_min)
}
# get tidy result from observed data
observed_result <- get_cv_result(BinomialExample$x, BinomialExample$y)
# specify variable names and number of permutations
variable_names <- colnames(BinomialExample$x)
num_permutations = 100
# set seed for reproducibility
set.seed(1219)
#set up loop for permutation results
perm_results<-vector('list',num_permutations)
perm_results_bigger <- vector('list',num_permutations)
for(i in 1:num_permutations){
perm_y <- sample(BinomialExample$y)
res <- get_cv_result(BinomialExample$x, perm_y)
test <- left_join(observed_result, res,
by = "term",
suffix = c(".obs", ".perm")) %>%
# calculating if permuation estimate is greater than or equal to observed estimate
mutate(bigger = as.numeric(abs(estimate.perm) >= abs(estimate.obs)))
perm_results[[i]]<-res$estimate
perm_results_bigger[[i]] <- test$bigger
}
#make nice dataframe from results to present them
final_results <- bind_cols(perm_results_bigger) %>%
mutate(sum = rowSums(across(where(is.numeric))),
# calculate p-value which is sum of times that permuted val is >= observed value, divided by number of permutations
perm_pval = sum/num_permutations,
term = observed_result$term,
estimate = round(observed_result$estimate, 3)) %>%
select(term, estimate, perm_pval) %>%
# and if you want to adjust for multiple testing you can do FDR correction like this, just specify the method. Here I'm using Benjamini Hochberg
mutate(
qval = round(p.adjust(perm_pval, method = "BH"),2)
)
# use gt to display results in a table
gt::gt(final_results)
term | estimate | perm_pval | qval |
---|---|---|---|
var1 | 0.000 | 1.00 | 1.00 |
var2 | 0.496 | 0.01 | 0.04 |
var3 | -0.411 | 0.02 | 0.07 |
var4 | -0.953 | 0.00 | 0.00 |
var5 | -0.123 | 0.14 | 0.30 |
var6 | -0.739 | 0.00 | 0.00 |
var7 | 0.000 | 1.00 | 1.00 |
var8 | -0.412 | 0.01 | 0.04 |
var9 | 0.566 | 0.00 | 0.00 |
var10 | -1.137 | 0.00 | 0.00 |
var11 | -0.017 | 0.18 | 0.36 |
var12 | -0.006 | 0.24 | 0.40 |
var13 | 0.000 | 1.00 | 1.00 |
var14 | 0.000 | 1.00 | 1.00 |
var15 | 0.000 | 1.00 | 1.00 |
var16 | 0.185 | 0.08 | 0.22 |
var17 | 0.000 | 1.00 | 1.00 |
var18 | 0.000 | 1.00 | 1.00 |
var19 | 0.000 | 1.00 | 1.00 |
var20 | 0.000 | 1.00 | 1.00 |
var21 | 0.000 | 1.00 | 1.00 |
var22 | 0.168 | 0.13 | 0.30 |
var23 | 0.255 | 0.06 | 0.18 |
var24 | 0.000 | 1.00 | 1.00 |
var25 | 0.507 | 0.01 | 0.04 |
var26 | -0.276 | 0.04 | 0.13 |
var27 | -0.038 | 0.21 | 0.37 |
var28 | 0.150 | 0.19 | 0.36 |
var29 | -0.158 | 0.13 | 0.30 |
var30 | 0.000 | 1.00 | 1.00 |
df <- data.frame(t(bind_cols(perm_results)))
colnames(df) <- variable_names
Here we can display the permutation results from one of the variables that did not have a significant lasso coefficient. The green bars are a histogram of the estimates that came from the permutations. The vertical line is the observed coefficient estimate.
observed_val <- final_results %>% filter(term == "var1") %>% select(estimate)
ggplot(df, aes(x=var1)) +
geom_histogram(color="black", fill="green", alpha=.4) +
geom_vline(color="navy",lwd=1,lty=2,xintercept = observed_val$estimate) +
theme_classic()+
ggtitle("Lasso Coefficient Estimates for Var1 from \n 100 Permutations")
Here we can display the permutation results from one of the variables that DID have a significant lasso coefficient. We see the vertical line of observed value is far from the permuted results.
observed_val <- final_results %>% filter(term == "var4") %>% select(estimate)
ggplot(df, aes(x=var4)) +
geom_histogram(color="black", fill="green", alpha=.4) +
geom_vline(color="navy",lwd=1,lty=2,xintercept = observed_val$estimate) +
theme_classic()+
ggtitle("Lasso Coefficient Estimates for Var4 from \n 100 Permutations")
We discussed there was interest in plotting correlation matrix as a heatmap, here’s how you would do that with the data.
# following exmaple from http://www.sthda.com/english/wiki/ggplot2-quick-correlation-matrix-heatmap-r-software-and-data-visualization
library(reshape2)
cormat <- round(cor(BinomialExample$x),2)
head(cormat)
## var1 var2 var3 var4 var5 var6 var7 var8 var9 var10 var11 var12
## var1 1.00 -0.02 -0.13 0.11 -0.04 -0.11 0.11 0.04 -0.04 0.01 -0.09 -0.01
## var2 -0.02 1.00 -0.01 -0.16 -0.22 -0.02 -0.21 -0.14 -0.04 0.10 -0.04 0.21
## var3 -0.13 -0.01 1.00 0.00 0.19 -0.08 -0.12 0.06 0.16 0.09 -0.11 0.04
## var4 0.11 -0.16 0.00 1.00 0.19 0.15 -0.04 0.07 0.15 0.12 0.01 -0.10
## var5 -0.04 -0.22 0.19 0.19 1.00 0.14 -0.03 0.06 -0.01 0.13 -0.01 -0.06
## var6 -0.11 -0.02 -0.08 0.15 0.14 1.00 0.17 -0.08 0.13 -0.10 0.08 0.09
## var13 var14 var15 var16 var17 var18 var19 var20 var21 var22 var23 var24
## var1 0.02 0.21 -0.06 0.00 0.00 0.08 0.06 -0.17 -0.05 0.03 -0.02 0.10
## var2 -0.01 0.03 -0.01 -0.15 0.00 0.06 0.10 -0.07 0.02 0.05 -0.09 -0.11
## var3 0.17 0.03 -0.12 0.03 0.13 -0.05 -0.13 -0.16 0.08 -0.01 -0.21 0.06
## var4 -0.16 0.05 -0.02 0.13 0.00 -0.08 -0.13 0.19 0.03 -0.18 0.00 -0.17
## var5 0.21 0.17 0.10 0.20 0.03 0.00 -0.09 0.01 0.08 -0.19 0.01 -0.10
## var6 -0.02 0.15 0.01 0.01 -0.03 0.12 -0.06 0.17 0.00 0.07 0.01 0.00
## var25 var26 var27 var28 var29 var30
## var1 0.08 0.02 0.08 0.10 -0.09 -0.14
## var2 0.17 -0.21 -0.10 0.18 -0.15 0.02
## var3 -0.02 -0.03 0.09 -0.09 0.12 -0.13
## var4 -0.04 0.10 0.10 -0.08 -0.01 0.08
## var5 0.00 0.16 0.08 0.01 0.11 0.13
## var6 -0.02 0.15 0.12 -0.06 0.07 0.11
get_upper_tri <- function(cormat){
cormat[lower.tri(cormat)]<- NA
return(cormat)
}
upper_tri <- get_upper_tri(cormat)
head(upper_tri)
## var1 var2 var3 var4 var5 var6 var7 var8 var9 var10 var11 var12
## var1 1 -0.02 -0.13 0.11 -0.04 -0.11 0.11 0.04 -0.04 0.01 -0.09 -0.01
## var2 NA 1.00 -0.01 -0.16 -0.22 -0.02 -0.21 -0.14 -0.04 0.10 -0.04 0.21
## var3 NA NA 1.00 0.00 0.19 -0.08 -0.12 0.06 0.16 0.09 -0.11 0.04
## var4 NA NA NA 1.00 0.19 0.15 -0.04 0.07 0.15 0.12 0.01 -0.10
## var5 NA NA NA NA 1.00 0.14 -0.03 0.06 -0.01 0.13 -0.01 -0.06
## var6 NA NA NA NA NA 1.00 0.17 -0.08 0.13 -0.10 0.08 0.09
## var13 var14 var15 var16 var17 var18 var19 var20 var21 var22 var23 var24
## var1 0.02 0.21 -0.06 0.00 0.00 0.08 0.06 -0.17 -0.05 0.03 -0.02 0.10
## var2 -0.01 0.03 -0.01 -0.15 0.00 0.06 0.10 -0.07 0.02 0.05 -0.09 -0.11
## var3 0.17 0.03 -0.12 0.03 0.13 -0.05 -0.13 -0.16 0.08 -0.01 -0.21 0.06
## var4 -0.16 0.05 -0.02 0.13 0.00 -0.08 -0.13 0.19 0.03 -0.18 0.00 -0.17
## var5 0.21 0.17 0.10 0.20 0.03 0.00 -0.09 0.01 0.08 -0.19 0.01 -0.10
## var6 -0.02 0.15 0.01 0.01 -0.03 0.12 -0.06 0.17 0.00 0.07 0.01 0.00
## var25 var26 var27 var28 var29 var30
## var1 0.08 0.02 0.08 0.10 -0.09 -0.14
## var2 0.17 -0.21 -0.10 0.18 -0.15 0.02
## var3 -0.02 -0.03 0.09 -0.09 0.12 -0.13
## var4 -0.04 0.10 0.10 -0.08 -0.01 0.08
## var5 0.00 0.16 0.08 0.01 0.11 0.13
## var6 -0.02 0.15 0.12 -0.06 0.07 0.11
melted_cormat <- melt(upper_tri, na.rm = TRUE)
head(melted_cormat)
## Var1 Var2 value
## 1 var1 var1 1.00
## 31 var1 var2 -0.02
## 32 var2 var2 1.00
## 61 var1 var3 -0.13
## 62 var2 var3 -0.01
## 63 var3 var3 1.00
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
coord_fixed()
I found the following helpful: https://bookdown.org/curleyjp0/psy317l_guides5/permutation-testing.html