## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set(fig.width=6, fig.height=6, dpi=300,echo = FALSE) knitr::opts_chunk$set(fig.pos = "H", out.extra = "") ## ----table3, echo=FALSE, message=FALSE, warnings=FALSE, eval=FALSE------------ # tabl <- " # Method | true positives | false positives | Sensitivity | Specificity | FDR | AUC | # rank variables # RF impurity | 55 | 6 | 0.99 | 0.43 | 0.1 | 0.99 | # RF perm. | 61 | 3 | 0.99 | 0.48 | 0.05 | 0.74 | # RF corrected | 87 | 37 | 0.99 | 0.68 | 0.3 | 0.99 | # cforest | 61 | 3 | 0.99 | 0.84 | 0.12 | 0.99 | # SobolMDA | 61 | 3 | 0.99 | 0.48 | 0.05 | 0.99 | # Shaff | 1 | 0 | | | | | # iml Shapley | - | - | - | - | - | - | # fastshap | - | - | - | - | - | - | # # select variables based on a RF impurity measure # RFlocalfdr | 59 | 36 | 0.99 | 0.46 | 0.34 | 0.73 | # Boruta | 2 | 2 | 0.99 | 0.016 | 0.5 | 0.51 | # AIR | 127 | 273 | 0.96 | 1 | 0.68 | 0.98 | # AIR+locfdr | 123 | 186 | 0.97 | 0.97 | 0.61 | 0.97 | # PIMP | 39 | 556 | 0.91 | 0.31 | 0.58 | 0.98 | # RFE | 22 | 2 | 0.99 | 0.17 | 0.08 | 0.58 | # " # # kable(tabl, format = "markdown") #disasterous # ## ----echo=FALSE, eval=FALSE, result='asis',fig.cap='...'---------------------- # # cat(tabl) # output the table in a format good for HTML/PDF/docx conversion # ## ----eval=FALSE,echo=TRUE----------------------------------------------------- # library(ranger) # library(RFlocalfdr) # library(caret) # library(pROC) # ## ----eval=FALSE,echo=FALSE---------------------------------------------------- # packageDescription("RFlocalfdr")$GithubSHA1 # #source("/home/dun280/Dropbox/R_libraries/RF_localfdr/RFlocalfdr/R/my.pimp.s") # ## ----eval=FALSE,echo=FALSE---------------------------------------------------- # #just plot a small data set to show the structure # set.seed(13) # num_samples <- 300 # num_bands <- 4 # band_rank <- 6 # num_vars <- num_bands * (2 ** (band_rank+1) -1) # print(num_vars) # # X <- matrix(NA, num_samples , num_vars) # set.seed(123) # # temp<-matrix(0,508,3) # var_index <- 1 # for(band in 1:num_bands) { # for (rank in 0:band_rank) { # for (i in 1:2**rank) { # temp[var_index,]<-c(band , rank, var_index) # var_index <- var_index +1 # } # } # } # # #png("./supp_figures/small_simulated_data_set.png") # plot(temp[,1],ylim=c(0,10),type="p") # lines(temp[,2],type="b",col="red") # legend(0,10,c("band","rank"),pch=1,col=c(1,2)) # #dev.off() # abline(v=c( 1, 2 , 4 , 8 ,16 ,32, 64 )) # # table(temp[temp[,1]==1,2]) # ## # 0 1 2 3 4 5 6 # ## # 1 2 4 8 16 32 64 # # # X <- matrix(NA, num_samples , num_vars) # set.seed(123) # # var_index <- 1 # for(band in 1:num_bands) { # for (rank in 0:band_rank) { # # cat("rank=",rank,"\n") # var <- sample(0:2, num_samples, replace=TRUE) # for (i in 1:2**rank) { # X[,var_index] <- var # var_index <- var_index +1 # } # # print(X[1:2,1:140]) # # system("sleep 3") # } # } # # y <- as.numeric(X[,1] > 1 | X[,2] > 1 | X[,4] > 1 | X[,8] > 1 | X[,16] > 1 | # X[,32] > 1 | X[,64] > 1 ) # # # # data <- cbind(y,X) # colnames(data) <- c("y",make.names(1:num_vars)) # rfModel <- ranger(data=data,dependent.variable.name="y", importance="impurity", # classification=TRUE, mtry=100,num.trees = 10000, replace=TRUE, # seed = 17 ) # zz2 <-log(importance(rfModel)) # # plot(zz2,type="b",col=temp[,1]+1) # # roccurve <- roc(c(rep(1,127),rep(0,508-127)),zz2) # plot(roccurve) # auc(roccurve) # 0.993 # ## ----simulation2, echo=FALSE, fig.cap="A small simulated data set. Each band contains blocks of size {1, 2, 4, 8, 16, 32, 64}, and each block consists of correlated (identical variables).", fig.align="center", out.width = '50%'---- knitr::include_graphics("./supp_figures/small_simulated_data_set.png") ## ----eval=FALSE,echo=TRUE----------------------------------------------------- # # generate the data # set.seed(13) # num_samples <- 300 # num_bands <- 50 # band_rank <- 6 # num_vars <- num_bands * (2 ** (band_rank+1) -1) # print(num_vars) # # X <- matrix(NA, num_samples , num_vars) # set.seed(123) # # var_index <- 1 # for(band in 1:num_bands) { # for (rank in 0:band_rank) { # # cat("rank=",rank,"\n") # var <- sample(0:2, num_samples, replace=TRUE) # for (i in 1:2**rank) { # X[,var_index] <- var # var_index <- var_index +1 # } # # print(X[1:2,1:140]) # # system("sleep 3") # } # } # # y <- as.numeric(X[,1] > 1 | X[,2] > 1 | X[,4] > 1 | X[,8] > 1 | X[,16] > 1 | # X[,32] > 1 | X[,64] > 1 ) # # ## ----eval=FALSE,echo=TRUE----------------------------------------------------- # data <- cbind(y,X) # colnames(data) <- c("y",make.names(1:num_vars)) # # rfModel <- ranger(data=data,dependent.variable.name="y", importance="impurity", # classification=TRUE,num.trees = 10000, replace=FALSE, mtry=100, seed = 17 ) # # # imp <-log(ranger::importance(rfModel)) # t2 <-count_variables(rfModel) # plot(density(imp)) # # roccurve <- roc(c(rep(1,127),rep(0,6350-127)),imp) # plot(roccurve) # auc(roccurve) # 0.993 # # palette("default") # col<-c(1, rep(2,2), rep(3,4), rep(4, 8), rep(5,16 ), rep(6,32 ), rep(7,64) ) # plot(1:1016,imp[1:1016],type="p",col=rep(col,8),pch=16,cex=0.8, # xlab="variable number",ylab="log importances") # plot(imp,type="p",col=rep(col,8),pch=16,cex=0.8,xlab="variable number", # ylab="log importances") # # # predictions <- imp # labels <- c(rep(1,127),rep(0,6350-127)) # pred <- prediction(predictions, labels) # perf <- performance(pred, "tpr", "fpr") # plot(perf) # cost_perf = performance(pred, "cost") # pred@cutoffs[[1]][which.min(cost_perf@y.values[[1]])] # # X22 # #-3.485894 # # predicted_values <-rep(0,6350);predicted_values[ which(imp> -3.485894) ]<-1 # conf_matrix<-table(predicted_values,c(rep(1,127),rep(0,6350-127))) # conf_matrix # #predicted_values 0 1 # # 0 6217 72 # # 1 6 55 # conf_matrix[2,1]/sum(conf_matrix[2,]) #[1]] 0.09836066 FDR # sensitivity(conf_matrix) #] 0.9990358 TP/(TP+FN) # specificity(conf_matrix) # 0.4330709 TN/(FP+TN) # # ## ----simulation2zz2, echo=FALSE, fig.cap="The log importances for the first 8 bands.", fig.align="center", out.width = '50%'---- knitr::include_graphics("./supp_figures/simulation2_zz2.png") ## ----eval=FALSE,echo=TRUE----------------------------------------------------- # cutoffs <- c(0,1,4,10) # #png("./supp_figures/simulated_data_determine_cutoff.png") # res.con<- determine_cutoff(imp,t2,cutoff=cutoffs,plot=c(0,1,4,10)) # #dev.off() # ## ----echo=FALSE, fig.cap="For this data set, the selected cutoff value is 0.", fig.align="center", out.width = '50%'---- knitr::include_graphics("./supp_figures/simulated_data_determine_cutoff.png") ## ----eval=FALSE,echo=TRUE----------------------------------------------------- # plot(cutoffs,res.con[,3]) # cutoffs[which.min(res.con[,3])] # ## ----eval=FALSE,echo=TRUE----------------------------------------------------- # temp<-imp[t2 > 0] # palette("R3") # # qq <- plotQ(temp,debug.flag = 1) # ppp<-run.it.importances(qq,temp,debug=0) # aa<-significant.genes(ppp,temp,cutoff=0.05,do.plot=2) # length(aa$probabilities) # 95 # # tt1 <-as.numeric(gsub("X([0-9]*)","\\1",names(aa$probabilities))) # tt2 <- table(ifelse((tt1 < 127),1,2)) # # 1 2 # # 59 36 # # The false discovery rate is 0.3789474 # tt2[2]/(tt2[1]+tt2[2]) # #59 36 36/(36+59) = 0.3789474 # # predicted_values<-rep(0, 6350);predicted_values[tt1]<-1 # conf_matrix<-table(predicted_values,c(rep(1,127),rep(0,6350-127))) # conf_matrix # conf_matrix[2,1]/sum(conf_matrix[2,]) # 0.3789474 FDR # sensitivity(conf_matrix) #0.994215 TP/(TP+FN) # specificity(conf_matrix) #0.4645669 TN/(FP+TN) # # roccurve <- roc(c(rep(1,127),rep(0,6350-127)),predicted_values) # plot(roccurve) # auc(roccurve) #0.7294 # # accuracy<-(conf_matrix[1,1]+conf_matrix[2,2])/(sum(conf_matrix)) # accuracy # 0.983622 # ## ----eval=FALSE,echo=FALSE---------------------------------------------------- # temp <- temp - min(temp) + .Machine$double.eps # # palette(topo.colors(n = 7)) # col<-c(1, rep(2,2), rep(3,4), rep(4, 8), rep(5,16 ), rep(6,32 ), rep(7,64) ) # # plot(1:127,temp[1:127],type="p",col=rep(col,2),pch=16,cex=0.8, # xlab="variable number",ylab="log importances") # # # plot(1:1016,temp[1:1016],type="p",col=rep(col,8),pch=16,cex=0.8, # xlab="variable number",ylab="log importances") # lines(1:1016,temp[1:1016],col = "gray62",lwd=0.5) # # abline(h=3.699622,col="red") # abline(v=127,col="green") # legend("topright",legend=c("RFlocalfdr cutoff","non-null variables"),lty=1,col=c("red","green" )) # ## ----eval=FALSE,echo=TRUE----------------------------------------------------- # install.packages("Boruta") # install.packages("locfdr") # install.packages("vita") # install.packages("locfdr") # devtools::install_github("silkeszy/Pomona") # ## ----eval=FALSE,echo=FALSE---------------------------------------------------- # if (!require("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # BiocManager::install("twilight") # ## ----eval=FALSE,echo=TRUE----------------------------------------------------- # library(Boruta) # set.seed(120) # system.time(boruta1 <- Boruta(X,as.factor(y), num.threads = 7,getImp=getImpRfGini, # classification=TRUE,num.trees = 10000, replace=FALSE, mtry=100, seed = 17)) # print(boruta1) # #Boruta performed 99 iterations in 19.54727 secs. # #4 attributes confirmed important: X4859, X58, X6132, X7; # # 6346 attributes confirmed unimportant: X1, X10, X100, X1000, X1001 and 6341 more; # plotImpHistory(boruta1) # aa <- which(boruta1$finalDecision=="Confirmed") # bb <- which(boruta1$finalDecision=="Tentative") # predicted_values <-rep(0,6350);predicted_values[c(aa,bb)]<-1 # conf_matrix<-table(predicted_values,c(rep(1,127),rep(0,6350-127))) # conf_matrix # # conf_matrix[2,1]/sum(conf_matrix[2,]) # 0.3789474 FDR # sensitivity(conf_matrix) #0.9996786 TP/(TP+FN) # specificity(conf_matrix) #0.01574803 TN/(FP+TN) # # roccurve <- roc(c(rep(1,127),rep(0,6350-127)),predicted_values) # plot(roccurve) # auc(roccurve) #0.5077 # # accuracy<-(conf_matrix[1,1]+conf_matrix[2,2])/(sum(conf_matrix)) # accuracy #0.98 # ## ----eval=FALSE,echo=TRUE----------------------------------------------------- # # library(Pomona) # colnames(X) <- make.names(1:dim(X)[2]) # set.seed(111) # res <- var.sel.rfe(X, y, prop.rm = 0.2, recalculate = TRUE, tol = 10, # ntree = 500, mtry.prop = 0.2, nodesize.prop = 0.1, no.threads = 7, # method = "ranger", type = "classification", importance = "impurity", # case.weights = NULL) # res$var # #[1] "X1" "X106" "X11" "X12" "X127" "X13" "X15" "X16" "X2" # #[10] "X23" "X24" "X3" "X4" "X44" "X46" "X4885" "X5" "X54" # #[19] "X5474" "X6" "X7" "X72" "X9" "X91" # tt <-as.numeric(gsub("X([0-9]*)","\\1", res$var)) # table(ifelse((tt < 127),1,2)) # # 1 2 # #21 3 0.0833 # # res<-c(1,106, 11, 12, 127, 13, 15, 16, 2, 23, 24, 3, 4, 44, 46, 4885, 5, 54, 5474, 6, # 7, 72, 9, 91) # predicted_values <-rep(0,6350);predicted_values[c(res)]<-1 # conf_matrix<-table(predicted_values,c(rep(1,127),rep(0,6350-127))) # conf_matrix # #predicted_values 0 1 # # 0 6221 105 # # 1 2 22 # # conf_matrix[2,1]/sum(conf_matrix[2,]) # 0.083 FDR # sensitivity(conf_matrix) # 0.9996786 TP/(TP+FN) # specificity(conf_matrix) #0 0.1732283 TN/(FP+TN) # # roccurve <- roc(c(rep(1,127),rep(0,6350-127)),predicted_values) # plot(roccurve) # auc(roccurve) #0.5865 # accuracy<-(conf_matrix[1,1]+conf_matrix[2,2])/(sum(conf_matrix)) # accuracy # 0.9831496 # ## ----eval=FALSE,echo=TRUE----------------------------------------------------- # # rfModel2 <- ranger(data=data,dependent.variable.name="y", importance="impurity_corrected", # classification=TRUE, mtry=100,num.trees = 10000, replace=TRUE, seed = 17 ) # ww<- importance_pvalues( rfModel2, method = "janitza") # # p <- ww[,"pvalue"] # cc <- which(p< 0.05) # predicted_values <-rep(0,6350);predicted_values[cc]<-1 # conf_matrix<-table(predicted_values,c(rep(1,127),rep(0,6350-127))) # conf_matrix # #predicted_values 0 1 # # 0 5950 0 # # 1 273 127 # #FDR is 273/(127+273) = 0.6825 # # sensitivity(conf_matrix) #0.9561305 TP/(TP+FN) # specificity(conf_matrix) #1 TN/(FP+TN) # roccurve <- roc(c(rep(1,127),rep(0,6350-127)),predicted_values) # plot(roccurve) # auc(roccurve) # 0.9781 # accuracy<-(conf_matrix[1,1]+conf_matrix[2,2])/(sum(conf_matrix)) # accuracy # 0.9570079 # # # # ## ----eval=FALSE,echo=TRUE----------------------------------------------------- # plot(density(ww[,"importance"])) # imp <- ww[,"importance"] # #imp <-imp/sqrt(var(imp)) # #plot(density(imp)) # library(locfdr) # # aa<-locfdr(imp,df=13) # which( (aa$fdr< 0.05) & (imp>0)) # cc2 <- which( (aa$fdr< 0.05) & (imp>0)) # length(cc2) # [1] 309 # length(intersect(cc,cc2)) #[1] 309 # # (length(cc2) - length(which(cc2 <= 127)))/length(cc2) #[1] 0.6019417 fdr # predicted_values <-rep(0,6350);predicted_values[cc2]<-1 # conf_matrix<-table(predicted_values,c(rep(1,127),rep(0,6350-127))) # conf_matrix # conf_matrix[2,1]/sum(conf_matrix[2,]) # 0.6019417 FDR # sensitivity(conf_matrix) # 0.9701109 TP/(TP+FN) # specificity(conf_matrix) #0.9685039 TN/(FP+TN) # roccurve <- roc(c(rep(1,127),rep(0,6350-127)),predicted_values) # plot(roccurve) # auc(roccurve) # 0.9693 # accuracy<-(conf_matrix[1,1]+conf_matrix[2,2])/(sum(conf_matrix)) # accuracy # 0.9700787 # ## ----eval=FALSE,echo=TRUE----------------------------------------------------- # #vita: Variable Importance Testing Approaches # library(vita) # y<-factor(y) # X<-data.frame(X) # set.seed(117) # cl.ranger <- ranger(y~. , X,mtry = 100,num.trees = 10000, classification=TRUE, replace=FALSE, # importance = 'impurity') # system.time(pimp.varImp.cl<-my_ranger_PIMP(X,y,cl.ranger,S=10, parallel=TRUE, ncores=10)) # pimp.t.cl <- vita::PimpTest(pimp.varImp.cl,para = FALSE) # aa <- summary(pimp.t.cl,pless = 0.1) # # tt<-as.numeric(gsub("X([0-9]*)","\\1", names(which(aa$cmat2[,"p-value"]< 0.05)))) # table(ifelse((tt < 127),1,2)) # # 1 2 # # 126 176 176 /( 126+ 176 ) = 0.582 # # predicted_values <-rep(0,6350);predicted_values[which(aa$cmat2[,"p-value"]< 0.05)]<-1 # conf_matrix<-table(predicted_values,c(rep(1,127),rep(0,6350-127))) # conf_matrix # conf_matrix[2,1]/sum(conf_matrix[2,]) #[1] 0.5794 FDR # sensitivity(conf_matrix) #0.971 TP/(TP+FN) # specificity(conf_matrix) #1 TN/(FP+TN) # # roccurve <- roc(c(rep(1,127),rep(0,6350-127)),predicted_values) # plot(roccurve) # auc(roccurve) # 0.9859 # accuracy<-(conf_matrix[1,1]+conf_matrix[2,2])/(sum(conf_matrix)) # accuracy # 0.9724409 # ## ----eval=FALSE,echo=FALSE---------------------------------------------------- # # twilight uses a stochastic downhill search algorithm for estimating the local false discovery rate # library(twilight) # p.values <- aa$cmat2[,"p-value"] # ans <- twilight(p.values) #Twilight cannot run properly. # fdr <- ans$result$fdr # sum(fdr < 0.05) #[1] 0 # ## ----eval=FALSE,echo=FALSE---------------------------------------------------- # #how to make a neat html table in Rmarkdown # library(tables) # X <- rnorm(125, sd=100) # Group <- factor(sample(letters[1:5], 125, rep=TRUE)) # tab <- tabular( Group ~ (N=1)+Format(digits=2)*X*((Mean=mean) + Heading("Std Dev")*sd) ) # table_options(knit_print = FALSE) # tab # This chunk uses the default results = 'markup' # # toHTML(tab) # This chunk uses results = 'asis' # table_options(CSS = # "", doCSS = TRUE) # tab # # table_options(doCSS = FALSE) # ## ----echo=TRUE, eval=FALSE---------------------------------------------------- # rfModel2 <- ranger(data=data,dependent.variable.name="y", importance="permutation", # classification=TRUE, mtry=100,num.trees = 10000, replace=FALSE, seed = 17 ) # imp <-ranger::importance(rfModel2) # plot(density(imp)) # # palette(topo.colors(n = 7)) # plot(1:1016,imp[1:1016],type="p",col=rep(col,2),pch=16,cex=0.8, # xlab="variable number",ylab="log importances") # lines(1:1016,imp[1:1016],col = "gray62",lwd=0.5) # abline(v=127,col="green") # # # could apply Briemans and Cutlers argument that the permutation importance is Gaussian -- # # or could use empirical Bayes # # roccurve <- roc(c(rep(1,127),rep(0,6350-127)),imp) # plot(roccurve) # auc(roccurve) # 0.9924 # # # library(ROCR) # predictions <- imp # labels <- c(rep(1,127),rep(0,6350-127)) # pred <- prediction(predictions, labels) # cost_perf = performance(pred, "cost") # pred@cutoffs[[1]][which.min(cost_perf@y.values[[1]])] # # table(imp> 0.0001063063 ,c(rep(1,127),rep(0,6350-127))) # # 0 1 # # FALSE 6220 66 # # TRUE 3 61 # # predicted_values <-rep(0,6350);predicted_values[which(imp> 0.0001063063 )]<-1 # conf_matrix<-table(predicted_values,c(rep(1,127),rep(0,6350-127))) # conf_matrix # #predicted_values 0 1 # # 0 6220 66 # # 1 3 61 # conf_matrix[2,1]/sum(conf_matrix[2,]) #[1] 0.046875 FDR # sensitivity(conf_matrix) # 0.9995179 TP/(TP+FN) # specificity(conf_matrix) # 0.480315 TN/(FP+TN) # # roccurve <- roc(c(rep(1,127),rep(0,6350-127)),predicted_values) # plot(roccurve) # auc(roccurve) # 0.7399 # accuracy<-(conf_matrix[1,1]+conf_matrix[2,2])/(sum(conf_matrix)) # accuracy # 0.9724409 # # # # ## ----echo=TRUE, eval=FALSE---------------------------------------------------- # rfModel3 <- ranger(data=data,dependent.variable.name="y", importance="impurity_corrected", # classification=TRUE, mtry=100,num.trees = 10000, replace=FALSE, seed = 17 ) # imp <-ranger::importance(rfModel3) # plot(density(imp)) # # palette(topo.colors(n = 7)) # plot(1:1016,imp[1:1016],type="p",col=col,pch=16,cex=0.8, # xlab="variable number",ylab="log importances") # lines(1:1016,imp[1:1016],col = "gray62",lwd=0.5) # abline(v=127,col="green") # # roccurve <- roc(c(rep(1,127),rep(0,6350-127)),imp) # plot(roccurve) # auc(roccurve) # 0.9951 # # library(ROCR) # predictions <- imp # labels <- c(rep(1,127),rep(0,6350-127)) # pred <- prediction(predictions, labels) # cost_perf = performance(pred, "cost") # pred@cutoffs[[1]][which.min(cost_perf@y.values[[1]])] # # table(imp> 0.0106588 ,c(rep(1,127),rep(0,6350-127))) # # 0 1 # # FALSE 6186 40 # # TRUE 37 87 # #best FDR is # 37/(37 + 87) #[1] ] 0.2983871 # # # predicted_values <-rep(0,6350);predicted_values[ which(imp> 0.0106588)]<-1 # conf_matrix<-table(predicted_values,c(rep(1,127),rep(0,6350-127))) # conf_matrix # conf_matrix[2,1]/sum(conf_matrix[2,]) #[1]] 0.2983871 FDR # sensitivity(conf_matrix) #0.9940543 TP/(TP+FN) # specificity(conf_matrix) #0.6850394 TN/(FP+TN) # # # ## ----echo=TRUE, eval=FALSE---------------------------------------------------- # library(party) # library(permimp) # # data <- data.frame(y,X) # # system.time(mod1.cf <- party::cforest(y ~ ., data = data, # control = party::cforest_unbiased(ntree = 10, mtry = 100))) # # 12.848 0.000 12.885 # system.time(aa<-party::varimp(mod1.cf, conditional = TRUE)) # # user system elapsed # #3089.301 9.484 3100.554 # system.time(aa3<-permimp(mod1.cf, conditional = TRUE)) # # user system elapsed # # 4.685 0.000 4.707 # # system.time(mod1.cf <- party::cforest(y ~ ., data = data, # control = party::cforest_unbiased(ntree = 100, mtry = 100))) # # 27.233 0.464 27.703 # system.time(aa<-party::varimp(mod1.cf, conditional = TRUE)) # # user system elapsed # #29380.980 81.511 29475.138 # save(aa,file="aa.Rdata") # system.time(aa3<-permimp(mod1.cf, conditional = FALSE)) # # user system elapsed # # 6.084 0.000 6.089 # # # system.time(mod1.cf <- party::cforest(y ~ ., data = data, # ontrol = party::cforest_unbiased(ntree = 1000, mtry = 100))) # #35.438 0.636 36.095 # system.time(aa4<-permimp(mod1.cf, conditional = FALSE)) # # user system elapsed # # 47.135 0.429 47.578 # # palette("default") # col<-c(1, rep(2,2), rep(3,4), rep(4, 8), rep(5,16 ), rep(6,32 ), rep(7,64) ) # plot(1:1016,aa4$values[1:1016],type="p",col=col,pch=16,cex=0.8, # xlab="variable number",ylab="log importances") # # # # system.time(mod1.cf <- party::cforest(y ~ ., data = data, # control = party::cforest_unbiased(ntree = 10000, replace=FALSE, mtry = 100))) # #35.438 0.636 36.095 # system.time(aa4<-permimp(mod1.cf, conditional = FALSE)) # # user system elapsed # # 47.135 0.429 47.578 # # # roccurve <- roc(c(rep(1,127),rep(0,6350-127)),aa4$values) # plot(roccurve) # auc(roccurve) #0.9486 # # system.time(mod1.cf <- party::cforest(y ~ ., data = data, # control = party::cforest_unbiased(ntree = 10000, mtry = 100))) # #Process R killed at Mon Feb 26 07:18:23 2024 on robubuntu # # user system elapsed # # 98.673 2.832 101.977 on petra, seems to have used 26GB of RAM # # system.time(aa4<-permimp(mod1.cf, conditional = TRUE)) # # user system elapsed # #352.681 2.262 356.232 # head(aa4$values) # # X1 X2 X3 X4 X5 X6 # #2.600188e-05 9.454988e-05 1.217855e-04 1.682538e-04 1.572708e-04 1.706311e-04 # # roccurve <- roc(c(rep(1,127),rep(0,6350-127)),aa4$values) # plot(roccurve) # auc(roccurve) # 0.9978 # # # library(ROCR) # predictions <- aa4$values # labels <- c(rep(1,127),rep(0,6350-127)) # pred <- prediction(predictions, labels) # perf <- performance(pred, "tpr", "fpr") # plot(perf) # # cost_perf <- performance(pred, "cost") # pred@cutoffs[[1]][which.min(cost_perf@y.values[[1]])] # # table(predictions> 3.264138e-05 ,c(rep(1,127),rep(0,6350-127))) # # 0 1 # # FALSE 6220 66 # # TRUE 3 61 # predicted_values <-rep(0,6350);predicted_values[ which(predictions> 3.264138e-05 )]<-1 # conf_matrix<-table(predicted_values,c(rep(1,127),rep(0,6350-127))) # conf_matrix # # conf_matrix[2,1]/sum(conf_matrix[2,]) #[1]] 0.1229508 FDR # sensitivity(conf_matrix) # 0.9975896 TP/(TP+FN) # specificity(conf_matrix) # 0.8425197 TN/(FP+TN) # # #https://stats.stackexchange.com/questions/61521/cut-off-point-in-a-roc-curve-is-there-a-simple-function # library(pROC) # rocobj <- roc(labels,predictions) # plot(rocobj) # coords(rocobj, "best") # coords(rocobj, x="best", input="threshold", best.method="youden") # Sa # auc(rocobj) # 0.9978 # # predicted_values <-rep(0,6350);predicted_values[ which(predictions> 8.976665e-06 )]<-1 # conf_matrix<-table(predicted_values,c(rep(1,127),rep(0,6350-127))) # conf_matrix # # #see also # #https://stats.stackexchange.com/questions/132547/roc-curves-for-unbalanced-datasets # #https://cran.r-project.org/web/packages/qeML/vignettes/Unbalanced_Classes.html # #https://juandelacalle.medium.com/how-and-why-i-switched-from-the-roc-curve-to-the-precision-recall-curve-to-analyze-my-imbalanced-6171da91c6b8 # #https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-imbalanced-classification/ # ## ----echo=FALSE, eval=FALSE--------------------------------------------------- # #library(remotes) # #install_gitlab("drti/sobolmda") # library(sobolMDA) # data <- data.frame(y,X) # system.time(rng.sobolmda <- sobolMDA::ranger(dependent.variable.name = "y", classification = TRUE, data = data, replace=FALSE, mtry = 100, # num.trees = 10000, importance = "sobolMDA", seed = 125)) # # palette("default") # col<-c(1, rep(2,2), rep(3,4), rep(4, 8), rep(5,16 ), rep(6,32 ), rep(7,64) ) # plot(1:1016,rng.sobolmda$variable.importance[1:1016],type="p",col=col,pch=16,cex=0.8, # xlab="variable number",ylab="log importances") # plot(rng.sobolmda$variable.importance,type="p",col=col,pch=16,cex=0.8, # xlab="variable number",ylab="log importances") # # imp<- rng.sobolmda$variable.importance # # roccurve <- roc(c(rep(1,127),rep(0,6350-127)),imp) # plot(roccurve) # auc(roccurve) #0.9954 # # library(ROCR) # predictions <- rng.sobolmda$variable.importance # labels <- c(rep(1,127),rep(0,6350-127)) # pred <- prediction(predictions, labels) # perf <- performance(pred, "tpr", "fpr") # plot(perf) # # #perf <- performance(pred, "acc") # #plot(perf, avg= "vertical", spread.estimate="boxplot", lwd=3,col='blue', # # show.spread.at= seq(0.1, 0.9, by=0.1), # # main= "Accuracy across the range of possible cutoffs") # # #plot(0,0,type="n", xlim= c(0,0.001), ylim=c(0,10000), # # xlab="Cutoff", ylab="Density", # # main="How well do the predictions separate the classes?") # # # #lines(density(pred@predictions[[1]][pred@labels[[1]]=="1"]), col= "red") # #lines(density(pred@predictions[[1]][pred@labels[[1]]=="0"]), col="green") # # cost_perf = performance(pred, "cost") # pred@cutoffs[[1]][which.min(cost_perf@y.values[[1]])] # # table(rng.sobolmda$variable.importance> 0.001048671 ,c(rep(1,127),rep(0,6350-127))) # # 0 1 # # FALSE 6220 66 # # TRUE 3 61 # predicted_values <-rep(0,6350);predicted_values[ which(imp> 0.001048671 )]<-1 # conf_matrix<-table(predicted_values,c(rep(1,127),rep(0,6350-127))) # conf_matrix # #predicted_values 0 1 # # 0 6220 66 # # 1 3 61 # # conf_matrix[2,1]/sum(conf_matrix[2,]) #[1]] 0.046875 FDR # sensitivity(conf_matrix) # 0.9995179 TP/(TP+FN) # specificity(conf_matrix) # 0.480315 TN/(FP+TN) # # #best FDR is # 3/(61+3) #[1] 0.046875 # ## ----echo=FALSE, eval=FALSE--------------------------------------------------- # # example from Variable Importance in Random Forests: Traditional Methods and New Developments # #https://towardsdatascience.com/variable-importance-in-random-forests-20c6690e44e0 # # library(kernlab) # library(drf) #An implementation of distributional random forests as introduced in Cevid & Michel & Meinshausen & Buhlmann (2020) # library(Matrix) # library(DescTools) # library(mice) # library(sobolMDA) # source("./simulated2/compute_drf_vimp.R") # source("./simulated2/evaluation.R") # # # ##Simulate Data that experiences both a mean as well as sd shift # n <- 200 # # Simulate from X # x1 <- runif(n,-1,1) # x2 <- runif(n,-1,1) # X0 <- matrix(runif(7*n,-1,1), nrow=n, ncol=7) # x3 <- x1+ runif(n,-1,1) # X <- cbind(x1,x2, x3, X0) # # # Simulate dependent variable Y # Y <- as.matrix(rnorm(n,mean = 0.8*(x1 > 0), sd = 1 + 1*(x2 > 0))) # colnames(X)<-paste0("X", 1:10) # # head(cbind(Y,X)) # # # Xfull <-X # ## Variable importance for conditional Expectation Estimation # XY <- as.data.frame(cbind(Xfull, Y)) # colnames(XY) <- c(paste('X', 1:(ncol(XY)-1), sep=''), 'Y') # num.trees <- 500 # forest <- sobolMDA::ranger(Y ~., data = XY, num.trees = num.trees, importance = 'sobolMDA') #was this a classification model? check # sobolMDA <- forest$variable.importance # names(sobolMDA) <- colnames(X) # # sort(sobolMDA, decreasing = T) # # # forest <- sobolMDA::ranger(Y ~., data = XY, num.trees = num.trees, importance = 'permutation') # MDA <- forest$variable.importance # names(MDA) <- colnames(X) # sort(MDA, decreasing = T) # # MMDVimp <- compute_drf_vimp(X=X,Y=Y) # sort(MMDVimp, decreasing = T) # # # load("~/Downloads/wage_benchmark.Rdata") # ##Define the training data # # n<-1000 # # Xtrain<-X[1:n,] # Ytrain<-Y[1:n,] # Xtrain<-cbind(Xtrain,Ytrain[,"male"]) # colnames(Xtrain)[ncol(Xtrain)]<-"male" # Ytrain<-Ytrain[,1, drop=F] # # # ##Define the test data # ntest<-2000 # Xtest<-X[(n+1):(n+ntest),] # Ytest<-Y[(n+1):(n+ntest),] # Xtest<-cbind(Xtest,Ytest[,"male"]) # colnames(Xtest)[ncol(Xtest)]<-"male" # Ytest<-Ytest[,1, drop=F] # # # Calculate variable importance for both measures # # 1. Sobol-MDA # XY <- as.data.frame(cbind(Xtrain, Ytrain)) # colnames(XY) <- c(paste('X', 1:(ncol(XY)-1), sep=''), 'Y') # num.trees <- 500 # forest <- sobolMDA::ranger(Y ~., data = XY, num.trees = num.trees, importance = 'sobolMDA') # SobolMDA <- forest$variable.importance # names(SobolMDA) <- colnames(Xtrain) # # # 2. MMD-MDA # MMDVimp <- compute_drf_vimp(X=Xtrain,Y=Ytrain,silent=T) # # # # print("Top 10 most important variables for conditional Expectation estimation") # sort(SobolMDA, decreasing = T)[1:10] # print("Top 5 most important variables for conditional Distribution estimation") # sort(MMDVimp, decreasing = T)[1:10] # # # Remove variables one-by-one accoring to the importance values saved in SobolMDA # # and MMDVimp. # evallistSobol<-evalall(SobolMDA, X=Xtrain ,Y=Ytrain ,Xtest, Ytest, metrics=c("MSE"), num.trees ) # evallistMMD<-evalall(MMDVimp, X=Xtrain ,Y=Ytrain ,Xtest, Ytest, metrics=c("MMD"), num.trees ) #slow # # plot(1:79,evallistSobol$evalMSE, type="l", lwd=2, cex=0.8, col="darkgreen", main="MSE and (MMD+0.8) loss" , xlab="Number of Variables removed", ylab="Values") # lines(1:79,evallistMMD$evalMMD+0.8, type="l", lwd=2, cex=0.8, col="blue") # # #Random Forests in 2023: Modern Extensions of a Powerful Method # #https://towardsdatascience.com/random-forests-in-2023-modern-extensions-of-a-powerful-method-b62debaf1d62 # ## ----echo=FALSE, eval=FALSE--------------------------------------------------- # # devtools::install_gitlab("drti/shaff") # library(shaff) # library("nnls") #needs this but does not load it # data <- cbind(y,X) # names(data)[1] <- "Y" # system.time(rng <- shaff(data = data, mtry = 100, num.trees = 10000, K=500)) # which(rng>0) # #[1] 17 # ## ----echo=FALSE, eval=FALSE--------------------------------------------------- # library(iml) # library(ranger) # set.seed(314) # data <- data.frame(y,X) # rfModel2 <- ranger(data=data,dependent.variable.name="y", importance="permutation", # classification=TRUE,mtry=50,num.trees = 1000, replace=TRUE, seed = 17 ) # X <- data.frame(data[,-which(names(data) == "y")]) # mod <- Predictor$new(rfModel2, data = X, type = "response") # shapley <- Shapley$new(mod, x.interest = X[1, ]) # plot(shapley) # # #full data set killed -- using a lot of RAM # # dim(X) [1] 300 508 OK # # # shapley$plot(sort = TRUE) # shapley$y.hat.interest # # phis <- shapley$results$phi # names(phis) <- shapley$results$feature # par(mar=c(5, 10, 4, 2)+0.1) # for wide enough left margin # barplot(sort(phis), horiz=TRUE, las=1) # # ## pick features e.g. by a limit for the absolute value of phis # # values <- shapley$results # # N <- 10 # number of features to show # # Capture the ggplot2 object # p <- plot(shapley, sort = TRUE) # # Modify it so it shows only top N features # print(p + scale_x_discrete(limits=rev(p$data$feature.value[order(-p$data$phi)][1:N]))) # # # dim(X) [1] 300 508 OK # library(iml) # library(ranger) # set.seed(314) # data <- data.frame(y,X) # rfModel2 <- ranger(data=data,dependent.variable.name="y", importance="permutation", # classification=TRUE,mtry=50,num.trees = 1000, replace=TRUE, seed = 17 ) # X <- data.frame(data[,-which(names(data) == "y")]) # mod <- Predictor$new(rfModel2, data = X, type = "response") # shapley <- Shapley$new(mod, x.interest = X[1, ]) # plot(shapley) # ## ----echo=FALSE, eval=FALSE--------------------------------------------------- # #Source: vignettes/fastshap.Rmd # # library(fastshap) # library(ranger) # set.seed(2053) # for reproducibility # # pfun <- function(object, newdata) { # predict(object, data = newdata)$predictions # } # # library(doParallel) # # # With parallelism # #salloc --account=OD-225217 --mem=100GB --nodes=1 --ntasks-per-node=1 --cpus-per-task=20 -J interactive -t 6:00:00 srun --pty /bin/bash -l # # registerDoParallel(cores = 20) # use forking # set.seed(5038) # system.time({ # estimate run time # ex.ames.par <- explain(rfModel2, X = X, pred_wrapper = pfun, nsim = 50, adjust = TRUE, parallel = TRUE) # }) # # # ######## a test data set from the fastshap documentation # # Load the sample data; see ?datasets::mtcars for details # data(mtcars) # # # Fit a projection pursuit regression model # fit <- ppr(mpg ~ ., data = mtcars, nterms = 5) # # Prediction wrapper # pfun <- function(object, newdata) { # needs to return a numeric vector # predict(object, newdata = newdata) # } # # # Compute approximate Shapley values using 10 Monte Carlo simulations # set.seed(101) # for reproducibility # shap1 <- explain(fit, X = subset(mtcars, select = -mpg), nsim = 10, pred_wrapper = pfun) # head(shap1) # apply(shap1,2,f<-function(x){mean(abs(x))}) # # shap <- explain(fit, X = subset(mtcars, select = -mpg), nsim = 10, pred_wrapper = pfun,adjust=TRUE) # # # x <- iris[which(iris[,5] != "setosa"), c(1,5)] # trials <- 10000 # ptime <- system.time({ # r <- foreach(icount(trials), .combine=cbind) %dopar% { # ind <- sample(100, 100, replace=TRUE) # result1 <- glm(x[ind,2]~x[ind,1], family=binomial(logit)) # coefficients(result1) # } # })[3] # ptime # # ############################################################################################ # ######## try the small data set # #salloc --account=OD-225217 --mem=100GB --nodes=1 --ntasks-per-node=1 --cpus-per-task=20 -J interactive -t 6:00:00 srun --pty /bin/bash -l # dim(data) #[1] 300 509 # rfModel2 <- ranger(data=data,dependent.variable.name="y", importance="none", # classification=TRUE,mtry=50,num.trees = 1000, replace=TRUE, seed = 17 ) # # pfun <- function(object, newdata) { # needs to return a numeric vector # predict(object, data = newdata)$predictions # } # system.time(shap <- fastshap::explain(rfModel2, X = subset(data, select = -y), nsim = 10, pred_wrapper = pfun)) # # user system elapsed # #201.161 31.122 153.928 # head(shap) # plot(apply(shap,2,f<-function(x){mean(abs(x))})) # # # # library(doParallel) # # registerDoParallel(cores = 20) # use forking # set.seed(5038) # system.time({ # estimate run time # shap <- fastshap::explain(rfModel2, X = subset(data, select = -y), nsim = 1000, pred_wrapper = pfun, adjust = TRUE, parallel = TRUE) # }) # # user system elapsed # #23285.927 5089.726 3168.814 # # # # predictions <- SHAP.global <- apply(shap,2,f<-function(x){mean(abs(x))}) # # palette(topo.colors(n = 7)) # col<-c(1, rep(2,2), rep(3,4), rep(4, 8), rep(5,16 ), rep(6,32 ), rep(7,64) ) # plot(1:508,predictions,type="p",col=col,pch=16,cex=0.8, # xlab="variable number",ylab="log importances") # lines(1:508,predictions,col = "gray62",lwd=0.5) # abline(v=127,col="green") # # roccurve <- roc(c(rep(1,127),rep(0, 508-127)),predictions) # plot(roccurve) # auc(roccurve) # 0.9806 # # library(ROCR) # labels <- c(rep(1,127),rep(0,508-127)) # pred <- prediction(predictions, labels) # cost_perf = performance(pred, "cost") # pred@cutoffs[[1]][which.min(cost_perf@y.values[[1]])] # #0.0002388418 # table(SHAP.global> 0.0002388418 ,c(rep(1,127),rep(0,508-127))) # # 0 1 # # FALSE 363 11 # # TRUE 18 116 # # predicted_values <-rep(0,508);predicted_values[ which(predictions> 0.0002388418 )]<-1 # conf_matrix<-table(predicted_values,c(rep(1,127),rep(0,508-127))) # conf_matrix # conf_matrix[2,1]/sum(conf_matrix[2,]) #[1]] 0.1343284 FDR # sensitivity(conf_matrix) # 0.9527559 TP/(TP+FN) # specificity(conf_matrix) # 0.9133858 TN/(FP+TN) #