## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", eval = FALSE, #this makes so that chunks are not rerun cache = TRUE, warning = FALSE) ## ----------------------------------------------------------------------------- # # install.packages("devtools") # devtools::install_github("dzhakparov/GeneSelectR", build_vignettes = FALSE) # ## ----------------------------------------------------------------------------- # GeneSelectR::configure_environment() ## ----------------------------------------------------------------------------- # GeneSelectR::set_reticulate_python() # library(GeneSelectR) # # rest of your code ## ----------------------------------------------------------------------------- # data("UrbanRandomSubset") # head(UrbanRandomSubset[,1:10]) ## ----------------------------------------------------------------------------- # GeneSelectR::set_reticulate_python() # library(GeneSelectR) # library(dplyr) # # the rest of the code ## ----------------------------------------------------------------------------- # X <- UrbanRandomSubset %>% dplyr::select(-treatment) # get the feature matrix # y <- UrbanRandomSubset['treatment'] # store the data point label in a separate vector ## ----------------------------------------------------------------------------- # y <- as.factor(y) # y <- as.integer(y) ## ----------------------------------------------------------------------------- # y <- as.vector(y) ## ----------------------------------------------------------------------------- # selection_results <- GeneSelectR::GeneSelectR(X = X, # y = y, # njobs = -1) # all cores will be used # selection_results ## ----------------------------------------------------------------------------- # fs_param_grids <- list( # "Lasso" = list( # "feature_selector__estimator__C" = c(0.01, 0.1, 1L, 10L), # "feature_selector__estimator__solver" = c('liblinear','saga') # ), # "Univariate" = list( # "feature_selector__param" = seq(50L, 200L, by = 50L) # ), # "boruta" = list( # "feature_selector__perc" = seq(80L, 100L, by = 10L), # 'feature_selector__n_estimators' = c(50L, 100L, 250L, 500L) # ), # "RandomForest" = list( # "feature_selector__estimator__n_estimators" = seq(100L, 500L,by = 50L), # "feature_selector__estimator__max_depth" = c(10L, 20L, 30L), # "feature_selector__estimator__min_samples_split" = c(2L, 5L, 10L), # "feature_selector__estimator__min_samples_leaf" = c(1L, 2L, 4L), # "feature_selector__estimator__bootstrap" = c(TRUE, FALSE) # ) # ) ## ----------------------------------------------------------------------------- # selection_results <- GeneSelectR::GeneSelectR(X = X, # y = y, # njobs = -1, # search_type = 'grid') ## ----------------------------------------------------------------------------- # selection_results <- GeneSelectR::GeneSelectR(X = X, # y = y, # njobs = -1, # search_type = 'bayesian') ## ----------------------------------------------------------------------------- # # sklearn is already imported when the library is loaded # # define the feature selection submodule and wanted methods with an estimator # feature_selection <- sklearn$feature_selection # select_from_model <- feature_selection$SelectFromModel # RFE <- feature_selection$RFE # rf <- sklearn$ensemble$RandomForestClassifier # # # feature selection methods of your choice # my_methods <- list('RFE' = RFE(estimator = rf(), n_features_to_select = 100L), # 'SelectFromModel' = select_from_model(estimator = rf())) # ## ----------------------------------------------------------------------------- # # params for the feature selection methods of your choice # my_params <- list('RFE' = list('feature_selector__step' = seq(0.1, 0.001, 1, 10)), # 'SelectFromModel' = list('feature_selector__estimator__n_estimators' = c(50L, 100L, 250L, 500L), # "feature_selector__estimator__max_depth" = c(10L, 20L, 30L), # "feature_selector__estimator__bootstrap" = c(TRUE, FALSE)) # ) ## ----------------------------------------------------------------------------- # selection_results <- GeneSelectR::GeneSelectR(X = X, # y = y, # njobs = -1, # feature_selection_methods=my_methods, # fs_param_grids = my_params) ## ----------------------------------------------------------------------------- # minmax <- sklearn$preprocessing$MinMaxScaler() # varthr <- sklearn$feature_selection$VarianceThreshold() # preprocessing <- list( 'MinMaxScaler' = minmax, # 'VarianceThreshold' = varthr) # # selection_results <- GeneSelectR::GeneSelectR(X = X, # y = y, # njobs = -1, # feature_selection_methods=my_methods, # fs_param_grids = my_params, # preprocessing_steps = preprocessing) ## ----------------------------------------------------------------------------- # # import xgboost # # NOTE: it should be installed in the working conda env # xgb <- reticulate::import('xgboost') # xgb.classifier <- xgb$XGBClassifier() # # selection_results <- GeneSelectR::GeneSelectR(X = X, # y = y, # njobs = -1, # classifier = xgb.classifier) ## ----------------------------------------------------------------------------- # xgb <- reticulate::import('xgboost') # xgb.classifier <- xgb$XGBClassifier() # # xgb_param_grid <- list( # "classifier__learning_rate" = c(0.01, 0.05, 0.1), # "classifier__n_estimators" = c(100L, 200L, 300L), # "classifier__max_depth" = c(3L, 5L, 7L), # "classifier__min_child_weight" = c(1L, 3L, 5L), # "classifier__gamma" = c(0, 0.1, 0.2), # "classifier__subsample" = c(0.8, 1.0), # "classifier__colsample_bytree" = c(0.8, 1.0) # ) # # selection_results <- GeneSelectR::GeneSelectR(X = X, # y = y, # njobs = -1, # classifier = xgb.classifier, # classifier_grid = xgb_param_grid) ## ----------------------------------------------------------------------------- # plot_feature_importance(selection_results, top_n_features = 10) ## ----------------------------------------------------------------------------- # plot_metrics(selection_results) # # # or access it as a dataframe # selection_results$test_metrics # selection_results$cv_mean_score ## ----------------------------------------------------------------------------- # overlap <- calculate_overlap_coefficients(selection_results) # overlap ## ----------------------------------------------------------------------------- # plot_overlap_heatmaps(overlap) ## ----------------------------------------------------------------------------- # custom_list <- list(custom_list = c('char1','char2','char3','char4','char5'), # custom_list2 = c('char1','char2','char3','char4','char5')) # overlap1 <- calculate_overlap_coefficients(selection_results, custom_lists = custom_list) # plot_overlap_heatmaps(overlap1) ## ----------------------------------------------------------------------------- # upset_upet(selection_results) # # # plot upset with custom lists # upset_upet(selection_results, custom_lists = custom_list) # ## ----------------------------------------------------------------------------- # # Get the annotations for the genes in the analysis # # The example dataset contains sequencing from humans # ah <- AnnotationHub::AnnotationHub() # human_ens <- AnnotationHub::query(ah, c("Homo sapiens", "EnsDb")) # human_ens <- human_ens[['AH98047']] # annotations_ahb <- ensembldb::genes(human_ens, return.type = "data.frame") %>% # dplyr::select(gene_id,gene_name,entrezid,gene_biotype) # # annotations_df <- annotate_gene_lists(pipeline_results = selection_results, # annotations_ahb = annotations_ahb, # format = 'ENSEMBL') # annotations_df ## ----------------------------------------------------------------------------- # # Formal class 'AnnotatedGeneLists' [package "GeneSelectR"] with 2 slots # # ..@ inbuilt :List of 4 # # .. ..$ Lasso :Formal class 'GeneList' [package "GeneSelectR"] with 3 slots # # .. .. .. ..@ SYMBOL : chr [1:54] "MCOLN3" "AGL" "NBPF26" "EXO1" ... # # .. .. .. ..@ ENSEMBL : chr [1:54] "ENSG00000055732" "ENSG00000162688" "ENSG00000273136" "ENSG00000174371" ... # # .. .. .. ..@ ENTREZID: chr [1:54] "55283" "178" "101060684" "9156" ... # # .. ..$ Univariate :Formal class 'GeneList' [package "GeneSelectR"] with 3 slots # # .. .. .. ..@ SYMBOL : chr [1:74] "CDA" "BMP8B" "MCOLN3" "AGL" ... # # .. .. .. ..@ ENSEMBL : chr [1:74] "ENSG00000158825" "ENSG00000116985" "ENSG00000055732" "ENSG00000162688" ... # # .. .. .. ..@ ENTREZID: chr [1:74] "978" "656" "55283" "178" ... # # .. ..$ RandomForest:Formal class 'GeneList' [package "GeneSelectR"] with 3 slots # # .. .. .. ..@ SYMBOL : chr [1:73] "CDA" "BMP8B" "MCOLN3" "NBPF26" ... # # .. .. .. ..@ ENSEMBL : chr [1:73] "ENSG00000158825" "ENSG00000116985" "ENSG00000055732" "ENSG00000273136" ... # # .. .. .. ..@ ENTREZID: chr [1:73] "978" "656" "55283" "101060684" ... # # .. ..$ boruta :Formal class 'GeneList' [package "GeneSelectR"] with 3 slots # # .. .. .. ..@ SYMBOL : chr [1:36] "MCOLN3" "AGL" "NBPF26" "BNIPL" ... # # .. .. .. ..@ ENSEMBL : chr [1:36] "ENSG00000055732" "ENSG00000162688" "ENSG00000273136" "ENSG00000163141" ... # # .. .. .. ..@ ENTREZID: chr [1:36] "55283" "178" "101060684" "149428" ... # # ..@ permutation:List of 4 # # .. ..$ Lasso :Formal class 'GeneList' [package "GeneSelectR"] with 3 slots # # .. .. .. ..@ SYMBOL : chr [1:4] "HBB" "" "ABCC12" "LTF" # # .. .. .. ..@ ENSEMBL : chr [1:4] "ENSG00000244734" "ENSG00000259529" "ENSG00000140798" "ENSG00000012223" # # .. .. .. ..@ ENTREZID: chr [1:4] "3043" "NA" "94160" "4057" # # .. ..$ Univariate :Formal class 'GeneList' [package "GeneSelectR"] with 3 slots # # .. .. .. ..@ SYMBOL : chr [1:19] "KAZN" "RGS16" "IGFN1" "CHI3L1" ... # # .. .. .. ..@ ENSEMBL : chr [1:19] "ENSG00000189337" "ENSG00000143333" "ENSG00000163395" "ENSG00000133048" ... # # .. .. .. ..@ ENTREZID: chr [1:19] "23254" "6004" "91156" "1116" ... # # .. ..$ RandomForest:Formal class 'GeneList' [package "GeneSelectR"] with 3 slots # # .. .. .. ..@ SYMBOL : chr [1:16] "ZBED6" "CYP2E1" "HBB" "OTOGL" ... # # .. .. .. ..@ ENSEMBL : chr [1:16] "ENSG00000257315" "ENSG00000130649" "ENSG00000244734" "ENSG00000165899" ... # # .. .. .. ..@ ENTREZID: chr [1:16] "100381270" "1571" "3043" "283310" ... # # .. ..$ boruta :Formal class 'GeneList' [package "GeneSelectR"] with 3 slots # # .. .. .. ..@ SYMBOL : chr [1:8] "CHI3L1" "HBB" "IDH1" "PTPRN" ... # # .. .. .. ..@ ENSEMBL : chr [1:8] "ENSG00000133048" "ENSG00000244734" "ENSG00000138413" "ENSG00000054356" ... # # .. .. .. ..@ ENTREZID: chr [1:8] "1116" "3043" "3417" "5798" ... ## ----------------------------------------------------------------------------- # print(annotated_df@inbuilt$RandomForest@ENTREZID) ## ----------------------------------------------------------------------------- # custom_list <- list(background = c('gene1', 'gene2', 'gene3', 'gene4'), # DEGs = c('gene5','gene6','gene7')) # annotations_df_with_custom <- annotate_gene_lists(pipeline_results = selection_results, # annotations_ahb = annotations_ahb, # format = 'ensembl_gene_name', # custom_lists = custom_list) # annotations_df_with_custom ## ----------------------------------------------------------------------------- # annotated_GO <- GO_enrichment_analysis(annotated_df) ## ----------------------------------------------------------------------------- # annotated_GO <- GO_enrichment_analysis(annotated_df, # list_type = 'permutation', #run GO enrichment on permutation based selected features # keyType = 'ENSEMBL', # run analysis with ENSEMBLIDs # ont = 'BP') # run BP ontology # annotated_GO ## ----------------------------------------------------------------------------- # annot_child_fractions <- compute_GO_child_term_metrics(GO_data = annotated_GO, # GO_terms = c("GO:0002376", "GO:0044419"), # plot = FALSE) ## ----------------------------------------------------------------------------- # str(annot_child_fractions) # # 'data.frame': 12 obs. of 6 variables: # # $ feature_list : chr "Lasso" "Univariate" "RandomForest" "boruta" ... # # $ all_terms_number : int 5413 5421 5981 4643 1308 1752 5413 5421 5981 4643 ... # # $ offspring_nodes_number: int 432 449 495 379 196 170 112 110 128 100 ... # # $ offspring_terms : chr "GO:0001767;GO:0001768;GO:0001771;GO:0001773;GO:0001774;GO:0001776;GO:0001779;GO:0001780;GO:0001782;GO:0001787;G"| __truncated__ "GO:0001767;GO:0001768;GO:0001771;GO:0001773;GO:0001774;GO:0001776;GO:0001780;GO:0001782;GO:0001787;GO:0001909;G"| __truncated__ "GO:0001767;GO:0001768;GO:0001771;GO:0001773;GO:0001774;GO:0001776;GO:0001779;GO:0001782;GO:0001787;GO:0001865;G"| __truncated__ "GO:0001767;GO:0001768;GO:0001771;GO:0001773;GO:0001774;GO:0001776;GO:0001779;GO:0001780;GO:0001782;GO:0001865;G"| __truncated__ ... # # $ fraction : num 7.98 8.28 8.28 8.16 14.98 ... # # $ GO_term : chr "GO:0002376" "GO:0002376" "GO:0002376" "GO:0002376" ... ## ----------------------------------------------------------------------------- # # simplifyEnrichment produces a heatmap # hmap <- run_simplify_enrichment(annotated_GO, # method = 'lovain', # measure = 'Resnik')