## ----setup, include=FALSE, echo=F, warning= F, message=F---------------------- knitr::opts_chunk$set( message = FALSE, warning = FALSE, error = FALSE, tidy = FALSE, fig.align = "center", dpi = 150, cache = FALSE, progress = FALSE, quite = TRUE ) ## ---- echo=FALSE, results="hide", warning=FALSE,message=FALSE----------------- library(TCGAWorkflow) ## ---- eval=FALSE, include=TRUE------------------------------------------------ # if (!"BiocManager" %in% rownames(installed.packages())) # install.packages("BiocManager") # BiocManager::install("TCGAWorkflow") # BiocManager::install("TCGAWorkflowData") ## ---- eval=TRUE, message=FALSE, warning=FALSE, include=TRUE------------------- library(TCGAWorkflowData) library(DT) ## ---- eval=TRUE, message=FALSE, warning=FALSE, include=FALSE------------------ library(TCGAbiolinks) ## ---- eval=FALSE, include=TRUE------------------------------------------------ # library(TCGAbiolinks) # query_met_gbm <- GDCquery( # project = "TCGA-GBM", # data.category = "DNA Methylation", # data.type = "Methylation Beta Value", # platform = "Illumina Human Methylation 450", # barcode = c("TCGA-76-4926-01B-01D-1481-05", "TCGA-28-5211-01C-11D-1844-05") # ) # GDCdownload(query_met_gbm) # # met_gbm_450k <- GDCprepare( # query = query_met_gbm, # summarizedExperiment = TRUE # ) # # query_met_lgg <- GDCquery( # project = "TCGA-LGG", # data.category = "DNA Methylation", # data.type = "Methylation Beta Value", # platform = "Illumina Human Methylation 450", # barcode = c("TCGA-HT-7879-01A-11D-2399-05", "TCGA-HT-8113-01A-11D-2399-05") # ) # GDCdownload(query_met_lgg) # met_lgg_450k <- GDCprepare( # query = query_met_lgg, # summarizedExperiment = TRUE # ) # # met_lgg_450k$days_to_death <- NA # met_lgg_450k$year_of_death <- NA # met_gbm_lgg <- SummarizedExperiment::cbind( # met_lgg_450k, # met_gbm_450k # ) # # # # A total of 2.27 GB # query_exp_lgg <- GDCquery( # project = "TCGA-LGG", # data.category = "Transcriptome Profiling", # data.type = "Gene Expression Quantification", # workflow.type = "STAR - Counts" # ) # # GDCdownload(query_exp_lgg) # exp_lgg <- GDCprepare( # query = query_exp_lgg # ) # # query_exp_gbm <- GDCquery( # project = "TCGA-GBM", # data.category = "Transcriptome Profiling", # data.type = "Gene Expression Quantification", # workflow.type = "STAR - Counts" # ) # GDCdownload(query_exp_gbm) # exp_gbm <- GDCprepare( # query = query_exp_gbm # ) # # # The following clinical data is not available in GBM # missing_cols <- setdiff(colnames(colData(exp_lgg)),colnames(colData(exp_gbm))) # for(i in missing_cols){ # exp_lgg[[i]] <- NULL # } # # exp_gbm_lgg <- SummarizedExperiment::cbind( # exp_lgg, # exp_gbm # ) ## ---- eval=FALSE, include=TRUE------------------------------------------------ # #----------------------------------------------------------------------------- # # Data.category: Copy number variation aligned to hg38 # #----------------------------------------------------------------------------- # query <- GDCquery( # project = "TCGA-ACC", # data.category = "Copy Number Variation", # data.type = "Copy Number Segment", # barcode = c( "TCGA-OR-A5KU-01A-11D-A29H-01", "TCGA-OR-A5JK-01A-11D-A29H-01") # ) # GDCdownload(query) # data <- GDCprepare(query) # # query <- GDCquery( # project = "TCGA-ACC", # data.category = "Copy Number Variation", # data.type = "Masked Copy Number Segment", # sample.type = c("Primary Tumor") # ) # see the barcodes with getResults(query)$cases # GDCdownload(query) # data <- GDCprepare(query) ## ---- eval=TRUE, include=TRUE------------------------------------------------- library(SummarizedExperiment) # Load object from TCGAWorkflowData package # This object will be created in subsequent sections for enhanced clarity and understanding. data(TCGA_GBM_Transcriptome_20_samples) # get gene expression matrix data <- assay(exp_gbm) datatable( data = data[1:10,], options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), rownames = TRUE ) # get genes information genes.info <- rowRanges(exp_gbm) genes.info # get sample information sample.info <- colData(exp_gbm) datatable( data = as.data.frame(sample.info), options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), rownames = FALSE ) ## ---- eval=TRUE, include=TRUE------------------------------------------------- # get indexed clinical patient data for GBM samples gbm_clin <- GDCquery_clinic( project = "TCGA-GBM", type = "Clinical" ) # get indexed clinical patient data for LGG samples lgg_clin <- GDCquery_clinic( project = "TCGA-LGG", type = "Clinical" ) # Bind the results, as the columns might not be the same, # we will will plyr rbind.fill, to have all columns from both files clinical <- plyr::rbind.fill( gbm_clin, lgg_clin ) ## ----echo = TRUE, message = FALSE, warning = FALSE---------------------------- datatable( clinical[1:10,], options = list(scrollX = TRUE, keys = TRUE), rownames = FALSE ) ## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE---------------- # Fetch clinical data directly from the clinical XML files. # if barcode is not set, it will consider all samples. # We only set it to make the example faster query <- GDCquery( project = "TCGA-GBM", data.format = "bcr xml", data.category = "Clinical", barcode = c("TCGA-08-0516","TCGA-02-0317") ) GDCdownload(query) clinical <- GDCprepare_clinic( query = query, clinical.info = "patient" ) ## ----echo = TRUE, message = FALSE, warning = FALSE---------------------------- datatable( data = clinical, options = list(scrollX = TRUE, keys = TRUE), rownames = FALSE ) ## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE---------------- clinical_drug <- GDCprepare_clinic( query = query, clinical.info = "drug" ) ## ----echo = TRUE, message = FALSE, warning = FALSE---------------------------- clinical_drug |> datatable( options = list(scrollX = TRUE, keys = TRUE), rownames = FALSE ) ## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE---------------- clinical_radiation <- GDCprepare_clinic( query = query, clinical.info = "radiation" ) ## ----echo = TRUE, message = FALSE, warning = FALSE---------------------------- clinical_radiation |> datatable( options = list(scrollX = TRUE, keys = TRUE), rownames = FALSE ) ## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE---------------- clinical_admin <- GDCprepare_clinic( query = query, clinical.info = "admin" ) ## ----echo = TRUE, message = FALSE, warning = FALSE---------------------------- clinical_admin |> datatable( options = list(scrollX = TRUE, keys = TRUE), rownames = FALSE ) ## ---- eval=FALSE, include=TRUE------------------------------------------------ # query <- GDCquery( # project = c("TCGA-LGG","TCGA-GBM"), # data.category = "Simple Nucleotide Variation", # access = "open", # data.type = "Masked Somatic Mutation", # workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking" # ) # GDCdownload(query) # maf <- GDCprepare(query) ## ----echo = TRUE, message = FALSE, warning = FALSE---------------------------- data(maf_lgg_gbm) maf[1:10,] |> datatable( options = list(scrollX = TRUE, keys = TRUE), rownames = FALSE ) ## ---- eval=TRUE, include=TRUE------------------------------------------------- gbm_subtypes <- TCGAquery_subtype( tumor = "gbm" ) ## ----echo = TRUE, message = FALSE, warning = FALSE---------------------------- datatable( gbm_subtypes[1:10,], options = list(scrollX = TRUE, keys = TRUE), rownames = FALSE ) ## ---- eval=FALSE, include=TRUE------------------------------------------------ # library(RTCGAToolbox) ## ---- eval=FALSE, include=TRUE------------------------------------------------ # # Get the last run dates # lastRunDate <- getFirehoseRunningDates()[1] # # # get DNA methylation data, RNAseq2 and clinical data for GBM # gbm_data <- getFirehoseData( # dataset = "GBM", # runDate = lastRunDate, # gistic2Date = getFirehoseAnalyzeDates(1), # Methylation = FALSE, # clinical = TRUE, # RNASeq2GeneNorm = FALSE, # Mutation = TRUE, # fileSizeLimit = 10000 # ) # # gbm_mut <- getData(gbm_data,"Mutation") # gbm_clin <- getData(gbm_data,"clinical") ## ---- eval=FALSE, message=FALSE, warning=FALSE, include=TRUE------------------ # # Download GISTIC results # lastanalyzedate <- getFirehoseAnalyzeDates(1) # gistic <- getFirehoseData( # dataset = "GBM", # GISTIC = TRUE, # gistic2Date = lastanalyzedate # ) # # # get GISTIC results # gistic_allbygene <- getData( # object = gistic, # type = "GISTIC", # platform = "AllByGene" # ) # gistic_thresholedbygene <- getData( # object = gistic, # type = "GISTIC", # platform = "ThresholdedByGene" # ) ## ---- eval=TRUE, message=FALSE, warning=FALSE, include=TRUE------------------- data(gbm_gistic) gistic_allbygene %>% head() %>% gt::gt() gistic_thresholedbygene %>% head() %>% gt::gt() ## ----results='hide', echo=TRUE, message=FALSE, warning=FALSE------------------ library(maftools) # recovering data from TCGAWorkflowData package. data(maf_lgg_gbm) # To prepare for maftools we will also include clinical data # For a mutant vs WT survival analysis # get indexed clinical patient data for GBM samples gbm_clin <- GDCquery_clinic(project = "TCGA-GBM", type = "Clinical") # get indexed clinical patient data for LGG samples lgg_clin <- GDCquery_clinic(project = "TCGA-LGG", type = "Clinical") # Bind the results, as the columns might not be the same, # we will will plyr rbind.fill, to have all columns from both files clinical <- plyr::rbind.fill(gbm_clin,lgg_clin) colnames(clinical)[grep("submitter_id",colnames(clinical))] <- "Tumor_Sample_Barcode" # we need to create a binary variable 1 is dead 0 is not dead plyr::count(clinical$vital_status) clinical$Overall_Survival_Status <- 1 # dead clinical$Overall_Survival_Status[which(clinical$vital_status != "Dead")] <- 0 # If patient is not dead we don't have days_to_death (NA) # we will set it as the last day we know the patient is still alive clinical$time <- clinical$days_to_death clinical$time[is.na(clinical$days_to_death)] <- clinical$days_to_last_follow_up[is.na(clinical$days_to_death)] # Create object to use in maftools maf <- read.maf( maf = maf, clinicalData = clinical, isTCGA = TRUE ) ## ----echo=TRUE, message=FALSE, warning=FALSE,fig.width=10--------------------- plotmafSummary( maf = maf, rmOutlier = TRUE, addStat = 'median', dashboard = TRUE ) ## ----echo=TRUE, message=FALSE, warning=FALSE,fig.height=10,fig.width=15,eval=FALSE---- # oncoplot( # maf = maf, # top = 20, # legendFontSize = 8, # clinicalFeatures = c("tissue_or_organ_of_origin") # ) ## ----echo=TRUE, message=FALSE, warning=FALSE---------------------------------- plot <- mafSurvival( maf = maf, genes = "TP53", time = 'time', Status = 'Overall_Survival_Status', isTCGA = TRUE ) ## ---- eval=FALSE, include=TRUE------------------------------------------------ # query_exp_lgg <- GDCquery( # project = "TCGA-LGG", # data.category = "Transcriptome Profiling", # data.type = "Gene Expression Quantification", # workflow.type = "STAR - Counts" # ) # # Get only first 20 samples to make example faster # query_exp_lgg$results[[1]] <- query_exp_lgg$results[[1]][1:20,] # GDCdownload(query_exp_lgg) # exp_lgg <- GDCprepare( # query = query_exp_lgg # ) # # query_exp_gbm <- GDCquery( # project = "TCGA-GBM", # data.category = "Transcriptome Profiling", # data.type = "Gene Expression Quantification", # workflow.type = "STAR - Counts" # ) # # Get only first 20 samples to make example faster # query_exp_gbm$results[[1]] <- query_exp_gbm$results[[1]][1:20,] # GDCdownload(query_exp_gbm) # exp_gbm <- GDCprepare( # query = query_exp_gbm # ) ## ----results='asis', echo=TRUE, message=FALSE,warning=FALSE------------------- data("TCGA_LGG_Transcriptome_20_samples") data("TCGA_GBM_Transcriptome_20_samples") exp_lgg_preprocessed <- TCGAanalyze_Preprocessing( object = exp_lgg, cor.cut = 0.6, datatype = "unstranded", filename = "LGG_IlluminaHiSeq_RNASeqV2.png" ) exp_gbm_preprocessed <- TCGAanalyze_Preprocessing( object = exp_gbm, cor.cut = 0.6, datatype = "unstranded", filename = "GBM_IlluminaHiSeq_RNASeqV2.png" ) exp_preprocessed <- cbind( exp_lgg_preprocessed, exp_gbm_preprocessed ) exp_normalized <- TCGAanalyze_Normalization( tabDF = cbind(exp_lgg_preprocessed, exp_gbm_preprocessed), geneInfo = TCGAbiolinks::geneInfoHT, method = "gcContent" ) # 60513 40 exp_filtered <- TCGAanalyze_Filtering( tabDF = exp_normalized, method = "quantile", qnt.cut = 0.25 ) # 44630 40 exp_filtered_lgg <- exp_filtered[ ,substr(colnames(exp_filtered),1,12) %in% lgg_clin$bcr_patient_barcode ] exp_filtered_gbm <- exp_filtered[ ,substr(colnames(exp_filtered),1,12) %in% gbm_clin$bcr_patient_barcode ] diff_expressed_genes <- TCGAanalyze_DEA( mat1 = exp_filtered_lgg, mat2 = exp_filtered_gbm, Cond1type = "LGG", Cond2type = "GBM", fdr.cut = 0.01 , logFC.cut = 1, method = "glmLRT" ) ## ----results='asis', echo=TRUE, message=FALSE,warning=FALSE------------------- # Number of differentially expressed genes (DEG) nrow(diff_expressed_genes) ## ----results='hide', echo=TRUE, message=FALSE,warning=FALSE,fig.height=10, fig.width=10---- #------------------- 4.2 EA: enrichment analysis -------------------- ansEA <- TCGAanalyze_EAcomplete( TFname = "DEA genes LGG Vs GBM", RegulonList = diff_expressed_genes$gene_name ) TCGAvisualize_EAbarplot( tf = rownames(ansEA$ResBP), filename = NULL, GOBPTab = ansEA$ResBP, nRGTab = diff_expressed_genes$gene_name, nBar = 20 ) ## ----results='hide', echo=TRUE, message=FALSE,warning=FALSE,fig.height=10, fig.width=10---- TCGAvisualize_EAbarplot( tf = rownames(ansEA$ResBP), filename = NULL, GOCCTab = ansEA$ResCC, nRGTab = diff_expressed_genes$gene_name, nBar = 20 ) ## ----results='hide', echo=TRUE, message=FALSE,warning=FALSE,fig.height=10, fig.width=10---- TCGAvisualize_EAbarplot( tf = rownames(ansEA$ResBP), filename = NULL, GOMFTab = ansEA$ResMF, nRGTab = diff_expressed_genes$gene_name, nBar = 20 ) ## ----results='hide', echo=TRUE, message=FALSE,warning=FALSE,fig.height=12, fig.width=15---- TCGAvisualize_EAbarplot( tf = rownames(ansEA$ResBP), filename = NULL, PathTab = ansEA$ResPat, nRGTab = rownames(diff_expressed_genes), nBar = 20 ) ## ----results='asis', echo=TRUE, message=FALSE,warning=FALSE------------------- library(SummarizedExperiment) # DEGs TopTable dataDEGsFiltLevel <- TCGAanalyze_LevelTab( FC_FDR_table_mRNA = diff_expressed_genes, typeCond1 = "LGG", typeCond2 = "GBM", TableCond1 = exp_filtered[,colnames(exp_filtered_lgg)], TableCond2 = exp_filtered[,colnames(exp_filtered_gbm)] ) dataDEGsFiltLevel$GeneID <- 0 library(clusterProfiler) # Converting Gene symbol to geneID eg = as.data.frame( bitr( dataDEGsFiltLevel$mRNA, fromType = "ENSEMBL", toType = c("ENTREZID","SYMBOL"), OrgDb = "org.Hs.eg.db" ) ) eg <- eg[!duplicated(eg$SYMBOL),] eg <- eg[order(eg$SYMBOL,decreasing=FALSE),] dataDEGsFiltLevel <- dataDEGsFiltLevel[dataDEGsFiltLevel$mRNA %in% eg$ENSEMBL,] dataDEGsFiltLevel <- dataDEGsFiltLevel[eg$ENSEMBL,] rownames(dataDEGsFiltLevel) <- eg$SYMBOL all(eg$SYMBOL == rownames(dataDEGsFiltLevel)) dataDEGsFiltLevel$GeneID <- eg$ENTREZID dataDEGsFiltLevel_sub <- subset(dataDEGsFiltLevel, select = c("GeneID", "logFC")) genelistDEGs <- as.numeric(dataDEGsFiltLevel_sub$logFC) names(genelistDEGs) <- dataDEGsFiltLevel_sub$GeneID library(pathview) # pathway.id: hsa05214 is the glioma pathway # limit: sets the limit for gene expression legend and color hsa05214 <- pathview::pathview( gene.data = genelistDEGs, pathway.id = "hsa05214", species = "hsa", limit = list(gene = as.integer(max(abs(genelistDEGs)))) ) ## ---- eval=FALSE, include=TRUE------------------------------------------------ # ### read biogrid info (available in TCGAWorkflowData as "biogrid") # ### Check last version in https://thebiogrid.org/download.php # file <- "https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/BIOGRID-ALL-LATEST.tab2.zip" # if(!file.exists(gsub("zip","txt",basename(file)))){ # downloader::download(file,basename(file)) # unzip(basename(file),junkpaths =TRUE) # } # # tmp.biogrid <- vroom::vroom( # dir(pattern = "BIOGRID-ALL.*\\.txt") # ) ## ---- eval=TRUE, include=TRUE------------------------------------------------- ### plot details (colors & symbols) mycols <- c('#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff33','#a65628') ### load network inference libraries library(minet) library(c3net) ### deferentially identified genes using TCGAbiolinks # we will use only a subset (first 50 genes) of it to make the example faster names.genes.de <- rownames(diff_expressed_genes)[1:30] data(biogrid) net.biogrid.de <- getAdjacencyBiogrid(tmp.biogrid, names.genes.de) mydata <- exp_filtered_lgg[names.genes.de, ] ### infer networks t.mydata <- t(mydata) net.aracne <- minet(t.mydata, method = "aracne") net.mrnet <- minet(t.mydata) net.clr <- minet(t.mydata, method = "clr") net.c3net <- c3net(mydata) ### validate compared to biogrid network tmp.val <- list( validate(net.aracne, net.biogrid.de), validate(net.mrnet, net.biogrid.de), validate(net.clr, net.biogrid.de), validate(net.c3net, net.biogrid.de) ) ### plot roc and compute auc for the different networks dev1 <- show.roc(tmp.val[[1]],cex=0.3,col=mycols[1],type="l") res.auc <- auc.roc(tmp.val[[1]]) for(count in 2:length(tmp.val)){ show.roc(tmp.val[[count]],device=dev1,cex=0.3,col=mycols[count],type="l") res.auc <- c(res.auc, auc.roc(tmp.val[[count]])) } legend( "bottomright", legend = paste(c("aracne","mrnet","clr","c3net"), signif(res.auc,4), sep=": "), col = mycols[1:length(tmp.val)], lty = 1, bty = "n" ) # Please, uncomment this line to produce the pdf files. # dev.copy2pdf(width=8,height=8,device = dev1, file = paste0("roc_biogrid_",cancertype,".pdf")) ## ---- eval=FALSE, include=TRUE------------------------------------------------ # #---------------------------- # # Obtaining DNA methylation # #---------------------------- # # Samples # lgg.samples <- matchedMetExp("TCGA-LGG", n = 10) # gbm.samples <- matchedMetExp("TCGA-GBM", n = 10) # samples <- c(lgg.samples,gbm.samples) # # #----------------------------------- # # 1 - Methylation # # ---------------------------------- # # For DNA methylation it is quicker in this case to download the tar.gz file # # and get the samples we want instead of downloading files by files # query <- GDCquery( # project = c("TCGA-LGG","TCGA-GBM"), # data.category = "DNA Methylation", # platform = "Illumina Human Methylation 450", # data.type = "Methylation Beta Value", # barcode = samples # ) # GDCdownload(query) # met <- GDCprepare( # query = query, # save = FALSE # ) # # # We will use only chr9 to make the example faster # met <- subset(met,subset = as.character(seqnames(met)) %in% c("chr9")) # # This data is avaliable in the package (object elmerExample) ## ----echo=TRUE, message=FALSE,warning=FALSE, fig.height=8,fig.width=8--------- data(elmerExample) #---------------------------- # Mean methylation #---------------------------- # Plot a barplot for the groups in the disease column in the # summarizedExperiment object # remove probes with NA (similar to na.omit) met <- met[rowSums(is.na(assay(met))) == 0,] df <- data.frame( "Sample.mean" = colMeans(assay(met), na.rm = TRUE), "groups" = met$project_id ) library(ggpubr) ggpubr::ggboxplot( data = df, y = "Sample.mean", x = "groups", color = "groups", add = "jitter", ylab = "Mean DNA methylation (beta-values)", xlab = "" ) + stat_compare_means() ## ----results='hide', echo=TRUE, message=FALSE,warning=FALSE------------------- #------- Searching for differentially methylated CpG sites ---------- dmc <- TCGAanalyze_DMC( data = met, groupCol = "project_id", # a column in the colData matrix group1 = "TCGA-GBM", # a type of the disease type column group2 = "TCGA-LGG", # a type of the disease column p.cut = 0.05, diffmean.cut = 0.15, save = FALSE, legend = "State", plot.filename = "LGG_GBM_metvolcano.png", cores = 1 # if set to 1 there will be a progress bar ) ## ----results='hide', echo=TRUE, message=FALSE,warning=FALSE------------------- #-------------------------- # DNA Methylation heatmap #------------------------- library(ComplexHeatmap) clinical <- plyr::rbind.fill( gbm_clin, lgg_clin ) # get the probes that are Hypermethylated or Hypomethylated # met is the same object of the section 'DNA methylation analysis' status.col <- "status" probes <- rownames(dmc)[grep("hypo|hyper",dmc$status,ignore.case = TRUE)] sig.met <- met[probes,] # top annotation, which samples are LGG and GBM # We will add clinical data as annotation of the samples # we will sort the clinical data to have the same order of the DNA methylation matrix clinical.ordered <- clinical[match(substr(colnames(sig.met),1,12),clinical$bcr_patient_barcode),] ta <- HeatmapAnnotation( df = clinical.ordered[, c("primary_diagnosis", "gender", "vital_status", "race")], col = list( disease = c("LGG" = "grey", "GBM" = "black"), gender = c("male" = "blue", "female" = "pink") ) ) # row annotation: add the status for LGG in relation to GBM # For exmaple: status.gbm.lgg Hypomethyated means that the # mean DNA methylation of probes for lgg are hypomethylated # compared to GBM ones. ra = rowAnnotation( df = dmc[probes, status.col], col = list( "status.TCGA.GBM.TCGA.LGG" = c( "Hypomethylated" = "orange", "Hypermethylated" = "darkgreen" ) ), width = unit(1, "cm") ) heatmap <- Heatmap( matrix = assay(sig.met), name = "DNA methylation", col = matlab::jet.colors(200), show_row_names = FALSE, cluster_rows = TRUE, cluster_columns = FALSE, show_column_names = FALSE, bottom_annotation = ta, column_title = "DNA Methylation" ) # Save to pdf png("heatmap.png",width = 600, height = 400) draw(heatmap, annotation_legend_side = "bottom") dev.off() ## ----results='asis', echo=TRUE, message=FALSE,warning=FALSE------------------- library(rGADEM) library(BSgenome.Hsapiens.UCSC.hg19) library(motifStack) library(SummarizedExperiment) library(dplyr) probes <- rowRanges(met)[rownames(dmc)[grep("hypo|hyper",dmc$status,ignore.case = TRUE)],] # Get hypo/hyper methylated probes and make a 200bp window # surrounding each probe. sequence <- GRanges( seqnames = as.character(seqnames(probes)), IRanges( start = ranges(probes) %>% as.data.frame() %>% dplyr::pull("start") - 100, end = ranges(probes) %>% as.data.frame() %>% dplyr::pull("end") + 100), strand = "*" ) #look for motifs gadem <- GADEM(sequence, verbose = FALSE, genome = Hsapiens) # How many motifs were found? nMotifs(gadem) # get the number of occurrences nOccurrences(gadem) # view all sequences consensus consensus(gadem) # Print motif pwm <- getPWM(gadem) pfm <- new("pfm",mat = pwm[[1]],name = "Novel Site 1") plotMotifLogo(pfm) # Number of instances of motif 1? length(gadem@motifList[[1]]@alignList) ## ----table4, echo=FALSE, message=FALSE, warnings=FALSE, results='asis'-------- tabl <- " | Histone marks | Role | |:----------------------------------------------:|:-------------------------------------------------------------------------------------------------------:| | Histone H3 lysine 4 trimethylation (H3K4me3) | Promoter regions [@heintzman2007distinct,@bernstein2005genomic] | | Histone H3 lysine 4 monomethylation (H3K4me1) | Enhancer regions [@heintzman2007distinct] | | Histone H3 lysine 36 trimethylation (H3K36me3) | Transcribed regions | | Histone H3 lysine 27 trimethylation (H3K27me3) | Polycomb repression [@bonasio2010molecular] | | Histone H3 lysine 9 trimethylation (H3K9me3) | Heterochromatin regions [@peters2003partitioning] | | Histone H3 acetylated at lysine 27 (H3K27ac) | Increase activation of genomic elements [@heintzman2009histone,@rada2011unique,@creyghton2010histone] | | Histone H3 lysine 9 acetylation (H3K9ac) | Transcriptional activation [@nishida2006histone] | " cat(tabl) ## ----table5, echo=FALSE, message=FALSE, warnings=FALSE, results='asis'-------- tabl <- " | File | Description | |:----------------------------------:|:----------------------------------------------------------------------:| | fc.signal.bigwig | Bigwig File containing fold enrichment signal tracks | | pval.signal.bigwig | Bigwig File containing -log10(p-value) signal tracks | | hotspot.fdr0.01.broad.bed.gz | Broad domains on enrichment for DNase-seq for consolidated epigenomes | | hotspot.broad.bed.gz | Broad domains on enrichment for DNase-seq for consolidated epigenomes | | broadPeak.gz | Broad ChIP-seq peaks for consolidated epigenomes | | gappedPeak.gz | Gapped ChIP-seq peaks for consolidated epigenomes | | narrowPeak.gz | Narrow ChIP-seq peaks for consolidated epigenomes | | hotspot.fdr0.01.peaks.bed.gz | Narrow DNasePeaks for consolidated epigenomes | | hotspot.all.peaks.bed.gz | Narrow DNasePeaks for consolidated epigenomes | | .macs2.narrowPeak.gz | Narrow DNasePeaks for consolidated epigenomes | | coreMarks_mnemonics.bed.gz | 15 state chromatin segmentations | | mCRF_FractionalMethylation.bigwig | MeDIP/MRE(mCRF) fractional methylation calls | | RRBS_FractionalMethylation.bigwig | RRBS fractional methylation calls | | WGBS_FractionalMethylation.bigwig | Whole genome bisulphite fractional methylation calls | " cat(tabl) ## ----results='hide', eval=TRUE, echo=TRUE, message=FALSE,warning=FALSE-------- library(ChIPseeker) library(pbapply) library(ggplot2) ## ----results='hide', eval=FALSE, echo=TRUE, message=FALSE,warning=FALSE------- # #------------------ Working with ChipSeq data --------------- # # Step 1: download histone marks for a brain and non-brain samples. # #------------------------------------------------------------ # # loading annotation hub database # library(AnnotationHub) # ah = AnnotationHub() # # # Searching for brain consolidated epigenomes in the roadmap database # bpChipEpi_brain <- query(ah , c("EpigenomeRoadMap", "narrowPeak", "chip", "consolidated","brain","E068")) # # Get chip-seq data # histone.marks <- pblapply(names(bpChipEpi_brain), function(x) {ah[[x]]}) # names(histone.marks) <- names(bpChipEpi_brain) # # OBS: histone.marks is available in TCGAWorkflowData package ## ----getTagMatrix, results='hide', echo=TRUE, message=FALSE,warning=FALSE----- data(histoneMarks) # Create a GR object based on the hypo/hypermethylated probes. probes <- keepStandardChromosomes( rowRanges(met)[rownames(dmc)[dmc$status %in% c("Hypermethylated in TCGA-GBM", "Hypomethylated in TCGA-GBM")],] ) # Defining a window of 3kbp - 3kbp_probe_3kbp # to make it work with ChIPseeker package version "1.31.3.900" attributes(probes)$type <- "start_site" attributes(probes)$downstream <- 3000 attributes(probes)$upstream <- 3000 probes <- GenomicRanges::resize(probes,6001,fix = "center") ### Profile of ChIP peaks binding to TSS regions # First of all, to calculate the profile of ChIP peaks binding to TSS regions, we should # prepare the TSS regions, which are defined as the flanking sequence of the TSS sites. # Then align the peaks that are mapping to these regions and generate the tagMatrix. tagMatrixList <- pbapply::pblapply(histone.marks, function(x) { getTagMatrix(keepStandardChromosomes(x), windows = probes, weightCol = "score") }) # change names retrieved with the following command: basename(bpChipEpi_brain$title) names(tagMatrixList) <- c("H3K4me1","H3K4me3", "H3K9ac", "H3K9me3", "H3K27ac", "H3K27me3", "H3K36me3") ## ----tagHeatmap, eval=FALSE, include=TRUE------------------------------------- # tagHeatmap(tagMatrixList) ## ----results='asis', echo=FALSE, fig.width = 20, message=FALSE,warning=FALSE---- tagHeatmap(tagMatrixList) ## ----plotAvgProf, eval=FALSE, include=TRUE------------------------------------ # p <- plotAvgProf(tagMatrixList, xlim = c(-3000,3000), xlab = "Genomic Region (5'->3', centered on CpG)") # # We are centreing in the CpG instead of the TSS. So we'll change the labels manually # p <- p + scale_x_continuous( # breaks = c(-3000,-1500,0,1500,3000), # labels = c(-3000,-1500,"CpG",1500,3000) # ) # # library(ggthemes) # p + theme_few() + scale_colour_few(name = "Histone marks") + guides(colour = guide_legend(override.aes = list(size=4))) ## ----results='asis', echo=FALSE, message=FALSE,warning=FALSE------------------ p <- plotAvgProf(tagMatrixList, xlim = c(-3000,3000), xlab = "Genomic Region (5'->3', centered on CpG)") # We are centreing in the CpG instead of the TSS. So we'll change the labels manually if (requireNamespace("ggplot2", quietly = TRUE)) { p <- p + ggplot2::scale_x_continuous( breaks = c(-3000,-1500,0,1500,3000), labels = c(-3000,-1500,"CpG",1500,3000) ) } if (requireNamespace("ggthemes", quietly = TRUE)){ p + ggthemes::theme_few() + ggthemes::scale_colour_few(name="Histone marks") + ggplot2::guides(colour = guide_legend(override.aes = list(size=4))) } ## ---- eval=FALSE, include=TRUE------------------------------------------------ # #----------- 8.3 Identification of Regulatory Enhancers ------- # library(TCGAbiolinks) # # Samples: primary solid tumor w/ DNA methylation and gene expression # lgg.samples <- matchedMetExp("TCGA-LGG", n = 10) # gbm.samples <- matchedMetExp("TCGA-GBM", n = 10) # samples <- c(lgg.samples,gbm.samples) # # #----------------------------------- # # 1 - Methylation # # ---------------------------------- # query_met <- GDCquery( # project = c("TCGA-LGG","TCGA-GBM"), # data.category = "DNA Methylation", # platform = "Illumina Human Methylation 450", # data.type = "Methylation Beta Value", # barcode = samples # ) # GDCdownload(query_met) # met <- GDCprepare(query_met, save = FALSE) # met <- subset(met,subset = as.character(GenomicRanges::seqnames(met)) %in% c("chr9")) # # #----------------------------------- # # 2 - Expression # # ---------------------------------- # query_exp <- GDCquery( # project = c("TCGA-LGG","TCGA-GBM"), # data.category = "Gene expression", # data.type = "Gene expression quantification", # platform = "Illumina HiSeq", # file.type = "results", # legacy = TRUE, # barcode = samples # ) # GDCdownload(query_exp) # exp <- GDCprepare(query_exp, save = FALSE) # save(exp, met, gbm.samples, lgg.samples, file = "elmer.example.rda", compress = "xz") ## ----elmer, message = FALSE,warning = FALSE----------------------------------- library(ELMER) library(MultiAssayExperiment) library(GenomicRanges) distal.probes <- get.feature.probe( genome = "hg19", met.platform = "450K" ) # Recover the data created in the last step data(elmerExample) rownames(exp) <- values(exp)$ensembl_gene_id mae <- createMAE( exp = assay(exp), met = met, save = TRUE, linearize.exp = TRUE, save.filename = "mae.rda", filter.probes = distal.probes, met.platform = "450K", genome = "hg19", TCGA = TRUE ) mae ## ---- warning=FALSE, results="hide"------------------------------------------- cores <- 1 # you can use more cores if you want group.col <- "project_id" group1 <- "TCGA-GBM" group2 <- "TCGA-LGG" # Available directions are hypo and hyper, we will use only hypo # due to speed constraint direction <- "hypo" dir.out <- paste0("elmer/",direction) dir.create(dir.out, showWarnings = FALSE, recursive = TRUE) ## ---- warning=FALSE, results="hide"------------------------------------------- #-------------------------------------- # STEP 3: Analysis | #-------------------------------------- # Step 3.1: Get diff methylated probes | #-------------------------------------- message("Get diff methylated probes") Sig.probes <- get.diff.meth( data = mae, group.col = group.col, group1 = group1, group2 = group2, minSubgroupFrac = 1.0, # Use all samples sig.dif = 0.2, # defualt is 0.3 diff.dir = direction, # Search for hypomethylated probes in group 1 cores = cores, dir.out = dir.out, pvalue = 0.1 ) ## ---- warning=FALSE----------------------------------------------------------- datatable( data = Sig.probes[1:10,], options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), rownames = TRUE ) ## ---- warning=FALSE, results="hide"------------------------------------------- #------------------------------------------------------------- # Step 3.2: Identify significant probe-gene pairs | #------------------------------------------------------------- # Collect nearby 20 genes for Sig.probes message("Get nearby genes") nearGenes <- GetNearGenes( data = mae, numFlankingGenes = 4, # default is 20 genes probes = Sig.probes$probe ) ## ----------------------------------------------------------------------------- length(Sig.probes$probe) dim(nearGenes) head(nearGenes) ## ---- warning=FALSE, results="hide"------------------------------------------- message("Get anti correlated probes-genes") pair <- get.pair( data = mae, group.col = group.col, group1 = group1, group2 = group2, nearGenes = nearGenes, mode = "supervised", minSubgroupFrac = 1, # % of samples to use in to create groups U/M raw.pvalue = 0.1, # defualt is 0.001 Pe = 0.5, # Please set to 0.001 to get significant results filter.probes = TRUE, # See preAssociationProbeFiltering function filter.percentage = 0.05, save = FALSE, # Create CVS file filter.portion = 0.3, dir.out = dir.out, diff.dir = direction, cores = cores, label = direction ) ## ---- warning=FALSE----------------------------------------------------------- datatable( pair[1:10,], options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), rownames = TRUE ) ## ---- warning=FALSE, results="hide"------------------------------------------- #------------------------------------------------------------- # Step 3.3: Motif enrichment analysis on the selected probes | #------------------------------------------------------------- enriched.motif <- get.enriched.motif( data = mae, probes = pair$Probe, dir.out = dir.out, label = direction, pvalue = 1, # default is FDR < 0.05 min.incidence = 10, lower.OR = 1.1 ) # One of the output from the previous function is a file with the motif, OR and Number of probes # It will be used for plotting purposes motif.enrichment <- read.csv(paste0(dir.out,"/getMotif.",direction, ".motif.enrichment.csv")) ## ---- warning=FALSE----------------------------------------------------------- head(enriched.motif[names(enriched.motif)[1]]) ## probes in the given set that have the first motif. motif.enrichment %>% head %>% gt::gt() ## ---- warning=FALSE, results="hide"------------------------------------------- #------------------------------------------------------------- # Step 3.4: Identifying regulatory TFs | #------------------------------------------------------------- TF <- get.TFs( data = mae, group.col = group.col, group1 = group1, group2 = group2, mode = "supervised", enriched.motif = enriched.motif, dir.out = dir.out, cores = cores, save.plots = FALSE, diff.dir = direction, label = direction ) # One of the output from the previous function is a file with the raking of TF, # for each motif. It will be used for plotting purposes TF.meth.cor <- get(load(paste0(dir.out,"/getTF.",direction,".TFs.with.motif.pvalue.rda"))) ## ---- warning=FALSE----------------------------------------------------------- datatable( TF, options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), rownames = FALSE ) datatable( TF.meth.cor[1:10,1:6], options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), rownames = TRUE ) ## ---- echo=TRUE, message=FALSE,eval = FALSE, warnings=FALSE, results='asis',fig.height=6, fig.width=10---- # heatmapPairs( # data = mae, # group.col = group.col, # group1 = group1, # group2 = group2, # annotation.col = c("gender"), # pairs = pair, # filename = NULL # ) ## ---- echo=TRUE, message=FALSE, warnings=FALSE, results='asis',fig.height=7,fig.width=15---- motif.enrichment.plot( motif.enrichment = motif.enrichment, save = FALSE, significant = list(lowerOR = 1.1) ) # Filter motifs in the plot lowerOR > 1.3 ## ---- eval=FALSE, include=TRUE------------------------------------------------ # grid:TF.rank.plot(motif.pvalue=TF.meth.cor, motif=TF$motif[1], save=FALSE) ## ----results='asis', echo=FALSE, message=FALSE,warning=FALSE, fig.align='center',fig.height=8,fig.width=8---- gg <- TF.rank.plot(motif.pvalue = TF.meth.cor, motif=TF$motif[1], save=FALSE) grid::grid.draw(gg[[1]]) ## ----results='asis', echo=TRUE, message=FALSE, warning=FALSE------------------ png("TF.png",width = 800, height = 400) scatter.plot( data = mae, category = group.col, save = FALSE, lm_line = TRUE, byTF = list( TF = unlist(stringr::str_split(TF[1,"top_5percent_TFs"],";"))[1:4], probe = enriched.motif[[TF$motif[1]]] ) ) dev.off() ## ----------------------------------------------------------------------------- pander::pander(sessionInfo(), compact = FALSE)