## ---- fig.show='hold'---------------------------------------------------- library(TCGAWorkflowData) data("elmerExample") data("GBMnocnvhg19") data("GBMIllumina_HiSeq") data("LGGIllumina_HiSeq") data("met20SamplesGBMLGGchr9") data("mafMutect2LGGGBM") data("markersMatrix") data("histoneMarks") data("biogrid") data("genes_GR") ## ---- eval = FALSE, message=FALSE,warning=FALSE, include=TRUE------------ # library(GenomicRanges) # library(TCGAbiolinks) # ############################## # ## Recurrent CNV annotation ## # ############################## # # Get gene information from GENCODE using biomart # genes <- TCGAbiolinks:::get.GRCh.bioMart(genome = "hg19") # genes <- genes[genes$external_gene_id != "" & genes$chromosome_name %in% c(1:22,"X","Y"),] # genes[genes$chromosome_name == "X", "chromosome_name"] <- 23 # genes[genes$chromosome_name == "Y", "chromosome_name"] <- 24 # genes$chromosome_name <- sapply(genes$chromosome_name,as.integer) # genes <- genes[order(genes$start_position),] # genes <- genes[order(genes$chromosome_name),] # genes <- genes[,c("external_gene_id", "chromosome_name", "start_position","end_position")] # colnames(genes) <- c("GeneSymbol","Chr","Start","End") # genes_GR <- makeGRangesFromDataFrame(genes,keep.extra.columns = TRUE) # save(genes_GR,genes,file = "genes_GR.rda", compress = "xz") ## ---- eval=FALSE, message=FALSE, warning=FALSE, include=TRUE------------- # library(RTCGAToolbox) # # Download GISTIC results # lastAnalyseDate <- getFirehoseAnalyzeDates(1) # gistic <- getFirehoseData("GBM",gistic2_Date = lastAnalyseDate) # # # get GISTIC results # gistic.allbygene <- getData(gistic,type = "GISTIC", CN = "All") # gistic.allbygene <- gistic.allbygene[1:10,] # gistic.thresholedbygene <- getData(gistic,type = "GISTIC", CN = "Thresholed") # gistic.thresholedbygene <- gistic.thresholedbygene[1:10,] # save(gistic.allbygene,gistic.thresholedbygene,file = "GBMGistic.rda", compress = "xz") ## ---- eval=FALSE, include=TRUE, results='asis'--------------------------- # library(TCGAbiolinks) # query.gbm.nocnv <- GDCquery(project = "TCGA-GBM", # data.category = "Copy number variation", # legacy = TRUE, # file.type = "nocnv_hg19.seg", # sample.type = c("Primary solid Tumor")) # # to reduce time we will select only 20 samples # query.gbm.nocnv$results[[1]] <- query.gbm.nocnv$results[[1]][1:20,] # # GDCdownload(query.gbm.nocnv, chunks.per.download = 100) # # gbm.nocnv <- GDCprepare(query.gbm.nocnv, save = TRUE, save.filename = "GBMnocnvhg19.rda") ## ---- eval=FALSE, include=TRUE, results='asis'--------------------------- # query <- GDCquery(project = "TCGA-GBM", # data.category = "Gene expression", # data.type = "Gene expression quantification", # platform = "Illumina HiSeq", # file.type = "results", # sample.type = c("Primary solid Tumor"), # legacy = TRUE) # # We will use only 20 samples to make the example faster # query$results[[1]] <- query$results[[1]][1:20,] # GDCdownload(query) # gbm.exp <- GDCprepare(query, save = TRUE, summarizedExperiment = TRUE, save.filename = "GBMIllumina_HiSeq.rda") # # query <- GDCquery(project = "TCGA-LGG", # data.category = "Gene expression", # data.type = "Gene expression quantification", # platform = "Illumina HiSeq", # file.type = "results", # sample.type = c("Primary solid Tumor"), # legacy = TRUE) # # We will use only 20 samples to make the example faster # query$results[[1]] <- query$results[[1]][1:20,] # GDCdownload(query) # lgg.exp <- GDCprepare(query, save = TRUE, summarizedExperiment = TRUE, save.filename = "LGGIllumina_HiSeq.rda") ## ---- eval=FALSE, include=TRUE, results='asis'--------------------------- # #----------- 8.3 Identification of Regulatory Enhancers ------- # library(TCGAbiolinks) # # Samples: primary solid tumor w/ DNA methylation and gene expression # matched_met_exp <- function(project, n = NULL){ # # get primary solid tumor samples: DNA methylation # message("Download DNA methylation information") # met450k <- GDCquery(project = project, # data.category = "DNA methylation", # platform = "Illumina Human Methylation 450", # legacy = TRUE, # sample.type = c("Primary solid Tumor")) # met450k.tp <- met450k$results[[1]]$cases # # # get primary solid tumor samples: RNAseq # message("Download gene expression information") # exp <- GDCquery(project = project, # data.category = "Gene expression", # data.type = "Gene expression quantification", # platform = "Illumina HiSeq", # file.type = "normalized_results", # sample.type = c("Primary solid Tumor"), # legacy = TRUE) # exp.tp <- exp$results[[1]]$cases # # Get patients with samples in both platforms # patients <- unique(substr(exp.tp,1,15)[substr(exp.tp,1,12) %in% substr(met450k.tp,1,12)] ) # if(!is.null(n)) patients <- patients[1:n] # get only n samples # return(patients) # } # lgg.samples <- matched_met_exp("TCGA-LGG", n = 10) # gbm.samples <- matched_met_exp("TCGA-GBM", n = 10) # samples <- c(lgg.samples,gbm.samples) # # #----------------------------------- # # 1 - Methylation # # ---------------------------------- # query.met <- GDCquery(project = c("TCGA-LGG","TCGA-GBM"), # data.category = "DNA methylation", # platform = "Illumina Human Methylation 450", # legacy = TRUE, # barcode = samples) # GDCdownload(query.met) # met <- GDCprepare(query.met, save = FALSE) # met <- subset(met,subset = as.character(GenomicRanges::seqnames(met)) %in% c("chr9")) # save("met.20.samples.GBM.LGG.chr9.rda") # # met.elmer <- TCGAprepare_elmer(met, platform = "HumanMethylation450") # # #----------------------------------- # # 2 - Expression # # ---------------------------------- # query.exp <- GDCquery(project = c("TCGA-LGG","TCGA-GBM"), # data.category = "Gene expression", # data.type = "Gene expression quantification", # platform = "Illumina HiSeq", # file.type = "normalized_results", # legacy = TRUE, # barcode = samples) # GDCdownload(query.exp) # exp.lgg <- GDCprepare(query.exp, save = FALSE) # exp.elmer <- TCGAprepare_elmer(exp.lgg, platform = "IlluminaHiSeq_RNASeqV2") # save(exp.elmer, met.elmer, gbm.samples, lgg.samples, file = "elmerExample.rda", compress = "xz") ## ---- eval=FALSE, include=TRUE, results='asis'--------------------------- # library(TCGAbiolinks) # LGGmut <- GDCquery_Maf(tumor = "LGG", pipelines = "mutect2") # GBMmut <- GDCquery_Maf(tumor = "GBM", pipelines = "mutect2") # mut <- plyr::rbind.fill(LGGmut, GBMmut) # save(mut,LGGmut,GBMmut,file ="mafMutect2LGGGBM.rda") ## ---- eval=FALSE, include=TRUE, results='asis'--------------------------- # gdac.root <- "ftp://ftp.broadinstitute.org/pub/GISTIC2.0/hg19_support/" # file <- paste0(gdac.root, "genome.info.6.0_hg19.na31_minus_frequent_nan_probes_sorted_2.1.txt") # # Retrieve probes meta file from broadinstitute website # if(!file.exists(basename(file))) downloader::download(file, basename(file)) # # -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--=-==--=--==---=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--=-==--=--==-- # # For hg38 analysis please take a look on: # # https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files # # File: SNP6 GRCh38 Liftover Probeset File for Copy Number Variation Analysis # # -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--=-==--=--==---=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--=-==--=--==-- # markersMatrix <- readr::read_tsv(basename(file), col_names = FALSE, col_types = "ccn", progress = FALSE) # save(markersMatrix, file = "markersMatrix.rda", compress = "xz") ## ---- eval=FALSE, include=TRUE, results='asis'--------------------------- # ### read biogrid info # ### Check last version in https://thebiogrid.org/download.php # file <- "http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-3.4.146/BIOGRID-ALL-3.4.146.tab2.zip" # if(!file.exists(gsub("zip","txt",basename(file)))){ # downloader::download(file,basename(file)) # unzip(basename(file),junkpaths =TRUE) # } # # tmp.biogrid <- read.csv(gsub("zip","txt",basename(file)), header=TRUE, sep="\t", stringsAsFactors=FALSE) # save(tmp.biogrid, file = "biogrid.rda", compress = "xz") ## ----results='hide', eval=FALSE, echo=FALSE, message=FALSE,warning=FALSE---- # library(ChIPseeker) # library(AnnotationHub) # library(pbapply) # library(ggplot2) # #------------------ Working with ChipSeq data --------------- # # Step 1: download histone marks for a brain and non-brain samples. # #------------------------------------------------------------ # # loading annotation hub database # ah = AnnotationHub() # # # Searching for brain consolidated epigenomes in the roadmap database # bpChipEpi_brain <- query(ah , c("EpigenomeRoadMap", "narrowPeak", "chip", "consolidated","brain","E068")) # # # Get chip-seq data # histone.marks <- pblapply(names(bpChipEpi_brain), function(x) {ah[[x]]}) # names(histone.marks) <- names(bpChipEpi_brain) # save(histone.marks, file = "histoneMarks.rda", compress = "xz") ## ----sessionInfo, results='asis', echo=FALSE----------------------------- pander::pander(sessionInfo(), compact = FALSE)