This vignette shows how to load TCGA multi-assay datasets and convert them to Bioconductor core objects (ExpressionSet, GRanges, and GRangesList). It uses the RTCGAToolbox Bioconductor library, but currently you must use the pre-devel version at:
library(BiocInstaller)
biocLite("LiNk-NY/RTCGAToolbox")
All fully-open data from TCGA that is accessible through the firehose_get command-line program.
You have to select a “run date” for processed data:
library(RTCGAToolbox)
(rundates <- getFirehoseRunningDates())
## [1] "20150402" "20150204" "20141206" "20141017" "20140902" "20140715"
## [7] "20140614" "20140518" "20140416" "20140316" "20140215" "20140115"
## [13] "20131210" "20131114" "20131010" "20130923" "20130809" "20130715"
## [19] "20130623" "20130606" "20130523" "20130508" "20130421" "20130406"
## [25] "20130326" "20130309" "20130222" "20130203" "20130116" "20121221"
## [31] "20121206" "20121114" "20121102" "20121024" "20121020" "20121018"
## [37] "20121004" "20120913" "20120825" "20120804" "20120725" "20120707"
## [43] "20120623" "20120606" "20120525" "20120515" "20120425" "20120412"
## [49] "20120321" "20120306" "20120217" "20120124" "20120110" "20111230"
## [55] "20111206" "20111128" "20111115" "20111026"
And an “analysis date” for analyzed data, such as GISTIC2 regions of recurrent copy number variation:
(analysisdates <- getFirehoseAnalyzeDates())
## [1] "20141017" "20140715" "20140416" "20140115" "20130923" "20130523"
## [7] "20130421" "20130326" "20130222" "20130116" "20121221" "20121024"
## [13] "20120913" "20120825" "20120725" "20120623" "20120525" "20120425"
## [19] "20120321" "20120217" "20120124" "20111230" "20111128" "20111026"
## [25] "20110921" "20110728" "20110525" "20110421" "20110327" "20110217"
## [31] "20110114" "20101223"
For exactly reproduceable results you would hard-code one of these, since data can change over time as samples are added or new algorithms are used for processing and analysis, but here will just use the most recent versions rundates[1] and analysisdates[1].
The following commands (not evaluated) will fetch data for five cancer types:
ov <- getFirehoseData("OV", runDate=rundates[1], gistic2_Date=analysisdates[1], RNAseq_Gene=TRUE,
miRNASeq_Gene=TRUE, RNAseq2_Gene_Norm=TRUE, CNA_SNP=TRUE, CNV_SNP=TRUE, CNA_Seq=TRUE,
CNA_CGH=TRUE, Methylation=TRUE, Mutation=TRUE, mRNA_Array=TRUE, miRNA_Array=TRUE,
RPPA=TRUE, todir = NULL)
gbm <- getFirehoseData("GBM", runDate=rundates[1], gistic2_Date=analysisdates[1], RNAseq_Gene=TRUE,
miRNASeq_Gene=TRUE, RNAseq2_Gene_Norm=TRUE, CNA_SNP=TRUE, CNV_SNP=TRUE, CNA_Seq=TRUE,
CNA_CGH=TRUE, Methylation=TRUE, Mutation=TRUE, mRNA_Array=TRUE, miRNA_Array = TRUE,
RPPA=TRUE, todir = NULL)
coad <- getFirehoseData("COAD", runDate=rundates[1], gistic2_Date=analysisdates[1], RNAseq_Gene=TRUE,
miRNASeq_Gene=TRUE, RNAseq2_Gene_Norm=TRUE, CNA_SNP = TRUE, CNV_SNP=TRUE, CNA_Seq = TRUE,
CNA_CGH = TRUE, Methylation = TRUE, Mutation = TRUE, mRNA_Array = TRUE, miRNA_Array = TRUE,
RPPA = TRUE, todir = NULL)
laml <- getFirehoseData("LAML", runDate=rundates[1], gistic2_Date=analysisdates[1], RNAseq_Gene=TRUE,
miRNASeq_Gene=TRUE, RNAseq2_Gene_Norm=TRUE, CNA_SNP = TRUE, CNV_SNP=TRUE, CNA_Seq = TRUE,
CNA_CGH = TRUE, Methylation = TRUE, Mutation = TRUE, mRNA_Array = TRUE, miRNA_Array = TRUE,
RPPA = TRUE, todir = NULL)
blca <- getFirehoseData("BLCA", runDate=rundates[1], gistic2_Date=analysisdates[1], RNAseq_Gene=TRUE,
miRNASeq_Gene=TRUE, RNAseq2_Gene_Norm=TRUE, CNA_SNP = TRUE, CNV_SNP=TRUE, CNA_Seq = TRUE,
CNA_CGH = TRUE, Methylation = TRUE, Mutation = TRUE, mRNA_Array = TRUE, miRNA_Array = TRUE,
RPPA = TRUE, todir = NULL)
Instead, we’ll just load the ovarian cancer dataset from the bioc2015multiomicsworkshop for demonstration:
library(bioc2015multiomicsworkshop)
## Loading required package: ozymandias
## Loading required package: CopyNumber450k
## Loading required package: minfi
## Loading required package: lattice
## Loading required package: SummarizedExperiment
## Loading required package: Biostrings
## Loading required package: XVector
## Loading required package: bumphunter
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: locfit
## locfit 1.5-9.1 2013-03-22
## Setting options('download.file.method.GEOquery'='auto')
## Warning in rgl.init(initValue, onlyNULL): RGL: unable to open X11 display
## Warning in fun(libname, pkgname): error in rgl_init
## Loading required package: DNAcopy
## Loading required package: preprocessCore
## Warning: multiple methods tables found for 'plotPCA'
##
## Attaching package: 'CopyNumber450k'
##
## The following object is masked from 'package:DNAcopy':
##
## plotSample
##
## The following object is masked from 'package:bumphunter':
##
## getSegments
##
## The following object is masked from 'package:BiocGenerics':
##
## plotPCA
##
## Loading required package: ConsensusClusterPlus
## Loading required package: RPMM
## Loading required package: cluster
## Loading required package: mclust
## Package 'mclust' version 5.0.2
## Type 'citation("mclust")' for citing this R package in publications.
## Loading required package: survival
## Warning: replacing previous import by 'GenomicRanges::rowRanges' when
## loading 'ozymandias'
## Loading required package: artemis
## Loading required package: jsonlite
## Note: the specification for S3 class "AsIs" in package 'jsonlite' seems equivalent to one from package 'BiocGenerics': not turning on duplicate class definitions for this class.
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:utils':
##
## View
##
## Loading required package: erccdashboard
## Loading required package: ggplot2
## Loading required package: gridExtra
## Loading required package: ReactomePA
## Loading required package: DBI
##
##
## Attaching package: 'ReactomePA'
##
## The following object is masked from 'package:lattice':
##
## dotplot
##
## Loading required package: limma
##
## Attaching package: 'limma'
##
## The following object is masked from 'package:RPMM':
##
## ebayes
##
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
##
## Loading required package: EnrichmentBrowser
## Loading required package: GSEABase
## Loading required package: annotate
## Loading required package: AnnotationDbi
## Loading required package: XML
## Loading required package: graph
##
## Attaching package: 'graph'
##
## The following object is masked from 'package:XML':
##
## addNode
##
## The following object is masked from 'package:Biostrings':
##
## complement
##
## Loading required package: pathview
## Loading required package: KEGGgraph
##
## Attaching package: 'KEGGgraph'
##
## The following object is masked from 'package:graphics':
##
## plot
##
## Loading required package: org.Hs.eg.db
##
## ##############################################################################
## Pathview is an open source software package distributed under GNU General
## Public License version 3 (GPLv3). Details of GPLv3 is available at
## http://www.gnu.org/licenses/gpl-3.0.html.
##
## The pathview downloads and uses KEGG data. Academic users may freely use the
## KEGG website at http://www.kegg.jp/ or its mirror site at GenomeNet
## http://www.genome.jp/kegg/. Academic users may also freely link to the KEGG
## website. Non-academic users may use the KEGG website as end users for
## non-commercial purposes, but any other use requires a license agreement
## (details at http://www.kegg.jp/kegg/legal.html).
## ##############################################################################
##
##
## Attaching package: 'EnrichmentBrowser'
##
## The following object is masked from 'package:BiocGenerics':
##
## normalize
##
## Loading required package: Homo.sapiens
## Loading required package: OrganismDbi
## Loading required package: GenomicFeatures
## Loading required package: GO.db
## Loading required package: TxDb.Hsapiens.UCSC.hg19.knownGene
## Now getting the GODb Object directly
## Now getting the OrgDb Object directly
## Now getting the TxDb Object directly
## Loading required package: Mus.musculus
## Loading required package: org.Mm.eg.db
##
## Loading required package: TxDb.Mmusculus.UCSC.mm10.knownGene
## Now getting the GODb Object directly
## Now getting the OrgDb Object directly
## Now getting the TxDb Object directly
## Loading required package: RUVSeq
## Loading required package: EDASeq
## Loading required package: ShortRead
## Loading required package: BiocParallel
## Loading required package: Rsamtools
## Loading required package: GenomicAlignments
##
## Attaching package: 'GenomicAlignments'
##
## The following objects are masked from 'package:locfit':
##
## left, right
##
## Note: the specification for S3 class "trellis" in package 'ShortRead' seems equivalent to one from package 'ggbio': not turning on duplicate class definitions for this class.
## Loading required package: edgeR
## Loading required package: ssizeRNA
## Loading required package: tools
##
## Attaching package: 'tools'
##
## The following object is masked from 'package:XML':
##
## toHTML
##
## Note: the specification for class "SimpleListAssays" in package 'artemis' seems equivalent to one from package 'SummarizedExperiment': not turning on duplicate class definitions for this class.
data(laml)
laml
## LAML FirehoseData object
## Standard data run date: 20150402
## Analyze running date: 20141017
## Available slots:
## @Clinical: A data frame, dim: 200 14
## @RNASeqGene: A matrix withraw read counts or normalized data, dim: 19990 179
## @RNASeq2GeneNorm: A matrix withraw read counts or normalized data, dim: 20501 173
## @CNASNP: A data.frame, dim: 874897 6
## @CNVSNP: A data.frame, dim: 28324 6
## @Methylation: A list contains FirehoseMethylationArray object(s), length: 1
## @Mutations: A data.frame, dim: 2585 64
What we really want are Bioconductor core data objects. The extract() function from RTCGAToolbox creates the appropriate object. If clinical=FALSE, it will create simple matrices and GRanges, if clinical=TRUE, it will create ExpressionSet and SummarizedExperiment objects.
Note that there is an unfortunate inconsistency in the the names of the assays between what we saw above in the show method above for the laml object, and in the arguments to getFirehoseData(). The extract() function uses the same data types as in the arguments to getFirehoseData(), with case and underscore-insensitive matching. The following choices are available:
choices <- tolower(gsub("_", "", c("RNAseq_Gene", "miRNASeq_Gene",
"RNAseq2_Gene_Norm", "CNA_SNP", "CNV_SNP", "CNA_Seq",
"CNA_CGH", "Methylation", "Mutation", "mRNA_Array",
"miRNA_Array", "RPPA")))
For example, for copy number we get a GRangesList:
cna <- extract(laml, "cnasnp", clinical=TRUE)
cna
## GRangesList object of length 392:
## $tcga-ab-2802-03
## GRanges object with 56357 ranges and 2 metadata columns:
## seqnames ranges strand | Num_Probes Segment_Mean
## <Rle> <IRanges> <Rle> | <numeric> <numeric>
## [1] chr1 [ 61735, 804456] * | 64 -0.0308
## [2] chr1 [ 809773, 1478153] * | 146 -2.8237
## [3] chr1 [1481049, 1709023] * | 45 -1.5591
## [4] chr1 [1709167, 1789449] * | 53 -0.6314
## [5] chr1 [1791259, 1796003] * | 5 0.6844
## ... ... ... ... ... ... ...
## [56353] chr24 [27190752, 27648670] * | 82 -0.3631
## [56354] chr24 [27651378, 28468768] * | 287 0.357
## [56355] chr24 [28470990, 28584494] * | 40 -0.8857
## [56356] chr24 [28588835, 28741069] * | 99 -0.2521
## [56357] chr24 [28741868, 59018259] * | 26 -1.2785
##
## ...
## <391 more elements>
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
And for RNAseq we get an ExpressionSet:
rnaseq <- extract(laml, "rnaseqgene", clinical=TRUE)
rnaseq
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 19990 features, 179 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: tcga-ab-2803-03 tcga-ab-2805-03 ... tcga-ab-3012-03
## (179 total)
## varLabels: vital_status days_to_death ... center (121 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
There is partial but not full overlap in the patients:
summary(names(cna) %in% sampleNames(rnaseq))
## Mode FALSE TRUE NA's
## logical 216 176 0
They have the same clinical data, albeit as a DataFrame for the GRangesList, and an AnnotatedDataFrame for the ExpressionSet:
elementMetadata(cna)[1:2, 1:4]
## DataFrame with 2 rows and 4 columns
## vital_status days_to_death days_to_last_followup
## <factor> <factor> <factor>
## tcga-ab-2802-03 1 365 NA
## tcga-ab-2802-11 1 365 NA
## primary_site_of_disease
## <factor>
## tcga-ab-2802-03 bone marrow
## tcga-ab-2802-11 bone marrow
pData(rnaseq)[1:2, 1:4]
## vital_status days_to_death days_to_last_followup
## tcga-ab-2803-03 1 792 <NA>
## tcga-ab-2805-03 1 577 <NA>
## primary_site_of_disease
## tcga-ab-2803-03 bone marrow
## tcga-ab-2805-03 bone marrow
Note that in the first two rows of the cna object, we have two different samples for the same patient (tcga-ab-2802 is the patient identifier, -03 and -11 are sample types). See the TCGA barcode information, and the Code Tables Report for an explanation of the sample types. 03 is “Primary Blood Derived Cancer - Peripheral Blood” and 11 is “Solid Tissue Normal”