Author: Martin Morgan (mtmorgan@fredhutch.org)
Date: 7 September, 2015
Back to Workshop Outline
The material in this document requires R version 3.2 and Bioconductor version 3.1
stopifnot(
getRversion() >= '3.2' && getRversion() < '3.3',
BiocInstaller::biocVersion() >= "3.1"
)
Notes
BAMSpector – display gene models and underlying support across BAM (aligned read) files
app <- system.file(package="BiocAsiaPacific2015", "BAMSpector")
shiny::runApp(app)
MAPlotExplorer – summarize two-group differential expression, including drill-down of individual genes. Based on CSAMA 2015 lab by Andrzej Oles.
app <- system.file(package="BiocAsiaPacific2015", "MAPlotExplorer")
shiny::runApp(app)
Annotation
Standard (large) file input & manipulation, e.g., BAM files of aligned reads
Statistical analysis
Key resources
web site: http://biocondcutor.org
support: https://support.bioconductor.org
factor()
, NA
logical
,
integer
, numeric
, complex
, character
, byte
matrix
– atomic vector with 'dim' attributedata.frame
– list of equal length atomic vectorslm()
, belowrnorm(1000)
print()
.print.factor
; methods are invoked indirectly, via the generic.abline()
used below is a plain-old-funciton.Iteration:
lapply()
args(lapply)
## function (X, FUN, ...)
## NULL
X
(typically a list()
), apply a
function FUN
to each vector element, returning the result as
a list. ...
are additional arguments to FUN
.FUN
can be built-in, or a user-defined functionlst <- list(a=1:2, b=2:4)
lapply(lst, log) # 'base' argument default; natural log
## $a
## [1] 0.0000000 0.6931472
##
## $b
## [1] 0.6931472 1.0986123 1.3862944
lapply(lst, log, 10) # '10' is second argument to 'log()', i.e., log base 10
## $a
## [1] 0.00000 0.30103
##
## $b
## [1] 0.3010300 0.4771213 0.6020600
sapply()
– like lapply()
, but simplify the result to a
vector, matrix, or array, if possible.vapply()
– like sapply()
, but requires that the return
type of FUN
is specified; this can be safer – an error when
the result is of an unexpected type.mapply()
(also Map()
)
args(mapply)
## function (FUN, ..., MoreArgs = NULL, SIMPLIFY = TRUE, USE.NAMES = TRUE)
## NULL
...
are one or more vectors, recycled to be of the same
length. FUN
is a function that takes as many arguments as
there are components of ...
. mapply
returns the result of
applying FUN
to the elements of the vectors in ...
.mapply(seq, 1:3, 4:6, SIMPLIFY=FALSE) # seq(1, 4); seq(2, 5); seq(3, 6)
## [[1]]
## [1] 1 2 3 4
##
## [[2]]
## [1] 2 3 4 5
##
## [[3]]
## [1] 3 4 5 6
apply()
args(apply)
## function (X, MARGIN, FUN, ...)
## NULL
X
, apply FUN
to each MARGIN
(dimension, e.g., MARGIN=1
means apply FUN
to each row,
MARGIN=2
means apply FUN
to each column)Traditional iteration programming constructs repeat {}
, for () {}
lapply()
!Conditional
if (test) {
## code if TEST == TRUE
} else {
## code if TEST == FALSE
}
Functions (see table below for a few favorites)
fun <- function(x) {
length(unique(x))
}
## list of length 5, each containsing a sample (with replacement) of letters
lets <- replicate(5, sample(letters, 50, TRUE), simplify=FALSE)
sapply(lets, fun)
## [1] 25 23 21 22 20
Introspection
class()
, str()
dim()
Help
?"print"
: help on the generic print?"print.data.frame"
: help on print method for objects of class
data.frame.help(package="GenomeInfoDb")
browseVignettes("GenomicRanges")
methods("plot")
methods(class="lm")
R vectors, vectorized operations, data.frame()
, formulas,
functions, objects, class and method discovery (introspection).
x <- rnorm(1000) # atomic vectors
y <- x + rnorm(1000, sd=.5)
df <- data.frame(x=x, y=y) # object of class 'data.frame'
plot(y ~ x, df) # generic plot, method plot.formula
fit <- lm(y ~x, df) # object of class 'lm'
methods(class=class(fit)) # introspection
## [1] add1 alias anova case.names coerce confint
## [7] cooks.distance deviance dfbeta dfbetas drop1 dummy.coef
## [13] effects extractAIC family formula hatvalues influence
## [19] initialize kappa labels logLik model.frame model.matrix
## [25] nobs plot predict print proj qr
## [31] residuals rstandard rstudent show simulate slotsFromS3
## [37] summary variable.names vcov
## see '?methods' for accessing help and source code
anova(fit)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x 1 963.82 963.82 3649.1 < 2.2e-16 ***
## Residuals 998 263.60 0.26
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(y ~ x, df) # methods(plot); ?plot.formula
abline(fit, col="red", lwd=3, lty=2) # a function, not generic.method
Programming example – group 1000 SYMBOLs into GO identifiers
## example data
fl <- file.choose() ## symgo.csv
symgo <- read.csv(fl, row.names=1, stringsAsFactors=FALSE)
head(symgo)
## SYMBOL GO EVIDENCE ONTOLOGY
## 1 PPIAP28 <NA> <NA> <NA>
## 2 PTLAH <NA> <NA> <NA>
## 3 HIST1H2BC GO:0000786 NAS CC
## 4 HIST1H2BC GO:0000788 IBA CC
## 5 HIST1H2BC GO:0002227 IDA BP
## 6 HIST1H2BC GO:0003677 IBA MF
dim(symgo)
## [1] 5041 4
length(unique(symgo$SYMBOL))
## [1] 1000
## split-sapply
go2sym <- split(symgo$SYMBOL, symgo$GO)
len1 <- sapply(go2sym, length) # compare with lapply, vapply
## built-in functions for common actions
len2 <- lengths(go2sym)
identical(len1, len2)
## [1] TRUE
## smarter built-in functions, e.g., omiting NAs
len3 <- aggregate(SYMBOL ~ GO, symgo, length)
head(len3)
## GO SYMBOL
## 1 GO:0000049 3
## 2 GO:0000050 2
## 3 GO:0000060 1
## 4 GO:0000077 1
## 5 GO:0000086 3
## 6 GO:0000118 1
## more fun with aggregate()
head(aggregate(GO ~ SYMBOL, symgo, length))
## SYMBOL GO
## 1 ABCD4 15
## 2 ABCG2 22
## 3 ACE 57
## 4 ADAMTSL2 6
## 5 ALDH1L2 11
## 6 ALOX5 19
head(aggregate(SYMBOL ~ GO, symgo, c))
## GO SYMBOL
## 1 GO:0000049 YARS2, YARS2, EEF1A1
## 2 GO:0000050 ASL, ASL
## 3 GO:0000060 OPRD1
## 4 GO:0000077 PEA15
## 5 GO:0000086 TUBB4A, CENPF, CLASP1
## 6 GO:0000118 CIR1
## your own function -- unique, lower-case identifiers
uidfun <- function(x) {
unique(tolower(x))
}
head(aggregate(SYMBOL ~ GO , symgo, uidfun))
## GO SYMBOL
## 1 GO:0000049 yars2, eef1a1
## 2 GO:0000050 asl
## 3 GO:0000060 oprd1
## 4 GO:0000077 pea15
## 5 GO:0000086 tubb4a, cenpf, clasp1
## 6 GO:0000118 cir1
## as an 'anonymous' function
head(aggregate(SYMBOL ~ GO, symgo, function(x) {
unique(tolower(x))
}))
## GO SYMBOL
## 1 GO:0000049 yars2, eef1a1
## 2 GO:0000050 asl
## 3 GO:0000060 oprd1
## 4 GO:0000077 pea15
## 5 GO:0000086 tubb4a, cenpf, clasp1
## 6 GO:0000118 cir1
These case studies serve as refreshers on R input and manipulation of data.
Input a file that contains ALL (acute lymphoblastic leukemia) patient information
fname <- file.choose() ## "ALLphenoData.tsv"
stopifnot(file.exists(fname))
pdata <- read.delim(fname)
Check out the help page ?read.delim
for input options, and explore
basic properties of the object you've created, for instance…
class(pdata)
## [1] "data.frame"
colnames(pdata)
## [1] "id" "diagnosis" "sex" "age" "BT"
## [6] "remission" "CR" "date.cr" "t.4.11." "t.9.22."
## [11] "cyto.normal" "citog" "mol.biol" "fusion.protein" "mdr"
## [16] "kinet" "ccr" "relapse" "transplant" "f.u"
## [21] "date.last.seen"
dim(pdata)
## [1] 127 21
head(pdata)
## id diagnosis sex age BT remission CR date.cr t.4.11. t.9.22. cyto.normal citog
## 1 1005 5/21/1997 M 53 B2 CR CR 8/6/1997 FALSE TRUE FALSE t(9;22)
## 2 1010 3/29/2000 M 19 B2 CR CR 6/27/2000 FALSE FALSE FALSE simple alt.
## 3 3002 6/24/1998 F 52 B4 CR CR 8/17/1998 NA NA NA <NA>
## 4 4006 7/17/1997 M 38 B1 CR CR 9/8/1997 TRUE FALSE FALSE t(4;11)
## 5 4007 7/22/1997 M 57 B2 CR CR 9/17/1997 FALSE FALSE FALSE del(6q)
## 6 4008 7/30/1997 M 17 B1 CR CR 9/27/1997 FALSE FALSE FALSE complex alt.
## mol.biol fusion.protein mdr kinet ccr relapse transplant f.u date.last.seen
## 1 BCR/ABL p210 NEG dyploid FALSE FALSE TRUE BMT / DEATH IN CR <NA>
## 2 NEG <NA> POS dyploid FALSE TRUE FALSE REL 8/28/2000
## 3 BCR/ABL p190 NEG dyploid FALSE TRUE FALSE REL 10/15/1999
## 4 ALL1/AF4 <NA> NEG dyploid FALSE TRUE FALSE REL 1/23/1998
## 5 NEG <NA> NEG dyploid FALSE TRUE FALSE REL 11/4/1997
## 6 NEG <NA> NEG hyperd. FALSE TRUE FALSE REL 12/15/1997
summary(pdata$sex)
## F M NA's
## 42 83 2
summary(pdata$cyto.normal)
## Mode FALSE TRUE NA's
## logical 69 24 34
Remind yourselves about various ways to subset and access columns of a data.frame
pdata[1:5, 3:4]
## sex age
## 1 M 53
## 2 M 19
## 3 F 52
## 4 M 38
## 5 M 57
pdata[1:5, ]
## id diagnosis sex age BT remission CR date.cr t.4.11. t.9.22. cyto.normal citog mol.biol
## 1 1005 5/21/1997 M 53 B2 CR CR 8/6/1997 FALSE TRUE FALSE t(9;22) BCR/ABL
## 2 1010 3/29/2000 M 19 B2 CR CR 6/27/2000 FALSE FALSE FALSE simple alt. NEG
## 3 3002 6/24/1998 F 52 B4 CR CR 8/17/1998 NA NA NA <NA> BCR/ABL
## 4 4006 7/17/1997 M 38 B1 CR CR 9/8/1997 TRUE FALSE FALSE t(4;11) ALL1/AF4
## 5 4007 7/22/1997 M 57 B2 CR CR 9/17/1997 FALSE FALSE FALSE del(6q) NEG
## fusion.protein mdr kinet ccr relapse transplant f.u date.last.seen
## 1 p210 NEG dyploid FALSE FALSE TRUE BMT / DEATH IN CR <NA>
## 2 <NA> POS dyploid FALSE TRUE FALSE REL 8/28/2000
## 3 p190 NEG dyploid FALSE TRUE FALSE REL 10/15/1999
## 4 <NA> NEG dyploid FALSE TRUE FALSE REL 1/23/1998
## 5 <NA> NEG dyploid FALSE TRUE FALSE REL 11/4/1997
head(pdata[, 3:5])
## sex age BT
## 1 M 53 B2
## 2 M 19 B2
## 3 F 52 B4
## 4 M 38 B1
## 5 M 57 B2
## 6 M 17 B1
tail(pdata[, 3:5], 3)
## sex age BT
## 125 M 19 T2
## 126 M 30 T3
## 127 M 29 T2
head(pdata$age)
## [1] 53 19 52 38 57 17
head(pdata$sex)
## [1] M M F M M M
## Levels: F M
head(pdata[pdata$age > 21,])
## id diagnosis sex age BT remission CR date.cr t.4.11. t.9.22. cyto.normal citog
## 1 1005 5/21/1997 M 53 B2 CR CR 8/6/1997 FALSE TRUE FALSE t(9;22)
## 3 3002 6/24/1998 F 52 B4 CR CR 8/17/1998 NA NA NA <NA>
## 4 4006 7/17/1997 M 38 B1 CR CR 9/8/1997 TRUE FALSE FALSE t(4;11)
## 5 4007 7/22/1997 M 57 B2 CR CR 9/17/1997 FALSE FALSE FALSE del(6q)
## 10 8001 1/15/1997 M 40 B2 CR CR 3/26/1997 FALSE FALSE FALSE del(p15)
## 11 8011 8/21/1998 M 33 B3 CR CR 10/8/1998 FALSE FALSE FALSE del(p15/p16)
## mol.biol fusion.protein mdr kinet ccr relapse transplant f.u date.last.seen
## 1 BCR/ABL p210 NEG dyploid FALSE FALSE TRUE BMT / DEATH IN CR <NA>
## 3 BCR/ABL p190 NEG dyploid FALSE TRUE FALSE REL 10/15/1999
## 4 ALL1/AF4 <NA> NEG dyploid FALSE TRUE FALSE REL 1/23/1998
## 5 NEG <NA> NEG dyploid FALSE TRUE FALSE REL 11/4/1997
## 10 BCR/ABL p190 NEG <NA> FALSE TRUE FALSE REL 7/11/1997
## 11 BCR/ABL p190/p210 NEG dyploid FALSE FALSE TRUE BMT / DEATH IN CR <NA>
It seems from below that there are 17 females over 40 in the data set,
but when sub-setting pdata
to contain just those individuals 19 rows
are selected. Why? What can we do to correct this?
idx <- pdata$sex == "F" & pdata$age > 40
table(idx)
## idx
## FALSE TRUE
## 108 17
dim(pdata[idx,])
## [1] 19 21
Use the mol.biol
column to subset the data to contain just
individuals with 'BCR/ABL' or 'NEG', e.g.,
bcrabl <- pdata[pdata$mol.biol %in% c("BCR/ABL", "NEG"),]
The mol.biol
column is a factor, and retains all levels even after
subsetting. How might you drop the unused factor levels?
bcrabl$mol.biol <- factor(bcrabl$mol.biol)
The BT
column is a factor describing B- and T-cell subtypes
levels(bcrabl$BT)
## [1] "B" "B1" "B2" "B3" "B4" "T" "T1" "T2" "T3" "T4"
How might one collapse B1, B2, … to a single type B, and likewise for T1, T2, …, so there are only two subtypes, B and T
table(bcrabl$BT)
##
## B B1 B2 B3 B4 T T1 T2 T3 T4
## 4 9 35 22 9 4 1 15 9 2
levels(bcrabl$BT) <- substring(levels(bcrabl$BT), 1, 1)
table(bcrabl$BT)
##
## B T
## 79 31
Use xtabs()
(cross-tabulation) to count the number of samples with
B- and T-cell types in each of the BCR/ABL and NEG groups
xtabs(~ BT + mol.biol, bcrabl)
## mol.biol
## BT BCR/ABL NEG
## B 37 42
## T 0 31
Use aggregate()
to calculate the average age of males and females in
the BCR/ABL and NEG treatment groups.
aggregate(age ~ mol.biol + sex, bcrabl, mean)
## mol.biol sex age
## 1 BCR/ABL F 39.93750
## 2 NEG F 30.42105
## 3 BCR/ABL M 40.50000
## 4 NEG M 27.21154
Use t.test()
to compare the age of individuals in the BCR/ABL versus
NEG groups; visualize the results using boxplot()
. In both cases,
use the formula
interface. Consult the help page ?t.test
and re-do
the test assuming that variance of ages in the two groups is
identical. What parts of the test output change?
t.test(age ~ mol.biol, bcrabl)
##
## Welch Two Sample t-test
##
## data: age by mol.biol
## t = 4.8172, df = 68.529, p-value = 8.401e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 7.13507 17.22408
## sample estimates:
## mean in group BCR/ABL mean in group NEG
## 40.25000 28.07042
boxplot(age ~ mol.biol, bcrabl)