1 + 2
## [1] 3
x = c(1, 2, 3)
1:3 # sequence of integers from 1 to 3
## [1] 1 2 3
x + c(4, 5, 6) # vectorized
## [1] 5 7 9
x + 4 # recycling
## [1] 5 6 7
Vectors
numeric()
, character()
, logical()
, integer()
, complex()
, …NA
: ‘not available’factor()
: values from restricted set of ‘levels’.Operations
==
, <
, <=
, >
, >=
, …|
(or), &
(and), !
(not)[
, e.g., x[c(2, 3)]
[<-
, e.g., x[c(1, 3)] = x[c(1, 3)]
is.na()
Functions
x = rnorm(100)
y = x + rnorm(100)
plot(x, y)
data.frame
df <- data.frame(Independent = x, Dependent = y)
head(df)
## Independent Dependent
## 1 1.5620127 2.257045
## 2 0.8373360 1.364762
## 3 0.6114359 1.888833
## 4 -0.2294786 1.107088
## 5 -0.2860413 2.180873
## 6 0.2643727 2.117244
df[1:5, 1:2]
## Independent Dependent
## 1 1.5620127 2.257045
## 2 0.8373360 1.364762
## 3 0.6114359 1.888833
## 4 -0.2294786 1.107088
## 5 -0.2860413 2.180873
df[1:5, ]
## Independent Dependent
## 1 1.5620127 2.257045
## 2 0.8373360 1.364762
## 3 0.6114359 1.888833
## 4 -0.2294786 1.107088
## 5 -0.2860413 2.180873
plot(Dependent ~ Independent, df) # 'formula' interface
df[, 1]
, df[, "Independent"]
, df[[1]]
,
df[["Independent"]]
, df$Independent
Exercise: plot only values with Dependent > 0
, Independent > 0
Select rows
ridx <- (df$Dependent > 0) & (df$Independent > 0)
Plot subset
plot(Dependent ~ Independent, df[ridx, ])
Skin the cat another way
plot(
Dependent ~ Independent, df,
subset = (Dependent > 0) & (Independent > 0)
)
fit <- lm(Dependent ~ Independent, df) # linear model -- regression
anova(fit) # summary table
## Analysis of Variance Table
##
## Response: Dependent
## Df Sum Sq Mean Sq F value Pr(>F)
## Independent 1 131.83 131.828 111.15 < 2.2e-16 ***
## Residuals 98 116.23 1.186
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(Dependent ~ Independent, df)
abline(fit)
lm()
: plain-old functionfit
: an object of class “lm”anova()
: a generic with a specific method for class “lm”class(fit)
## [1] "lm"
methods(class="lm")
## [1] add1 alias anova case.names coerce
## [6] confint cooks.distance deviance dfbeta dfbetas
## [11] drop1 dummy.coef effects extractAIC family
## [16] formula hatvalues influence initialize kappa
## [21] labels logLik model.frame model.matrix nobs
## [26] plot predict print proj qr
## [31] residuals rstandard rstudent show simulate
## [36] slotsFromS3 summary variable.names vcov
## see '?methods' for accessing help and source code
?"plot" # plain-old-function or generic
?"plot.formula" # method
library(ggplot2)
ggplot(df) +
aes(x = Independent, y = Dependent) +
geom_point() +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
library(ggplot2)
, once per session)Started 2002 as a platform for understanding analysis of microarray data
More than 2,100 packages. Domains of expertise:
Important themes
Resources
Biostrings themes
length()
, [
, …reverseComplement()
library(Biostrings)
dna <- DNAStringSet(c("AAACTG", "CCCTTCAAC", "TACGAA"))
dna
## DNAStringSet object of length 3:
## width seq
## [1] 6 AAACTG
## [2] 9 CCCTTCAAC
## [3] 6 TACGAA
length(dna)
## [1] 3
dna[c(1, 3, 1)]
## DNAStringSet object of length 3:
## width seq
## [1] 6 AAACTG
## [2] 6 TACGAA
## [3] 6 AAACTG
width(dna)
## [1] 6 9 6
reverseComplement(dna)
## DNAStringSet object of length 3:
## width seq
## [1] 6 CAGTTT
## [2] 9 GTTGAAGGG
## [3] 6 TTCGTA
Help!
methods(class="DNAStringSet")
?"DNAStringSet"
browseVignettes(package="Biostrings")
rtracklayer::import.bed()
and
HelloRanges)length()
, [
, etc.library(GenomicRanges)
gr <- GRanges(c("chr1:100-120", "chr1:115-130", "chr2:200-220"))
gr
## GRanges object with 3 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 100-120 *
## [2] chr1 115-130 *
## [3] chr2 200-220 *
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
Intra-range operations
shift()
, narrow()
, flank()
Inter-range operations
reduce()
, coverage()
, gaps()
, disjoin()
Between-range operations
countOverlaps()
, findOverlaps()
, summarizeOverlaps()
shift(gr, 1)
## GRanges object with 3 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 101-121 *
## [2] chr1 116-131 *
## [3] chr2 201-221 *
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
reduce(gr)
## GRanges object with 2 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 100-130 *
## [2] chr2 200-220 *
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
anno <- GRanges(c("chr1:110-150", "chr2:150-210"))
countOverlaps(anno, gr)
## [1] 2 1
Help!
methods(class="GRanges")
methods(class="GRangesList")
?"GRanges"
?"GRangesList"
browseVignettes(package="GenomicRanges")
Lists of Genomic Ranges
Component parts
counts <- read.csv("lecture-01-data/airway_counts.csv", row.names=1)
counts <- as.matrix(counts)
head(counts, 3)
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003 679 448 873 408 1138
## ENSG00000000419 467 515 621 365 587
## ENSG00000000457 260 211 263 164 245
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003 1047 770 572
## ENSG00000000419 799 417 508
## ENSG00000000457 331 233 229
colData <- read.csv("lecture-01-data/airway_colData.csv", row.names=1)
colData[, 1:4]
## SampleName cell dex albut
## SRR1039508 GSM1275862 N61311 untrt untrt
## SRR1039509 GSM1275863 N61311 trt untrt
## SRR1039512 GSM1275866 N052611 untrt untrt
## SRR1039513 GSM1275867 N052611 trt untrt
## SRR1039516 GSM1275870 N080611 untrt untrt
## SRR1039517 GSM1275871 N080611 trt untrt
## SRR1039520 GSM1275874 N061011 untrt untrt
## SRR1039521 GSM1275875 N061011 trt untrt
rowRanges <- readRDS("lecture-01-data/airway_rowRanges.rds")
rowRanges
## GRangesList object of length 33469:
## $ENSG00000000003
## GRanges object with 17 ranges and 2 metadata columns:
## seqnames ranges strand | exon_id exon_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] X 99883667-99884983 - | 667145 ENSE00001459322
## [2] X 99885756-99885863 - | 667146 ENSE00000868868
## [3] X 99887482-99887565 - | 667147 ENSE00000401072
## [4] X 99887538-99887565 - | 667148 ENSE00001849132
## [5] X 99888402-99888536 - | 667149 ENSE00003554016
## ... ... ... ... . ... ...
## [13] X 99890555-99890743 - | 667156 ENSE00003512331
## [14] X 99891188-99891686 - | 667158 ENSE00001886883
## [15] X 99891605-99891803 - | 667159 ENSE00001855382
## [16] X 99891790-99892101 - | 667160 ENSE00001863395
## [17] X 99894942-99894988 - | 667161 ENSE00001828996
## -------
## seqinfo: 722 sequences (1 circular) from an unspecified genome
##
## ...
## <33468 more elements>
Could manipulate independently…
cidx <- colData$dex == "trt"
log1p_counts <- log1p(counts)
plot(rowMeans(log1p_counts[,cidx]) ~ rowMeans(log1p_counts[,!cidx]))
colData
rows had been re-ordered?Solution: coordinate in a single object – SummarizedExperiment
library(SummarizedExperiment)
se <- SummarizedExperiment(
assays = list(counts = counts),
rowRanges = rowRanges, colData = colData
)
assay(se, "log1p_counts") <- log1p(assay(se, "counts"))
## manipulate rows and columns in a coordinated fashion...
dim()
, two-dimensional [
, …assay()
, rowData()
/ rowRanges()
, colData()
, …Help!
methods(class="SummarizedExperiment")
?"SummarizedExperiment"
browseVignettes(package="SummarizedExperiment")
Web site, https://bioconductor.org
Support site, https://support.bioconductor.org
2140 ‘software’ packages, https://bioconductor.org/packages
Discovery and use, e.g., DESeq2
Single-cell analysis
sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] SummarizedExperiment_1.26.1 Biobase_2.56.0
## [3] MatrixGenerics_1.8.0 matrixStats_0.62.0
## [5] GenomicRanges_1.48.0 Biostrings_2.64.0
## [7] GenomeInfoDb_1.32.2 XVector_0.36.0
## [9] IRanges_2.30.0 S4Vectors_0.34.0
## [11] BiocGenerics_0.42.0 ggplot2_3.3.6
## [13] BiocStyle_2.24.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.8.3 lattice_0.20-45 assertthat_0.2.1
## [4] digest_0.6.29 utf8_1.2.2 R6_2.5.1
## [7] evaluate_0.15 highr_0.9 pillar_1.7.0
## [10] zlibbioc_1.42.0 rlang_1.0.2 jquerylib_0.1.4
## [13] magick_2.7.3 Matrix_1.4-1 rmarkdown_2.14
## [16] labeling_0.4.2 splines_4.2.0 stringr_1.4.0
## [19] RCurl_1.98-1.7 munsell_0.5.0 DelayedArray_0.22.0
## [22] compiler_4.2.0 xfun_0.31 pkgconfig_2.0.3
## [25] mgcv_1.8-40 htmltools_0.5.2 tidyselect_1.1.2
## [28] tibble_3.1.7 GenomeInfoDbData_1.2.8 bookdown_0.27
## [31] codetools_0.2-18 fansi_1.0.3 crayon_1.5.1
## [34] dplyr_1.0.9 withr_2.5.0 bitops_1.0-7
## [37] grid_4.2.0 nlme_3.1-157 jsonlite_1.8.0
## [40] gtable_0.3.0 lifecycle_1.0.1 DBI_1.1.2
## [43] magrittr_2.0.3 scales_1.2.0 cli_3.3.0
## [46] stringi_1.7.6 farver_2.1.0 bslib_0.3.1
## [49] ellipsis_0.3.2 generics_0.1.2 vctrs_0.4.1
## [52] tools_4.2.0 glue_1.6.2 purrr_0.3.4
## [55] fastmap_1.1.0 yaml_2.3.5 colorspace_2.0-3
## [58] BiocManager_1.30.18 knitr_1.39 sass_0.4.1
Research reported in this tutorial is supported by the National Human Genome Research Institute and the National Cancer Institute of the National Institutes of Health under award numbers U24HG004059 (Bioconductor), U24HG010263 (AnVIL) and U24CA180996 (ITCR).