These notes were created during the course, and server as a transcript of topics covered.
Workflow
GenomicFeatures::summarizeOverlaps()
)View from the Linux command line…
zcat *fastq.gz | less
samtools view -h *bam
… or within R / Bioconductor: fastq files
library(ShortRead)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
## grep, grepl, intersect, is.unsorted, lapply, lengths, Map,
## mapply, match, mget, order, paste, pmax, pmax.int, pmin,
## pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
## setdiff, sort, table, tapply, union, unique, unlist, unsplit
##
## Loading required package: BiocParallel
## Loading required package: Biostrings
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: Rsamtools
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: GenomicAlignments
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
strm = FastqStreamer("bigdata/SRR1039508_1.fastq.gz", 100000)
fq = yield(strm)
fq
## class: ShortReadQ
## length: 100000 reads; width: 63 cycles
sread(fq)
## A DNAStringSet instance of length 100000
## width seq
## [1] 63 CATTGCTGATACCAANNNNNNNNGCATTC...GTCTTCCTCCTTCCCTTACGGAATTACA
## [2] 63 CCCTGGACTGCTTCTTGAAAAGTGCCATC...CTATCTTTGGGGAGAGTATGATAGAGAT
## [3] 63 TCGATCCATCGATTGGAAGGCACTGATCT...TCAGGTTGGTGGTCTTATTTGCAAGTCC
## [4] 63 GAAGAGTTAGCAGCGACCGTGACAGACCA...GCTCCCAACTCCAGGGTGCCAATCCGAT
## [5] 63 CGTGCAGGAGATCATGATCCCCGCGGGCA...GCCTGGTCATTGGCAAGGGCGGGGAGAC
## ... ... ...
## [99996] 63 GAGAGAAGCTTTGTATGGCTGTCATGCTT...TGATTCCTGCAACTTGACCTTCAGGCTG
## [99997] 63 TTATGGTGCAGACATGGCCAAGTCCAAGA...CCACACACAACCAGTCCCGAAAATGGCA
## [99998] 63 TTAAAGTAGAGCATCTAGTTTGAGAAATA...AATTATTAAAGATGTCTTTTTTCTACCC
## [99999] 63 TCCCAACTGTAGGCTGAGTGACCTGAAGG...AGACTGCCGAAGTCCAAAAGCTTCAGCA
## [100000] 63 GTGTTTTCTGGTATCGTCCCTTCGTGGTT...AAAAAATGGTACTGGAAAGGGGTCCCAA
quality(fq)
## class: FastqQuality
## quality:
## A BStringSet instance of length 100000
## width seq
## [1] 63 HJJJJJJJJJJJJJJ########00?GHI...JIJJJJJJJJJJJJJJJJJHHHFFFFFD
## [2] 63 HJJJJJJJJJJJJJJJJJIIJIGHIJJJJ...JJJJJJJJJJJJGHHIDHIJJHHHHHHF
## [3] 63 HJJJJJJJJJJJJJJJJJJJJJJJJJJJJ...GHIJJBGIJCGIAHIJHHHHHHHFFFFF
## [4] 63 HIJJJJIIJJJJJJJJJJJIJJJJJJJJJ...IHHHHHHFFFFEEEEDC@DDDDDDDDDD
## [5] 63 HIGGIIIIIIIGHIIIGIHIIIIJGIFAC...@@DDBDDCCDECCDDDB?BBBBBD@B;<
## ... ... ...
## [99996] 63 HJJJJJJJJJJJJGIJJJJJJGGIJJGHH...CHJJJGGHIJJJJJIJJJJJJJJIHHHH
## [99997] 63 HJJJIJHHIIJJJJIJJJJJIJIJJIJJI...HHFFFFDDDDDDDDCDDDDD@DDDDDDD
## [99998] 63 HJJJJJJHIJJJJJJJJJJJJJIJJJJJJ...JJJJJJJJJJJJJJJJJJJJJJJJJJIJ
## [99999] 63 HJJJJJJJJHIJJJJJJJGHIJJJJJJJJ...JJJJJJJJJJJJIJJJJHHHHFFFFFFF
## [100000] 63 HAEFHIJJJJJHIJJJJJJJJJJIHIJFH...IJJJJJIJHHHHHHFFFFFEDD>BDDDD
x = rnorm(1000)
y = x + rnorm(1000, sd=.5)
df = data.frame(x=x, y=y)
plot(y ~ x, df)
fit = lm(y ~ x, df)
class(fit)
## [1] "lm"
methods(class=class(fit))
## [1] add1 alias anova case.names
## [5] coerce confint cooks.distance deviance
## [9] dfbeta dfbetas drop1 dummy.coef
## [13] effects extractAIC family formula
## [17] hatvalues influence initialize kappa
## [21] labels logLik model.frame model.matrix
## [25] nobs plot predict print
## [29] proj qr residuals rstandard
## [33] rstudent show simulate slotsFromS3
## [37] summary variable.names vcov
## see '?methods' for accessing help and source code
methods("anova")
## [1] anova.glm* anova.glmlist* anova.lm* anova.lmlist*
## [5] anova.loess* anova.mlm* anova.nls*
## see '?methods' for accessing help and source code
Help!
?log
?plot # generic 'plot'
?plot.lm # plot for objects of class 'lm'
Extensive use of ‘S4’ classes
fit
(from lm()
) is an example of an S3 classsread(fq)
returned a DNAStringSet, an example of an S4 classlibrary(ShortRead)
strm = FastqStreamer("bigdata/SRR1039508_1.fastq.gz", 100000)
fq = yield(strm) # 'ShortReadQ' S4 class
class(fq) # introspection
## [1] "ShortReadQ"
## attr(,"package")
## [1] "ShortRead"
methods(class=class(fq))
## [1] [ [<- alphabetByCycle
## [4] alphabetScore append clean
## [7] coerce detail dustyScore
## [10] id length narrow
## [13] pairwiseAlignment qa renew
## [16] renewable reverse reverseComplement
## [19] show srdistance srduplicated
## [22] sread srorder srrank
## [25] srsort tables trimEnds
## [28] trimLRPatterns trimTails trimTailw
## [31] width writeFasta writeFastq
## see '?methods' for accessing help and source code
reads = sread(fq) # accessor -- get the reads
reads # 'DNAStringSet' S4 class
## A DNAStringSet instance of length 100000
## width seq
## [1] 63 CATTGCTGATACCAANNNNNNNNGCATTC...GTCTTCCTCCTTCCCTTACGGAATTACA
## [2] 63 CCCTGGACTGCTTCTTGAAAAGTGCCATC...CTATCTTTGGGGAGAGTATGATAGAGAT
## [3] 63 TCGATCCATCGATTGGAAGGCACTGATCT...TCAGGTTGGTGGTCTTATTTGCAAGTCC
## [4] 63 GAAGAGTTAGCAGCGACCGTGACAGACCA...GCTCCCAACTCCAGGGTGCCAATCCGAT
## [5] 63 CGTGCAGGAGATCATGATCCCCGCGGGCA...GCCTGGTCATTGGCAAGGGCGGGGAGAC
## ... ... ...
## [99996] 63 GAGAGAAGCTTTGTATGGCTGTCATGCTT...TGATTCCTGCAACTTGACCTTCAGGCTG
## [99997] 63 TTATGGTGCAGACATGGCCAAGTCCAAGA...CCACACACAACCAGTCCCGAAAATGGCA
## [99998] 63 TTAAAGTAGAGCATCTAGTTTGAGAAATA...AATTATTAAAGATGTCTTTTTTCTACCC
## [99999] 63 TCCCAACTGTAGGCTGAGTGACCTGAAGG...AGACTGCCGAAGTCCAAAAGCTTCAGCA
## [100000] 63 GTGTTTTCTGGTATCGTCCCTTCGTGGTT...AAAAAATGGTACTGGAAAGGGGTCCCAA
methods(class=class(reads))
## [1] ! !=
## [3] [ [[
## [5] [[<- [<-
## [7] %in% <
## [9] <= ==
## [11] > >=
## [13] $ $<-
## [15] aggregate alphabetFrequency
## [17] anyNA append
## [19] as.character as.complex
## [21] as.data.frame as.env
## [23] as.integer as.list
## [25] as.logical as.matrix
## [27] as.numeric as.raw
## [29] as.vector c
## [31] chartr clean
## [33] coerce compact
## [35] compare compareStrings
## [37] complement consensusMatrix
## [39] consensusString countOverlaps
## [41] countPattern countPDict
## [43] dinucleotideFrequencyTest do.call
## [45] droplevels duplicated
## [47] dustyScore elementLengths
## [49] elementMetadata elementMetadata<-
## [51] elementType endoapply
## [53] eval expand
## [55] extractAt extractROWS
## [57] Filter Find
## [59] findOverlaps hasOnlyBaseLetters
## [61] head high2low
## [63] ifelse intersect
## [65] is.na is.unsorted
## [67] isEmpty isMatchingEndingAt
## [69] isMatchingStartingAt lapply
## [71] length lengths
## [73] letterFrequency Map
## [75] match
## [ reached getOption("max.print") -- omitted 102 entries ]
## see '?methods' for accessing help and source code
gc = letterFrequency(reads, "GC", as.prob=TRUE)
hist(gc)
Help!
?DNAStringSet # class, and often frequently used methods
?letterFrequency # generic
methods("letterFrequency")
?"letterFrequency,XStringSet-method"
Key software packages…
import()
to import BED, WIG, GFF, GTF, …, files… and classes
assays()
rowRanges()
for annotations on rowscolData()
for column annotationsAnnotation
org.*
packagesTxDb.*
packagesBSgenome.*
packagesStrategies for working with big data
FastqStreamer()
, Rsamtools::BamFile(..., yieldSize=1000000)
; GenomicFiles::reduceByYield()
(see examples on ?reduceByYield
)All material on the course materials page