1 R data manipulation

This case study servers as a refresher / tutorial on basic input and manipulation of data.

Input a file that contains ALL (acute lymphoblastic leukemia) patient information

fname <- file.choose()   ## "ALLphenoData.tsv"
stopifnot(file.exists(fname))
pdata <- read.delim(fname)

Check out the help page ?read.delim for input options, and explore basic properties of the object you’ve created, for instance…

class(pdata)

## [1] "data.frame"

colnames(pdata)

##  [1] "id"             "diagnosis"      "sex"            "age"            "BT"            
##  [6] "remission"      "CR"             "date.cr"        "t.4.11."        "t.9.22."       
## [11] "cyto.normal"    "citog"          "mol.biol"       "fusion.protein" "mdr"           
## [16] "kinet"          "ccr"            "relapse"        "transplant"     "f.u"           
## [21] "date.last.seen"

dim(pdata)

## [1] 127  21

head(pdata)

##     id diagnosis sex age BT remission CR   date.cr t.4.11. t.9.22. cyto.normal        citog
## 1 1005 5/21/1997   M  53 B2        CR CR  8/6/1997   FALSE    TRUE       FALSE      t(9;22)
## 2 1010 3/29/2000   M  19 B2        CR CR 6/27/2000   FALSE   FALSE       FALSE  simple alt.
## 3 3002 6/24/1998   F  52 B4        CR CR 8/17/1998      NA      NA          NA         <NA>
## 4 4006 7/17/1997   M  38 B1        CR CR  9/8/1997    TRUE   FALSE       FALSE      t(4;11)
## 5 4007 7/22/1997   M  57 B2        CR CR 9/17/1997   FALSE   FALSE       FALSE      del(6q)
## 6 4008 7/30/1997   M  17 B1        CR CR 9/27/1997   FALSE   FALSE       FALSE complex alt.
##   mol.biol fusion.protein mdr   kinet   ccr relapse transplant               f.u date.last.seen
## 1  BCR/ABL           p210 NEG dyploid FALSE   FALSE       TRUE BMT / DEATH IN CR           <NA>
## 2      NEG           <NA> POS dyploid FALSE    TRUE      FALSE               REL      8/28/2000
## 3  BCR/ABL           p190 NEG dyploid FALSE    TRUE      FALSE               REL     10/15/1999
## 4 ALL1/AF4           <NA> NEG dyploid FALSE    TRUE      FALSE               REL      1/23/1998
## 5      NEG           <NA> NEG dyploid FALSE    TRUE      FALSE               REL      11/4/1997
## 6      NEG           <NA> NEG hyperd. FALSE    TRUE      FALSE               REL     12/15/1997

summary(pdata$sex)

##    F    M NA's 
##   42   83    2

summary(pdata$cyto.normal)

##    Mode   FALSE    TRUE    NA's 
## logical      69      24      34

Remind yourselves about various ways to subset and access columns of a data.frame

pdata[1:5, 3:4]

##   sex age
## 1   M  53
## 2   M  19
## 3   F  52
## 4   M  38
## 5   M  57

pdata[1:5, ]

##     id diagnosis sex age BT remission CR   date.cr t.4.11. t.9.22. cyto.normal       citog mol.biol
## 1 1005 5/21/1997   M  53 B2        CR CR  8/6/1997   FALSE    TRUE       FALSE     t(9;22)  BCR/ABL
## 2 1010 3/29/2000   M  19 B2        CR CR 6/27/2000   FALSE   FALSE       FALSE simple alt.      NEG
## 3 3002 6/24/1998   F  52 B4        CR CR 8/17/1998      NA      NA          NA        <NA>  BCR/ABL
## 4 4006 7/17/1997   M  38 B1        CR CR  9/8/1997    TRUE   FALSE       FALSE     t(4;11) ALL1/AF4
## 5 4007 7/22/1997   M  57 B2        CR CR 9/17/1997   FALSE   FALSE       FALSE     del(6q)      NEG
##   fusion.protein mdr   kinet   ccr relapse transplant               f.u date.last.seen
## 1           p210 NEG dyploid FALSE   FALSE       TRUE BMT / DEATH IN CR           <NA>
## 2           <NA> POS dyploid FALSE    TRUE      FALSE               REL      8/28/2000
## 3           p190 NEG dyploid FALSE    TRUE      FALSE               REL     10/15/1999
## 4           <NA> NEG dyploid FALSE    TRUE      FALSE               REL      1/23/1998
## 5           <NA> NEG dyploid FALSE    TRUE      FALSE               REL      11/4/1997

head(pdata[, 3:5])

##   sex age BT
## 1   M  53 B2
## 2   M  19 B2
## 3   F  52 B4
## 4   M  38 B1
## 5   M  57 B2
## 6   M  17 B1

tail(pdata[, 3:5], 3)

##     sex age BT
## 125   M  19 T2
## 126   M  30 T3
## 127   M  29 T2

head(pdata$age)

## [1] 53 19 52 38 57 17

head(pdata$sex)

## [1] M M F M M M
## Levels: F M

head(pdata[pdata$age > 21,])

##      id diagnosis sex age BT remission CR   date.cr t.4.11. t.9.22. cyto.normal        citog
## 1  1005 5/21/1997   M  53 B2        CR CR  8/6/1997   FALSE    TRUE       FALSE      t(9;22)
## 3  3002 6/24/1998   F  52 B4        CR CR 8/17/1998      NA      NA          NA         <NA>
## 4  4006 7/17/1997   M  38 B1        CR CR  9/8/1997    TRUE   FALSE       FALSE      t(4;11)
## 5  4007 7/22/1997   M  57 B2        CR CR 9/17/1997   FALSE   FALSE       FALSE      del(6q)
## 10 8001 1/15/1997   M  40 B2        CR CR 3/26/1997   FALSE   FALSE       FALSE     del(p15)
## 11 8011 8/21/1998   M  33 B3        CR CR 10/8/1998   FALSE   FALSE       FALSE del(p15/p16)
##    mol.biol fusion.protein mdr   kinet   ccr relapse transplant               f.u date.last.seen
## 1   BCR/ABL           p210 NEG dyploid FALSE   FALSE       TRUE BMT / DEATH IN CR           <NA>
## 3   BCR/ABL           p190 NEG dyploid FALSE    TRUE      FALSE               REL     10/15/1999
## 4  ALL1/AF4           <NA> NEG dyploid FALSE    TRUE      FALSE               REL      1/23/1998
## 5       NEG           <NA> NEG dyploid FALSE    TRUE      FALSE               REL      11/4/1997
## 10  BCR/ABL           p190 NEG    <NA> FALSE    TRUE      FALSE               REL      7/11/1997
## 11  BCR/ABL      p190/p210 NEG dyploid FALSE   FALSE       TRUE BMT / DEATH IN CR           <NA>

It seems from below that there are 17 females over 40 in the data set, but when sub-setting pdata to contain just those individuals 19 rows are selected. Why? What can we do to correct this?

idx <- pdata$sex == "F" & pdata$age > 40
table(idx)

## idx
## FALSE  TRUE 
##   108    17

dim(pdata[idx,])

## [1] 19 21

Use the mol.biol column to subset the data to contain just individuals with ‘BCR/ABL’ or ‘NEG’, e.g.,

bcrabl <- pdata[pdata$mol.biol %in% c("BCR/ABL", "NEG"),]

The mol.biol column is a factor, and retains all levels even after subsetting. How might you drop the unused factor levels?

bcrabl$mol.biol <- factor(bcrabl$mol.biol)

The BT column is a factor describing B- and T-cell subtypes

levels(bcrabl$BT)

##  [1] "B"  "B1" "B2" "B3" "B4" "T"  "T1" "T2" "T3" "T4"

How might one collapse B1, B2, … to a single type B, and likewise for T1, T2, …, so there are only two subtypes, B and T

table(bcrabl$BT)

## 
##  B B1 B2 B3 B4  T T1 T2 T3 T4 
##  4  9 35 22  9  4  1 15  9  2

levels(bcrabl$BT) <- substring(levels(bcrabl$BT), 1, 1)
table(bcrabl$BT)

## 
##  B  T 
## 79 31

Use xtabs() (cross-tabulation) to count the number of samples with B- and T-cell types in each of the BCR/ABL and NEG groups

xtabs(~ BT + mol.biol, bcrabl)

##    mol.biol
## BT  BCR/ABL NEG
##   B      37  42
##   T       0  31

Use aggregate() to calculate the average age of males and females in the BCR/ABL and NEG treatment groups.

aggregate(age ~ mol.biol + sex, bcrabl, mean)

##   mol.biol sex      age
## 1  BCR/ABL   F 39.93750
## 2      NEG   F 30.42105
## 3  BCR/ABL   M 40.50000
## 4      NEG   M 27.21154

Use t.test() to compare the age of individuals in the BCR/ABL versus NEG groups; visualize the results using boxplot(). In both cases, use the formula interface. Consult the help page ?t.test and re-do the test assuming that variance of ages in the two groups is identical. What parts of the test output change?

t.test(age ~ mol.biol, bcrabl)

## 
##  Welch Two Sample t-test
## 
## data:  age by mol.biol
## t = 4.8172, df = 68.529, p-value = 8.401e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   7.13507 17.22408
## sample estimates:
## mean in group BCR/ABL     mean in group NEG 
##              40.25000              28.07042

boxplot(age ~ mol.biol, bcrabl)

2 Short read quality assessment

Option 1: fastqc

Start fastqc
Select fastq.gz files from the File –> Open menu. Files are in /mnt/nfs/practicals/day1/martin_morgan/
Press OK
Study plots and the Help -> Contents menu

Option 2: ShortRead

## 1. attach ShortRead and BiocParallel
library(ShortRead)

## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport, clusterMap,
##     parApply, parCapply, parLapply, parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind, colnames, do.call,
##     duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect, is.unsorted,
##     lapply, lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort, table,
##     tapply, union, unique, unlist, unsplit
## 
## Loading required package: BiocParallel
## Loading required package: Biostrings
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: Rsamtools
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: GenomicAlignments
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with 'browseVignettes()'. To cite
##     Bioconductor, see 'citation("Biobase")', and for packages 'citation("pkgname")'.

library(BiocParallel)

## 2. create a vector of file paths
## replace 'bigdata' with '/mnt/nfs/practicals/day1/martin_morgan/'
fls <- dir("bigdata", pattern="*fastq.gz", full=TRUE)
stopifnot(all(file.exists(fls)))

## 3. collect statistics
stats <- qa(fls)

## 4. generate and browse the report
browseURL(report(stats))

Check out the qa report from all lanes

## replace 'bigdata' with '/mnt/nfs/practicals/day1/martin_morgan/'
load("bigdata/qa_all.Rda")
browseURL(report(qa_all))

3 Annotation

org packages

symbol mapping

library(airway)
data(airway)
library(org.Hs.eg.db)

## Loading required package: AnnotationDbi
## Loading required package: DBI

ensid <- head(rownames(airway))
mapIds(org.Hs.eg.db, ensid, "SYMBOL", "ENSEMBL")

## ENSG00000000003 ENSG00000000005 ENSG00000000419 ENSG00000000457 ENSG00000000460 ENSG00000000938 
##        "TSPAN6"          "TNMD"          "DPM1"         "SCYL3"      "C1orf112"           "FGR"

keytypes(org.Hs.eg.db)

##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "ENTREZID"    
##  [7] "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"       
## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"     "ONTOLOGYALL"  "PATH"        
## [19] "PFAM"         "PMID"         "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
## [25] "UNIGENE"      "UNIPROT"

TxDb packages

known gene models, as GRanges / GRangesList

Easy to make your own, from GFF files GenomicFeatures::makeTxDbFromGFF() & friends

library(TxDb.Hsapiens.UCSC.hg19.knownGene)

## Loading required package: GenomicFeatures

txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
exons(txdb)

## GRanges object with 289969 ranges and 1 metadata column:
##                  seqnames         ranges strand   |   exon_id
##                     <Rle>      <IRanges>  <Rle>   | <integer>
##        [1]           chr1 [11874, 12227]      +   |         1
##        [2]           chr1 [12595, 12721]      +   |         2
##        [3]           chr1 [12613, 12721]      +   |         3
##        [4]           chr1 [12646, 12697]      +   |         4
##        [5]           chr1 [13221, 14409]      +   |         5
##        ...            ...            ...    ... ...       ...
##   [289965] chrUn_gl000241 [35706, 35859]      -   |    289965
##   [289966] chrUn_gl000241 [36711, 36875]      -   |    289966
##   [289967] chrUn_gl000243 [11501, 11530]      +   |    289967
##   [289968] chrUn_gl000243 [13608, 13637]      +   |    289968
##   [289969] chrUn_gl000247 [ 5787,  5816]      -   |    289969
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome

exonsBy(txdb, "tx")

## GRangesList object of length 82960:
## $1 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand |   exon_id   exon_name exon_rank
##          <Rle>      <IRanges>  <Rle> | <integer> <character> <integer>
##   [1]     chr1 [11874, 12227]      + |         1        <NA>         1
##   [2]     chr1 [12613, 12721]      + |         3        <NA>         2
##   [3]     chr1 [13221, 14409]      + |         5        <NA>         3
## 
## $2 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [11874, 12227]      + |       1      <NA>         1
##   [2]     chr1 [12595, 12721]      + |       2      <NA>         2
##   [3]     chr1 [13403, 14409]      + |       6      <NA>         3
## 
## $3 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [11874, 12227]      + |       1      <NA>         1
##   [2]     chr1 [12646, 12697]      + |       4      <NA>         2
##   [3]     chr1 [13221, 14409]      + |       5      <NA>         3
## 
## ...
## <82957 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome

p <- promoters(txdb)

BSgenome

Whole-genome sequences

Possible to make your own, or use other formats – Rsamtools::FaFile(), tracklayer::TwoBitFile()

library(BSgenome.Hsapiens.UCSC.hg19)

## Loading required package: BSgenome
## Loading required package: rtracklayer

bsgenome <- BSgenome.Hsapiens.UCSC.hg19
ps <- getSeq(bsgenome, p)
ps

##   A DNAStringSet instance of length 82960
##         width seq
##     [1]  2200 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...GGCCTGTGCCAGGGTGCAAGCTGAGCACTGGAGTGGAGTTT
##     [2]  2200 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...GGCCTGTGCCAGGGTGCAAGCTGAGCACTGGAGTGGAGTTT
##     [3]  2200 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...GGCCTGTGCCAGGGTGCAAGCTGAGCACTGGAGTGGAGTTT
##     [4]  2200 TTAAGGTCTATTCTAAATTGCACACTTTGATTCAAAAGAAAC...TTCCTGCTAGCCAACCTCTCACTCATTGATCTGTCTCTGTC
##     [5]  2200 ATTGTGAAGGATACATCTCAGAAACAGTCAATGAAAGAGACG...CTCCAGGCTCTGAACTTTCTCAGTAAGTTCAGGTAGCTGGG
##     ...   ... ...
## [82956]  2200 AAATGCAAAATTAGCTGGGCGTCGTGGCGCATGCCTGTAATC...AGGGTGGCCTGAGCAGTAGGATTGGGGCTGGAGCAGTAAGA
## [82957]  2200 AAACTGACTGTGACAATAGAAGGGAGAGACATGAATTTATCT...GAAGAGGTGGGTCCTGCAGCTGTGGCGGGAGCCTCCTCAGT
## [82958]  2200 CAAGGGCCTCACTGATGAGGACATGCCCACCAGGGTCTGCTG...CTGGTGTCCCTGAGACAGCACTAACAGGTCCATGGCTGGGT
## [82959]  2200 TGACTGTCGTAAGAGCTTCCTTGTATATGAGGATGATGTCCA...GGCTGCTTTCTGCACTTCAAAATAAAGGCCTCCTGAAGATG
## [82960]  2200 AAGGGCCTCACTGATGAGGACACGCCCACCAGGGTCTGCTGA...CTTGTGTCCCTGAGACAGCACTAACAGGTCCATGGCTGGGT

hist(letterFrequency(ps, "GC", as.prob=TRUE))

AnnotationHub

easily accessible genome-scale resources

Example: Ensembl ‘GTF’ files to R / Bioconductor GRanges and TxDb

library(AnnotationHub)
hub <- AnnotationHub()
hub
query(hub, c("Ensembl", "80", "gtf"))
## ensgtf = display(hub)                   # visual choice
hub["AH47107"]
gtf <- hub[["AH47107"]]
gtf
txdb <- GenomicFeatures::makeTxDbFromGRanges(gtf)

Example: non-model organism OrgDb packages

library(AnnotationHub)
hub <- AnnotationHub()
query(hub, "OrgDb")

Example: Map Roadmap epigenomic marks to hg38

Roadmap BED file as GRanges

library(AnnotationHub)
hub <- AnnotationHub()
query(hub , c("EpigenomeRoadMap", "E126", "H3K4ME2"))
E126 <- hub[["AH29817"]]

UCSC ‘liftOver’ file to map coordinates

query(hub , c("hg19", "hg38", "chainfile"))
chain <- hub[["AH14150"]]

lift over – possibly one-to-many mapping, so GRanges to GRangesList
```
library(rtracklayer)
E126hg38 <- liftOver(E126, chain)
E126hg38
```

4 Alignments

Integrative Genomics Viewer

Create an ‘igv’ directory (if it does not already exist) and add the file hg19_alias.tab to it. This is a simple tab-delimited file that maps between the sequence names used by the alignment, and the sequence names known to IGV.
Start igv.
Choose hg19 from the drop-down menu at the top left of the screen
Use File -> Load from File menu to load a bam file, e.g., /mnt/nfs/practicals/day1/martin_morgan/SRR1039508_sorted.bam
Zoom in to a particular gene, e.g., SPARCL1, by entering the gene symbol in the box toward the center of the browser window. Adjust the zoom until reads come in to view, and interpret the result.

mkdir -p ~/igv/genomes
cp bigdata/hg19_alias.tab ~/igv/genomes/
igv

Bioconductor: we’ll explore how to map between different types of identifiers, how to navigate genomic coordinates, and how to query BAM files for aligned reads.

Attach ‘Annotation’ packages containing information about gene symbols org.Hs.eg.db and genomic coordinates (e.g., genes, exons, cds, transcripts) r Biocannopkg(TxDb.Hsapiens.UCSC.hg19.knownGene). Arrange for the ‘seqlevels’ (chromosome names) in the TxDb package to match those in the BAM files.
Use the org.* package to map from gene symbol to Entrez gene id, and the TxDb.* package to retrieve gene coordinates of the SPARCL1 gene. N.B. – The following uses a single gene symbol, but we could have used 1, 2, or all gene symbols in a vectorized fashion.
Attach the GenomicAlignments package for working with aligned reads. Use range() to get the genomic coordinates spanning the first and last exon of SPARCL1. Input paired reads overlapping SPARCL1.

What questions can you easily answer about these alignments? E.g., how many reads overlap this region of interest?

## 1.a 'Annotation' packages
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(org.Hs.eg.db)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene

## 1.b -- map 'seqlevels' as recorded in the TxDb file to those in the
## BAM file
fl <- "~/igv/genomes/hg19_alias.tab"
map <- with(read.delim(fl, header=FALSE, stringsAsFactors=FALSE),
    setNames(V1, V2))
seqlevels(txdb, force=TRUE) <- map

## 2. Symbol -> Entrez ID -> Gene coordinates
sym2eg <- mapIds(org.Hs.eg.db, "SPARCL1", "ENTREZID", "SYMBOL")
exByGn <- exonsBy(txdb, "gene")
sparcl1exons <- exByGn[[sym2eg]]

## 3. Aligned reads
library(GenomicAlignments)
## replace 'bigdata' with '/mnt/nfs/practicals/day1/martin_morgan/'
fl <- "bigdata/SRR1039508_sorted.bam"
sparcl1gene <- range(sparcl1exons)
param <- ScanBamParam(which=sparcl1gene)
aln <- readGAlignmentPairs(fl, param=param)

As another exercise we ask how many of the reads we’ve input are compatible with the known gene model. We have to find the transcripts that belong to our gene, and then exons grouped by transcript.

## 5.a. exons-by-transcript for our gene of interest
txids <- select(txdb, sym2eg, "TXID", "GENEID")$TXID

## 'select()' returned 1:many mapping between keys and columns

exByTx <- exonsBy(txdb, "tx")[txids]

## 5.b compatible alignments
hits <- findCompatibleOverlaps(query=aln, subject=exByTx)
good <- seq_along(aln) %in% queryHits(hits)
table(good)

## good
## FALSE  TRUE 
##    14    55

Finally, let’s go from gene model to protein coding sequence. (a) Extract CDS regions grouped by transcript, select just transcripts we’re interested in, (b) attach and then extract the coding sequence from the appropriate reference genome. Translating the coding sequences to proteins.

## reset seqlevels
restoreSeqlevels(txdb)

## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: UCSC
## # Genome: hg19
## # Organism: Homo sapiens
## # Taxonomy ID: 9606
## # UCSC Table: knownGene
## # Resource URL: http://genome.ucsc.edu/
## # Type of Gene ID: Entrez Gene ID
## # Full dataset: yes
## # miRBase build ID: GRCh37
## # transcript_nrow: 82960
## # exon_nrow: 289969
## # cds_nrow: 237533
## # Db created by: GenomicFeatures package from Bioconductor
## # Creation time: 2015-10-07 18:11:28 +0000 (Wed, 07 Oct 2015)
## # GenomicFeatures version at creation time: 1.21.30
## # RSQLite version at creation time: 1.0.0
## # DBSCHEMAVERSION: 1.1

## a. cds coordinates, grouped by transcript
txids <- mapIds(txdb, sym2eg, "TXID", "GENEID")
cdsByTx <- cdsBy(txdb, "tx")[txids]

## b. coding sequence from relevant reference genome
library(BSgenome.Hsapiens.UCSC.hg19)
dna <- extractTranscriptSeqs(BSgenome.Hsapiens.UCSC.hg19, cdsByTx)
protein <- translate(dna)

Case Studies

Martin Morgan (martin.morgan@roswellpark.org)
Roswell Park Cancer Institute, Buffalo, NY
19 October, 2015

Contents

1 R data manipulation

2 Short read quality assessment

3 Annotation

4 Alignments

5 biomaRt annotations

Case Studies

Martin Morgan (martin.morgan@roswellpark.org) Roswell Park Cancer Institute, Buffalo, NY 19 October, 2015

Contents

1 R data manipulation

2 Short read quality assessment

3 Annotation

4 Alignments

5 biomaRt annotations

Martin Morgan (martin.morgan@roswellpark.org)
Roswell Park Cancer Institute, Buffalo, NY
19 October, 2015