```{r style, echo = FALSE} BiocStyle::markdown() ``` Mapping proteins to their genomic coordinates ============================================= [Laurent Gatto](http://cpu.sysbiol.cam.ac.uk/) ```{r, env, message=FALSE, echo=FALSE, cache=FALSE, warning=FALSE} library("Gviz") library("Pbase") library("Biostrings") library("biomaRt") library("BSgenome.Hsapiens.UCSC.hg19") library("ggplot2") ``` ## Introduction The aim of this document is to document to mapping of proteins and the tandem mass spectrometry derived peptides they have been inferred from to genomic locations. ```{r schema, echo=FALSE, fig.cap='Mapping proteins to a genome reference.', fig.align='center'} par(mar = rep(0, 4), oma = rep(0, 4)) xlim <- c(0, 10) ylim <-c(5.05, 10.25) plot(0, type = "n", axes = FALSE, xlab = "", ylab = "", xlim = xlim, ylim = ylim) y <- 10 g <- c(0, 10) exons <- list(c(0, 2), c(4.5, 5), c(6, 7.1), c(9, 10)) trans <- list(c(1, 2, 3, 4), c(1, 3, 4), c(1, 4)) prot <- list(exons[[1]] + c(0.5, 0), ## 5'UTR exons[[3]], exons[[4]] + c(0, -0.5)) ## 3'UTR peps <- list(c(1, 1.29), c(6.1, 6.3), c(6.85, 7), c(9.1, 9.25)) rect(g[1], y-0.25, g[2], y+0.25, col = "black") abline(h = y, lwd = 2) text(g[1], y+0.25, expression(g[s]), pos = 3) text(g[2], y+0.25, expression(g[e]), pos = 3) text(0-0.26, y, "G", pos = 3) abline(v = c(unlist(exons), unlist(prot), unlist(peps)), lty = "dotted", col = "lightgrey") y <- y - 1 for (i in 1:length(trans)) { tr <- trans[[i]] abline(h = y, lty = "dotted") ## text(0-0.26, y, expression(T[x]), pos = 3) text(0-0.26, y, substitute(paste("T", list(x)), list(x = i)), pos = 3) for (j in 1:length(tr)) { e1 <- exons[[tr[j]]][1] e2 <- exons[[tr[j]]][2] rect(e1, y-0.25, e2, y+0.25, col = "grey") if (i == 1) { text(e1, y+0.25, expression(e[s]^i), pos = 3) text(e2, y+0.25, expression(e[e]^i), pos = 3) } text(mean(c(e1, e2)), y, paste0("i = ", tr[j]), cex = .7) } y <- y - .6 } y <- y - .4 abline(h = y, lty = "dotted") text(0-0.26, y, expression(P), pos = 3) for (i in 1:length(prot)) { pr <- prot[[i]] rect(pr[1], y - 0.25, pr[2], y + 0.25, col = "steelblue") text(pr[1], y + 0.25, expression(p[s]^j), pos = 3) text(pr[2], y + 0.25, expression(p[e]^j), pos = 3) text(mean(c(pr[1], pr[2])), y, paste0("j = ", i), cex = .7) } y <- y - .75 ## concat protein ## center x <- mean(xlim) protlen <- sum(sapply(prot, diff)) X0 <- x0 <- x - protlen/2 ## left start abline(h = y, lty = "dotted") text(0-0.26, y, expression(P), pos = 3) for (i in 1:length(prot)) { pr <- prot[[i]] .pr1 <- x0 .pr2 <- x0 + (pr[2] - pr[1]) rect(.pr1, y - 0.25, .pr2, y + 0.25, col = "steelblue") segments(.pr1, y + 0.25, pr[1], y + .75 - .25, lty = "dotted") segments(.pr2, y + 0.25, pr[2], y + .75 - .25, lty = "dotted") text(mean(c(.pr1,.pr2)), y, paste0("j = ", i), cex = .7) x0 <- .pr2 } text(X0, y, expression(1), pos = 2) text(X0 + protlen, y, expression(L[P]), pos = 4) ## pos and length relpeps <- list(c(0.5, 0.29), ## 1.% is length of exon 1 c(1.5 + 0.1, 0.2), c(1.5 + 0.85, 0.15), ## 1.1 is length of exon 2 c(1.5 + 1.1 + 0.1, 0.15)) for (i in 1:length(relpeps)) { rp <- relpeps[[i]] pep <- peps[[i]] rect(X0 + rp[1], y - 0.25, X0 + rp[1] + rp[2], y + 0.25, col = "#FFA50450", lwd = 0) segments(X0 + rp[1], y - 0.25, pep[1], y - .75 + .25, lty = "dotted") segments(X0 + rp[1] + rp[2], y - 0.25, pep[2], y - .75 + .25, lty = "dotted") } y <- y - .75 abline(h = y, lty = "dotted") text(0-0.26, y, expression(Pi), pos = 3) for (i in 1:length(peps)) { pep <- peps[[i]] rect(pep[1], y - 0.25, pep[2], y + 0.25, col = "#FFA504FF") text(pep[1], y + 0.25, expression(pi[s]^k), pos = 3, cex = .7) text(pep[2], y + 0.25, expression(pi[e]^k), pos = 3, cex = .7) text(mean(c(pep[1], pep[2])), y-0.35, paste0("k = ", i), cex = .8) } ``` ## Proteins and genome data ```{r, echo=FALSE} data(p) sp <- seqnames(p)[5] ``` We will use a small object from `Pbase` to illustrate how to retrieve genome coordinates and map a protein back to genomic coordinates. In the remainder of this vignette, we will concentrate on a spectific protein, namely `r sp`. ```{r, p, message=FALSE} library("Pbase") data(p) p seqnames(p) dat <- data.frame(i = 1:length(p), npeps = elementLengths(pfeatures(p)), protln = width(aa(p))) dat sp <- seqnames(p)[5] ``` ### Querying with `biomaRt` The input information consists of a UniProt identifier and a set of peptides positions along the protein. The first step is to map the protein accession number to a gene identifier (here, we will use Ensembl) and to obtain genome coordinates. Multiple solutiona are offered to use. Below, we use the `biomaRt` Bioconductor package. ```{r bm, cache=TRUE, message=FALSE} library("biomaRt") ens <- useMart("ensembl", "hsapiens_gene_ensembl") bm <- select(ens, keys = sp, keytype = "uniprot_swissprot_accession", columns = c( "ensembl_gene_id", "ensembl_transcript_id", "ensembl_peptide_id", "ensembl_exon_id", "description", "chromosome_name", "strand", "exon_chrom_start", "exon_chrom_end", "5_utr_start", "5_utr_end", "3_utr_start", "3_utr_end", "start_position", "end_position")) bm ``` We will only consider the gene located on the X chromosome and ignore the [sequence](http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/info/patches.shtml) [patch](http://www.ncbi.nlm.nih.gov/nuccore/NW_003871101.3?report=genbank) and conveniently store the transcript and protein identifiers for later use. ```{r} bm <- bm[bm$chromosome_name == "X", ] tr <- unique(bm$ensembl_transcript_id) pr <- unique(bm$ensembl_peptide_id) ``` The information retrieved gives us various information, in particular the Ensembl gene identifer `r bm$ensembl_gene_id[1]` and its starting and ending position `r bm$start_position[1]` and `r bm$end_position[1]` on chromosome `r bm$chromosome_name[1]`. ### Genome visualisation with `Gviz` While the above defines all necessary genomic coordinates, and as we will make use of the `Gviz` plotting infrastructure, it is more straightforward to fetch the above information via `biomaRt` using specialised `Gviz` infrastructure. ```{r gviz, cache=TRUE, message=FALSE} library("Gviz") options(ucscChromosomeNames=FALSE) bmTrack <- BiomartGeneRegionTrack(start = min(bm$start_position), end = max(bm$end_position), chromosome = "chrX", genome = "hg19") ## see also the filters param and getBM to ## filter on biotype == "protein_coding" bmTracks <- split(bmTrack, transcript(bmTrack)) grTrack <- bmTracks[[which(names(bmTracks) == tr)]] names(grTrack) <- tr ``` We can convince ourselves that the manual retrieved data in `bm` and the data in the `BiomartRegionTrack` subsequently subset into a `GeneRegionTrack` match. ```{r gvizdfr} as(grTrack, "data.frame") ``` Below, we create additional ideogram and axis tracks and produce a visualisation of our genomic coordinates of interest. ```{r gvizfig, fig.align='center', cache=TRUE} ideoTrack <- IdeogramTrack(genome = "hg19", chromosome = "chrX") axisTrack <- GenomeAxisTrack() seqlevels(ranges(grTrack)) <- chromosome(grTrack) <- chromosome(ideoTrack) plotTracks(list(ideoTrack, axisTrack, grTrack), add53 = TRUE, add35 = TRUE) ``` ### Using `TranscriptDb` instances Finally, on could also use `TranscriptDb` objects to create the `GeneRegionTrack`. Below, we use the UCSC annotation (which is directly available from Bioconductor; Ensembl TranscriptDb instances could be generated manually - **TODO** document this) to extract our regions of interest. ```{r, txdb, cache=TRUE, message=FALSE} library("TxDb.Hsapiens.UCSC.hg19.knownGene") txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene txTr <- GeneRegionTrack(txdb, chromosome = "chrX", start = bm$start_position[1], end = bm$end_position[1]) head(as(txTr, "data.frame")) ``` ## Peptide features ```{r, pp0, echo=FALSE} pp <- p[sp] n <- elementLengths(pranges(pp)) pepstring <- paste(unique(pcols(pp)[[1]]$pepseq), collapse = ", ") ``` The peptides that have been experimentally observed are available as ranges (coordinates) along the protein sequences. For example, below, we see `r n` peptides (`r pepstring`) have been identified for our protein of interest `r sp`. ```{r, pepsfig, fig.align='center'} pp <- p[sp] pranges(pp) plot(pp) ``` The aim of this document is to document the mapping of peptides, i.e. ranges along a protein sequence to ranges along the genome reference. In other words, our aim is the convert protein coordinates to genome coordinates. ## Mapping peptides back to the genome Additional features of interest (in our case peptides) can be added to the genomic visualisation using dedicated annotation tracks, that can be constructed as shown below. ### Comparing protein and translated DNA sequences The first check that we want to implement is to verify that we can regenerate the protein amino acid sequence from the genome regions that we have extracted. We start by subsetting only the actual protein coding regions, i.e ignoring the 5' and 3' untranslated regions of our Genome Region track. ```{r, protranges} (prng <- ranges(grTrack[feature(grTrack) == "protein_coding",])) ``` We also need the actual genome sequence (so far, we have only dealt with regions and features). There are multiple ways to obtain genome sequences with R and Bioconductor. As we have been working with the Ensembl reference genome, we could simply [download](ftp://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/) the whole genome of only the chromosome X and load it as an `AAStringSet` and extract the protein coding regions of interest at the coordinates defined by the ranges in `prng`. ```{r, extractat, eval=FALSE} library("Biostrings") chrx <- readDNAStringSet("./Homo_sapiens.GRCh37.75.dna.chromosome.X.fa.gz") seq <- extractAt(chrx[[1]], ranges(prng)) pptr <- translate(unlist(seq)) ``` It is also possible to use readily package genomes to do this. Above, we have use the `TxDb.Hsapiens.UCSC.hg19.knownGene` package defining transcripts. There is a similar package for the human UCSC genome build, namely `BSgenome.Hsapiens.UCSC.hg19`. However, as we have focused on Ensembl data annotation, we will first need to convert our Ensembl transcript (or protein) identifier `r tr` (`r pr`) into the UCSC equivalent. (**TODO** Use also `Homo.sapiens` annotation package.) ```{r, protfromgenome} ucsc <- select(ens, key = pr, keytype = "ensembl_peptide_id", columns = "ucsc") ucsc library("BSgenome.Hsapiens.UCSC.hg19") prng2 <- ranges(txTr[transcript(txTr) %in% ucsc]) prng2 <- prng2[prng2$feature == "CDS"] seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19, prng2) pptr <- translate(unlist(seq)) ## remove stop codon pptr <- pptr[1:417] ``` ```{r, align} writePairwiseAlignments(pairwiseAlignment(pp[[1]], pptr)) ``` ### Calculating new coordinates To calculate the genomic coordinates of peptides we 1) Calculate the contiguous exon coordinates on the cDNA sequence `prex`. 2) Calculate the peptide corrdinates on the cDNA sequence `peprgn2` from the original peptide ranges along the protein sequence. 3) We calculate the indices of the exons in which all the peptides start and end on the cDNA sequence `start_ex` and `end_ex`. 4) We infer the coordinates on the genome from the actual peptide starting/ending position and exon index on the cDNA sequence, the exons coordinates on the cDNA sequences and the matching exons coordinates on the genome and store these in an `IRanges` object. ```{r, mapping} ## exons ranges along the protein j <- cumsum(width(seq)) i <- cumsum(c(1, width(seq)))[1:length(j)] prex <- IRanges(start = i, end = j) peprng <- pranges(pp)[[1]] ## peptide positions on cdna peprng2 <- IRanges(start = 1 + (start(peprng)-1) * 3, width = width(peprng) * 3) ## find exon and position in prex start_ex <- subjectHits(findOverlaps(start(peprng2), prex)) end_ex <- subjectHits(findOverlaps(end(peprng2), prex)) getPos <- function(p, i, prtex = prex, nclex = prng2) { ## position in cdna ## exon index start(prng2[i]) + (p - start(prtex[i])) } peptides_on_genome <- IRanges(start = getPos(start(peprng2), start_ex), end = getPos(end(peprng2), end_ex)) ``` ### Plotting Based on the new peptide genomic coordinates, it is now straightforward to create a new `AnnotationTrack` and add it the the track visualisation. ```{r, pepcoords, fig.align='center'} pepTr <- AnnotationTrack(start = start(peptides_on_genome), end = end(peptides_on_genome), chr = "chrX", genome = "hg19", strand = "*", id = pcols(pp)[[1]]$pepseq, name = "pfeatures", col = "steelblue") plotTracks(list(ideoTrack, axisTrack, grTrack, pepTr), groupAnnotation = "id", just.group = "below", fontsize.group = 9, add53 = TRUE, add35 = TRUE) ``` ### DetailsAnnotationTrack Finally, we customise the figure by adding a track with the $MS^2$ spectra. The raw data used to search the protein database an create `p` are available as an `MSnExp` object. ```{r, msmsspectra, fig.align='center'} data(pms) library("ggplot2") details <- function(identifier, ...) { p <- plot(pms[[as.numeric(identifier)]], full=TRUE, plot=FALSE) + ggtitle("") p <- p + theme_bw() + theme(axis.text.y = element_blank(), axis.text.x = element_blank()) + labs(x = NULL, y = NULL) print(p, newpage=FALSE) } deTrack <- AnnotationTrack(start = start(peptides_on_genome), end = end(peptides_on_genome), genome = "hg19", chromosom = "chrX", id = pcols(pp)[[1]]$acquisitionnum, name = "MS2 spectra", stacking = "squish", fun = details) plotTracks(list(ideoTrack, axisTrack, deTrack, grTrack), add53 = TRUE, add35 = TRUE) ``` **TODO** Check spectra. Describe how data tracks can be used to overlay additional information, such as quantitation data, identification scores, coverage, ... ## Session information ```{r, si} sessionInfo() ```