## ---- echo=FALSE, results="hide", warning=FALSE-------------------------- suppressPackageStartupMessages({ library(ChIPpeakAnno) library(rtracklayer) library(EnsDb.Hsapiens.v75) library(TxDb.Hsapiens.UCSC.hg19.knownGene) library(org.Hs.eg.db) library(reactome.db) library(BSgenome.Hsapiens.UCSC.hg19) }) knitr::opts_chunk$set(warning=FALSE, message=FALSE) i <- 0 generateFigureCaption <- function(cap){ i <<- i+1 return(paste0("Figure ", i, ". ", cap)) } ## ----workflow1, fig.cap=generateFigureCaption("Venn diagram of overlaps for replicated experiments"), fig.width=6, fig.height=6---- library(ChIPpeakAnno) bed <- system.file("extdata", "MACS_output.bed", package="ChIPpeakAnno") gr1 <- toGRanges(bed, format="BED", header=FALSE) ## one can also try import from rtracklayer gff <- system.file("extdata", "GFF_peaks.gff", package="ChIPpeakAnno") gr2 <- toGRanges(gff, format="GFF", header=FALSE, skip=3) ## must keep the class exactly same as gr1$score, i.e., numeric. gr2$score <- as.numeric(gr2$score) ol <- findOverlapsOfPeaks(gr1, gr2) ## add metadata (mean of score) to the overlapping peaks ol <- addMetadata(ol, colNames="score", FUN=mean) ol$peaklist[["gr1///gr2"]][1:2] makeVennDiagram(ol) ## ----annoData------------------------------------------------------------ library(EnsDb.Hsapiens.v75) ##(hg19) ## create annotation file from EnsDb or TxDb annoData <- toGRanges(EnsDb.Hsapiens.v75, feature="gene") annoData[1:2] ## ----workflow2,fig.cap=generateFigureCaption("Distribution of peaks around transcript start sites."),fig.width=8,fig.height=6---- overlaps <- ol$peaklist[["gr1///gr2"]] binOverFeature(overlaps, annotationData=annoData, radius=5000, nbins=20, FUN=length, errFun=0, ylab="count", main="Distribution of aggregated peak numbers around TSS") ## ----workflow4,fig.cap=generateFigureCaption("Peak distribution over different genomic features."),fig.width=10,fig.height=4---- library(TxDb.Hsapiens.UCSC.hg19.knownGene) aCR<-assignChromosomeRegion(overlaps, nucleotideLevel=FALSE, precedence=c("Promoters", "immediateDownstream", "fiveUTRs", "threeUTRs", "Exons", "Introns"), TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene) barplot(aCR$percentage, las=3) ## ----workflow3----------------------------------------------------------- overlaps.anno <- annotatePeakInBatch(overlaps, AnnotationData=annoData, output="nearestBiDirectionalPromoters", bindingRegion=c(-2000, 500)) library(org.Hs.eg.db) overlaps.anno <- addGeneIDs(overlaps.anno, "org.Hs.eg.db", IDs2Add = "entrez_id") head(overlaps.anno) write.csv(as.data.frame(unname(overlaps.anno)), "anno.csv") ## ----workflow20,fig.cap=generateFigureCaption("Pie chart of the distribution of common peaks around features.")---- pie1(table(overlaps.anno$insideFeature)) ## ----enrichment---------------------------------------------------------- over <- getEnrichedGO(overlaps.anno, orgAnn="org.Hs.eg.db", maxP=.05, minGOterm=10, multiAdjMethod="BH", condense=TRUE) head(over[["bp"]][, -c(3, 10)]) library(reactome.db) path <- getEnrichedPATH(overlaps.anno, "org.Hs.eg.db", "reactome.db", maxP=.05) head(path) ## ----fasta--------------------------------------------------------------- library(BSgenome.Hsapiens.UCSC.hg19) seq <- getAllPeakSequence(overlaps, upstream=20, downstream=20, genome=Hsapiens) write2FASTA(seq, "test.fa") ## ----consensus,fig.cap=generateFigureCaption("Histogram of Z-score of 6-mer"),fig.height=6,fig.width=6---- ## summary of the short oligos freqs <- oligoFrequency(Hsapiens$chr1, MarkovOrder=3) os <- oligoSummary(seq, oligoLength=6, MarkovOrder=3, quickMotif=FALSE, freqs=freqs) ## plot the results zscore <- sort(os$zscore) h <- hist(zscore, breaks=100, xlim=c(-50, 50), main="Histogram of Z-score") text(zscore[length(zscore)], max(h$counts)/10, labels=names(zscore[length(zscore)]), adj=1) ## ----simulation, fig.cap=generateFigureCaption("Histogram of Z-score of simulation data"), fig.width=6, fig.height=6---- ## We can also try simulation data seq.sim.motif <- list(c("t", "g", "c", "a", "t", "g"), c("g", "c", "a", "t", "g", "c")) set.seed(1) seq.sim <- sapply(sample(c(2, 1, 0), 1000, replace=TRUE, prob=c(0.07, 0.1, 0.83)), function(x){ s <- sample(c("a", "c", "g", "t"), sample(100:1000, 1), replace=TRUE) if(x>0){ si <- sample.int(length(s), 1) if(si>length(s)-6) si <- length(s)-6 s[si:(si+5)] <- seq.sim.motif[[x]] } paste(s, collapse="") }) os <- oligoSummary(seq.sim, oligoLength=6, MarkovOrder=3, quickMotif=TRUE) zscore <- sort(os$zscore, decreasing=TRUE) h <- hist(zscore, breaks=100, main="Histogram of Z-score") text(zscore[1:2], rep(5, 2), labels=names(zscore[1:2]), adj=0, srt=90) ## ----simulation.motif, fig.cap=generateFigureCaption("Motif of simulation data"), fig.width=6, fig.height=6---- ## generate the motifs library(motifStack) pfms <- mapply(function(.ele, id) new("pfm", mat=.ele, name=paste("SAMPLE motif", id)), os$motifs, 1:length(os$motifs)) motifStack(pfms[[1]]) ## ----peaksNearBDP16------------------------------------------------------ bdp <- peaksNearBDP(overlaps, annoData, maxgap=5000) c(bdp$percentPeaksWithBDP, bdp$n.peaks, bdp$n.peaksWithBDP) bdp$peaksWithBDP[1:2] ## ----findEnhancers------------------------------------------------------- DNA5C <- system.file("extdata", "wgEncodeUmassDekker5CGm12878PkV2.bed.gz", package="ChIPpeakAnno") DNAinteractiveData <- toGRanges(gzfile(DNA5C)) findEnhancers(overlaps, annoData, DNAinteractiveData) ## ----importData---------------------------------------------------------- path <- system.file("extdata", package="ChIPpeakAnno") files <- dir(path, "broadPeak") data <- sapply(file.path(path, files), toGRanges, format="broadPeak") names(data) <- gsub(".broadPeak", "", files) ## ----vennDiagram, fig.cap=generateFigureCaption("Venn diagram of overlaps."), fig.width=6, fig.height=6---- ol <- findOverlapsOfPeaks(data, connectedPeaks="keepAll") averagePeakWidth <- mean(width(unlist(GRangesList(ol$peaklist)))) tot <- ceiling(3.3e+9 * .03 / averagePeakWidth / 24) makeVennDiagram(ol, totalTest=tot, connectedPeaks="keepAll") ## ----peakPermTest1, fig.cap=generateFigureCaption("permutation test for YY1 and TEAD4")---- data(HOT.spots) data(wgEncodeTfbsV3) hotGR <- reduce(unlist(HOT.spots)) removeOl <- function(.ele){ ol <- findOverlaps(.ele, hotGR) if(length(ol)>0) .ele <- .ele[-unique(queryHits(ol))] .ele } TAF <- removeOl(data[["TAF"]]) TEAD4 <- removeOl(data[["Tead4"]]) YY1 <- removeOl(data[["YY1"]]) # we subset the pool to save demo time set.seed(1) wgEncodeTfbsV3.subset <- wgEncodeTfbsV3[sample.int(length(wgEncodeTfbsV3), 2000)] pool <- new("permPool", grs=GRangesList(wgEncodeTfbsV3.subset), N=length(YY1)) pt1 <- peakPermTest(YY1, TEAD4, pool=pool, seed=1, force.parallel=FALSE) plot(pt1) ## ----peakPermTest2, fig.cap=generateFigureCaption("permutation test for YY1 and TAF")---- pt2 <- peakPermTest(YY1, TAF, pool=pool, seed=1, force.parallel=FALSE) plot(pt2) ## ----heatmap,fig.cap=generateFigureCaption("Heatmap of aligned features sorted by signal of TAF"),fig.width=4,fig.height=6---- features <- ol$peaklist[[length(ol$peaklist)]] feature.recentered <- reCenterPeaks(features, width=4000) ## here we also suggest importData function in bioconductor trackViewer package ## to import the coverage. ## compare rtracklayer, it will save you time when handle huge dataset. library(rtracklayer) files <- dir(path, "bigWig") if(.Platform$OS.type != "windows"){ cvglists <- sapply(file.path(path, files), import, format="BigWig", which=feature.recentered, as="RleList") }else{## rtracklayer can not import bigWig files on Windows load(file.path(path, "cvglist.rds")) } names(cvglists) <- gsub(".bigWig", "", files) feature.center <- reCenterPeaks(features, width=1) sig <- featureAlignedSignal(cvglists, feature.center, upstream=2000, downstream=2000) ##Because the bw file is only a subset of the original file, ##the signals are not exists for every peak. keep <- rowSums(sig[[2]]) > 0 sig <- sapply(sig, function(.ele) .ele[keep, ], simplify = FALSE) feature.center <- feature.center[keep] heatmap <- featureAlignedHeatmap(sig, feature.center, upstream=2000, downstream=2000, upper.extreme=c(3,.5,4)) ## ----sortHeatmapByHcluster,fig.cap=generateFigureCaption("Heatmap of aligned features sorted by hclut"),fig.width=4,fig.height=6---- sig.rowsums <- sapply(sig, rowSums, na.rm=TRUE) d <- dist(sig.rowsums) hc <- hclust(d) feature.center$order <- hc$order heatmap <- featureAlignedHeatmap(sig, feature.center, upstream=2000, downstream=2000, upper.extreme=c(3,.5,4), sortBy="order") ## ----distribution,fig.cap=generateFigureCaption("Distribution of aligned features"),fig.width=6,fig.height=6---- featureAlignedDistribution(sig, feature.center, upstream=2000, downstream=2000, type="l")