## ----style, echo=FALSE, results='asis', message=FALSE------------------------- knitr::opts_chunk$set( tidy = FALSE, warning = FALSE, message = FALSE ) library(yulab.utils) Biocannopkg <- yulab.utils::Biocpkg ## ----echo=FALSE, results='hide', message=FALSE-------------------------------- library(GenomicFeatures) library(GenomicRanges) library(TxDb.Hsapiens.UCSC.hg38.knownGene) library(BSgenome.Hsapiens.UCSC.hg38) library(org.Hs.eg.db) library(ggplot2) library(clusterProfiler) library(ReactomePA) library(epiSeeker) ## ----load-data---------------------------------------------------------------- txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene data(peakAnnoList) # peakAnnoList <- lapply(files, annotateSeq, TxDb=txdb, # tssRegion=c(-3000, 3000), verbose=FALSE) peakAnno <- peakAnnoList[[4]] ## ----read-peak-file----------------------------------------------------------- files <- getSampleFiles() print(files) peak <- readPeakFile(files[[4]]) peak ## ----cov-plot, fig.height=4, fig.width=10------------------------------------- plotCov(peak, weightCol = "V5", chrs = c("chr17", "chr18"), xlim = c(4.5e7, 5e7), interactive = TRUE) ## ----eval=FALSE--------------------------------------------------------------- # # Details of the dataset and pre-processing we used here can be found from xxxx # # We only show usage here for demonstration # pancancer_atac <- readRDS("./data/pancancer_atac.rds") # # fill_color <- c( # "#a10589", "#fce364", "#854d23", "#c0b742", # "#1c670d", "#3ec486", "#f7bbff", "#ee0700", # "#172ec5", "#f56103", "#3f3a95", "#7a7875", # "#e06cdc", "#7effff", "#f9b250", "#af1e18", # "#997f69", "#9d82ed", "#40bcf5", "#f28582", # "#97be8c", "#7b2ed0", "#81f75b" # ) # # design <- "AAAB # AAAB # AAAB # CCC#" # # atac_p <- plotCov(pancancer_atac, # weightCol = "V5", # chrs = "chr8", title = "", # xlim = c(126712193, 128412193), # xlab = "", fill_col = fill_color, # legend_position = "none", facet_var = ".id~.", # add_cluster_tree = TRUE, # add_coaccess = TRUE, # design = design # ) ## ----eval=FALSE--------------------------------------------------------------- # tssTagMatrix <- getTagMatrix( # peak = peak, upstream = 3000, downstream = 3000, weightCol = "V5", # type = "start_site", by = "transcript", TxDb = txdb # ) # # # plot_prof = FALSE will plot only heatmap # plotPeakHeatmap(tssTagMatrix, # plot_prof = TRUE, # statistic_method = "mean", # missingDataAsZero = TRUE, # conf = 0.95, resample = 1000 # ) ## ----eval=FALSE--------------------------------------------------------------- # plotPeakProf(tssTagMatrix, conf = 0.95, resample = 1000) ## ----plot-bm-prof, fig.height=10, fig.width=15, fig.align="center"------------ data(demo_bmdata) bmMatrix <- getBmMatrix( region = data.frame(chr = "chr22", start = 10525991, end = 10526342), BSgenome = BSgenome.Hsapiens.UCSC.hg38, input = demo_bmdata, base = "C", motif = c("CG") ) # Interactive function is also supported by interactive parameter plotBmProf(bmMatrix, interactive = TRUE) # htmlwidgets::saveWidget(p, "base_modification.html") ## ----plot-gene-track, fig.height=3, fig.width=8, fig.align="center"----------- # this region is the same as the region of pancancer figure plotGeneTrack( txdb = txdb, chr = "chr8", start_pos = 126712193, end_pos = 128412193, OrgDb = "org.Hs.eg.db" ) ## ----plot-motif-prof, fig.height=5, fig.width=10, fig.align="center"---------- # motif reference of position-specific weight matrix # opts_base <- list() # opts_base[["collection"]] <- "CORE" # opts_base[["all_versions"]] <- FALSE # opts_human <- opts_base # opts_human[["species"]] <- "Homo sapiens" # opts_human[["tax_group"]] <- "vertebrates" # sq24 <- RSQLite::dbConnect(RSQLite::SQLite(), JASPAR2024::db(JASPAR2024::JASPAR2024())) # pwm_obj <- TFBSTools::getMatrixSet(sq24, opts_human) data(pwm_obj) motifMatrix <- getMotifMatrix( region = GRanges( seqnames = "chr22", ranges = IRanges(start = 10525991, end = 10526342) ), pwm = pwm_obj, ref_obj = BSgenome.Hsapiens.UCSC.hg38 ) # Interactive function is also supported by interactive parameter plotMotifProf(motifMatrix, interactive = TRUE) ## ----annotate-seq, eval = FALSE----------------------------------------------- # peakAnno <- annotateSeq(files[[4]], # tssRegion = c(-3000, 3000), # TxDb = txdb, annoDb = "org.Hs.eg.db" # ) ## ----annotate-seq-edb, eval = FALSE------------------------------------------- # library(EnsDb.Hsapiens.v75) # edb <- EnsDb.Hsapiens.v75 # seqlevelsStyle(edb) <- "UCSC" # # peakAnno.edb <- annotateSeq(files[[4]], # tssRegion = c(-3000, 3000), # TxDb = edb, annoDb = "org.Hs.eg.db" # ) ## ----plot-anno-bar, fig.cap="Genomic Annotation by barplot", fig.align="center", fig.height=4, fig.width=10---- plotAnnoBar(peakAnno) ## ----plot-dist-to-tss, fig.cap="Distribution of Binding Sites", fig.align="center", fig.height=2, fig.width=6---- plotDistToTSS(peakAnno, title = "Distribution of transcription factor-binding loci\nrelative to TSS" ) ## ----enrichment-analysis, eval = FALSE---------------------------------------- # library(ReactomePA) # # pathway1 <- enrichPathway(as.data.frame(peakAnno)$geneId) # head(pathway1, 2) # ## ID Description GeneRatio BgRatio RichFactor # ## R-HSA-9830369 R-HSA-9830369 Kidney development 19/506 46/11214 0.4130435 # ## R-HSA-9758941 R-HSA-9758941 Gastrulation 27/506 115/11214 0.2347826 # ## FoldEnrichment zScore pvalue p.adjust qvalue # ## R-HSA-9830369 9.153892 12.045870 2.601506e-14 2.796619e-11 2.760335e-11 # ## R-HSA-9758941 5.203265 9.848629 8.375539e-13 4.501852e-10 4.443444e-10 # ## geneID # ## R-HSA-9830369 2625/5076/7490/652/6495/2303/3975/6928/10736/5455/7849/3237/6092/2122/255743/2296/3400/28514/2138 # ## R-HSA-9758941 5453/5692/5076/5080/7546/3169/652/5015/2303/5717/3975/2627/5714/344022/7849/5077/2637/7022/8320/7545/6657/4487/51176/2296/28514/2626/64321 # ## Count # ## R-HSA-9830369 19 # ## R-HSA-9758941 27 # # gene <- seq2gene(peak, tssRegion = c(-1000, 1000), flankDistance = 3000, TxDb = txdb) # pathway2 <- enrichPathway(gene) # head(pathway2, 2) # ## ID Description GeneRatio BgRatio RichFactor # ## R-HSA-9830369 R-HSA-9830369 Kidney development 17/415 46/11214 0.3695652 # ## R-HSA-9758941 R-HSA-9758941 Gastrulation 24/415 115/11214 0.2086957 # ## FoldEnrichment zScore pvalue p.adjust qvalue # ## R-HSA-9830369 9.986276 11.971929 2.162576e-13 2.175552e-10 2.114772e-10 # ## R-HSA-9758941 5.639309 9.802874 3.528656e-12 1.774914e-09 1.725327e-09 # ## geneID # ## R-HSA-9830369 2625/5076/3227/652/6495/2303/3975/3237/6092/2122/255743/7490/6928/7849/5455/2296/28514 # ## R-HSA-9758941 5453/5692/5076/7546/3169/652/2303/5717/3975/2627/5714/344022/2637/8320/7545/7020/2626/5080/5015/7849/51176/2296/64321/28514 # ## Count # ## R-HSA-9830369 17 # ## R-HSA-9758941 24 # # dotplot(pathway2) ## ----plot-anno-bar-list, fig.cap="Genomic Annotation among different ChIPseq data", fig.align="center", fig.height=4, fig.width=6---- plotAnnoBar(peakAnnoList) ## ----compare-cluster, eval = FALSE-------------------------------------------- # genes <- lapply(peakAnnoList, function(i) as.data.frame(i)$geneId) # names(genes) <- sub("_", "\n", names(genes)) # compKEGG <- compareCluster( # geneCluster = genes, # fun = "enrichKEGG", # pvalueCutoff = 0.05, # pAdjustMethod = "BH" # ) # dotplot(compKEGG, showCategory = 15, title = "KEGG Pathway Enrichment Analysis") ## ----venn-plot, fig.cap="Overlap of annotated genes", fig.align="center", fig.height=7, fig.width=7---- genes <- lapply(peakAnnoList, function(i) as.data.frame(i)$geneId) vennplot(genes) ## ----shuffle-peak------------------------------------------------------------- p <- GRanges( seqnames = c("chr1", "chr3"), ranges = IRanges(start = c(1, 100), end = c(50, 130)) ) shuffle(p, TxDb = txdb) ## ----enrich-peak-overlap, eval = FALSE---------------------------------------- # enrichPeakOverlap( # queryPeak = files[[5]], # targetPeak = unlist(files[1:4]), # TxDb = txdb, # pAdjustMethod = "BH", # nShuffle = 50, # chainFile = NULL, # verbose = FALSE # ) # ## qSample # ## ARmo_0M GSM1295077_CBX7_BF_ChipSeq_mergedReps_peaks.bed.gz # ## ARmo_1nM GSM1295077_CBX7_BF_ChipSeq_mergedReps_peaks.bed.gz # ## ARmo_100nM GSM1295077_CBX7_BF_ChipSeq_mergedReps_peaks.bed.gz # ## CBX6_BF GSM1295077_CBX7_BF_ChipSeq_mergedReps_peaks.bed.gz # ## tSample qLen tLen N_OL # ## ARmo_0M GSM1174480_ARmo_0M_peaks.bed.gz 1663 812 0 # ## ARmo_1nM GSM1174481_ARmo_1nM_peaks.bed.gz 1663 2296 8 # ## ARmo_100nM GSM1174482_ARmo_100nM_peaks.bed.gz 1663 1359 3 # ## CBX6_BF GSM1295076_CBX6_BF_ChipSeq_mergedReps_peaks.bed.gz 1663 1331 968 # ## pvalue p.adjust # ## ARmo_0M 0.82352941 0.82352941 # ## ARmo_1nM 0.17647059 0.35294118 # ## ARmo_100nM 0.33333333 0.44444444 # ## CBX6_BF 0.01960784 0.07843137 ## ----get-geo-species---------------------------------------------------------- getGEOspecies() |> head() ## ----get-geo-genome----------------------------------------------------------- getGEOgenomeVersion() |> head() ## ----get-geo-info------------------------------------------------------------- hg19 <- getGEOInfo(genome = "hg19", simplify = TRUE) head(hg19) ## ----download-geo-bed, eval=FALSE--------------------------------------------- # downloadGEObedFiles(genome = "hg19", destDir = "hg19") ## ----download-gsm-bed, eval=FALSE--------------------------------------------- # gsm <- hg19$gsm[sample(nrow(hg19), 10)] # downloadGSMbedFiles(gsm, destDir = "hg19") ## ----session-info, echo=FALSE------------------------------------------------- sessionInfo()