## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 4.5 ) library(SpliceImpactR) library(BiocParallel) ## ----install-biocmanager, eval=FALSE------------------------------------------ # if (!requireNamespace("BiocManager", quietly = TRUE)) { # install.packages("BiocManager") # } # BiocManager::install("SpliceImpactR") ## ----install-devtools, eval=FALSE--------------------------------------------- # if (!requireNamespace("devtools", quietly = TRUE)) { # install.packages("devtools") # } # devtools::install_github("fiszbein-lab/SpliceImpactR") ## ----load-resources----------------------------------------------------------- library(SpliceImpactR) annotation_df <- get_annotation(load = "test") interpro_features <- get_protein_features( biomaRt_databases = c("interpro"), gtf_df = annotation_df$annotations, test = TRUE ) signalp_features <- get_protein_features( biomaRt_databases = c("signalp"), gtf_df = annotation_df$annotations, test = TRUE ) elm_features <- get_protein_features( biomaRt_databases = c("elm"), gtf_df = annotation_df$annotations, test = TRUE ) protein_feature_total <- get_comprehensive_annotations( list(interpro_features, signalp_features, elm_features) ) exon_features <- get_exon_features( annotation_df$annotations, protein_feature_total ) ## ----feature-summary---------------------------------------------------------- table(protein_feature_total$database) ## ----load-data---------------------------------------------------------------- sample_frame <- data.frame( path = c( file.path(system.file("extdata", package = "SpliceImpactR"), "rawData/control_S5/"), file.path(system.file("extdata", package = "SpliceImpactR"), "rawData/control_S6/"), file.path(system.file("extdata", package = "SpliceImpactR"), "rawData/control_S7/"), file.path(system.file("extdata", package = "SpliceImpactR"), "rawData/control_S8/"), file.path(system.file("extdata", package = "SpliceImpactR"), "rawData/case_S1/"), file.path(system.file("extdata", package = "SpliceImpactR"), "rawData/case_S2/"), file.path(system.file("extdata", package = "SpliceImpactR"), "rawData/case_S3/"), file.path(system.file("extdata", package = "SpliceImpactR"), "rawData/case_S4/") ), sample_name = c("S5", "S6", "S7", "S8", "S1", "S2", "S3", "S4"), condition = c( "control", "control", "control", "control", "case", "case", "case", "case" ), stringsAsFactors = FALSE ) ## ----wrapper-quickstart, eval=FALSE------------------------------------------- # # data.table-first return # out_dt <- get_splicing_impact( # sample_frame = sample_frame, # source_data = "both", # annotation_df = annotation_df, # protein_feature_total = protein_feature_total, # exon_features = exon_features, # return_class = "data.table" # ) # # # S4-first return (single object carries all slots) # out_s4 <- get_splicing_impact( # sample_frame = sample_frame, # source_data = "both", # annotation_df = annotation_df, # protein_feature_total = protein_feature_total, # exon_features = exon_features, # return_class = "S4" # ) ## ----read-splicing-events----------------------------------------------------- data <- get_rmats_hit( sample_frame, event_types = c("ALE", "AFE", "MXE", "SE", "A3SS", "A5SS", "RI") ) DT <- data.table::as.data.table(data) DT[, .( n_rows = .N, n_events = data.table::uniqueN(event_id), n_genes = data.table::uniqueN(gene_id) )] ## ----load-data-separate------------------------------------------------------- sample_frame_rmats <- sample_frame sample_frame_hit <- sample_frame rmats_only <- get_rmats( load_rmats( sample_frame_rmats, use = "JCEC", event_types = c("MXE", "SE", "A3SS", "A5SS", "RI") ) ) hit_only <- get_hitindex( sample_frame_hit, keep_annotated_first_last = TRUE ) shared_cols <- intersect(names(rmats_only), names(hit_only)) data_separate <- data.table::rbindlist( list( rmats_only[, ..shared_cols], hit_only[, ..shared_cols] ), use.names = TRUE, fill = TRUE ) data_separate[, .( n_rows = .N, n_events = data.table::uniqueN(event_id), n_genes = data.table::uniqueN(gene_id) )] ## ----hitindex-compare, fig.cap="HITindex condition comparison."--------------- hit_compare <- compare_hit_index( sample_frame, condition_map = c(control = "control", test = "case") ) hit_compare$plot ## ----hitindex-table----------------------------------------------------------- head(hit_compare$results[order(fdr)], 6) ## ----overview-psi, fig.cap="Depth-normalized event counts and PSI overview."---- overview_plot <- overview_spicing_comparison( events = data, sample_df = sample_frame, depth_norm = "exon_files", event_type = "AFE" ) overview_plot ## ----differential-inclusion--------------------------------------------------- res <- get_differential_inclusion( data, min_total_reads = 10, parallel_glm = TRUE, BPPARAM = BiocParallel::SerialParam() ) res_di <- keep_sig_pairs(res) res[, .( n_rows = .N, n_sig = sum(padj <= 0.05 & abs(delta_psi) >= 0.1, na.rm = TRUE) )] ## ----di-volcano, fig.cap="Differential inclusion volcano plot."--------------- plot_di_volcano_dt(res) ## ----matching-and-pairs------------------------------------------------------- matched <- get_matched_events_chunked( res_di, annotation_df$annotations, chunk_size = 2000 ) hits_sequences <- attach_sequences(matched, annotation_df$sequences) pairs <- get_pairs(hits_sequences, source = "multi") pairs[, .( n_pairs = .N, n_events = data.table::uniqueN(event_id) )] ## ----proximal-distal, fig.cap="Proximal versus distal terminal exon summary."---- proximal_output <- get_proximal_shift_from_hits(pairs) proximal_output$plot ## ----sequence-compare--------------------------------------------------------- seq_compare <- compare_sequence_frame(pairs, annotation_df$annotations) ## ----alignment-summary, fig.cap="Protein alignment summary by event type."---- alignment_plot <- plot_alignment_summary(seq_compare) alignment_plot ## ----length-summary, fig.cap="Case versus control isoform length comparison."---- length_plot <- plot_length_comparison(seq_compare) length_plot ## ----domain-background-and-hits----------------------------------------------- bg <- get_background( source = "annotated", annotations = annotation_df$annotations, protein_features = protein_feature_total, BPPARAM = BiocParallel::SerialParam() ) hits_domain <- get_domains(seq_compare, exon_features) hits_domain[, .( n_hits = .N, n_domain_changed = sum(diff_n > 0, na.rm = TRUE) )] ## ----domain-enrichment, fig.cap="Top enriched InterPro domain differences."---- enriched_domains <- enrich_domains_hypergeo( hits = hits_domain, background = bg, db_filter = "interpro" ) domain_plot <- plot_enriched_domains_counts(enriched_domains, top_n = 20) domain_plot ## ----domain-enrichment-by-event-db-------------------------------------------- enriched_afe_interpro <- enrich_by_event( hits = hits_domain, background = bg, events = "AFE", db_filter = "interpro" ) enriched_interpro_only <- enrich_by_db( hits = hits_domain, background = bg, dbs = "interpro" ) list( by_event_rows = nrow(enriched_afe_interpro), by_db_rows = nrow(enriched_interpro_only) ) ## ----ppi-effects, fig.cap="Summary of predicted case/control PPI changes."---- ppi <- get_ppi_interactions() hits_final <- get_ppi_switches( hits_domain, ppi, protein_feature_total ) ppi_plot <- plot_ppi_summary(hits_final) ppi_plot ## ----enrichment-foregrounds--------------------------------------------------- fg_di <- get_gene_enrichment( mode = "di", res = res, padj_threshold = 0.1, delta_psi_threshold = 0.1 ) fg_domain <- get_gene_enrichment(mode = "domain", hits = hits_domain) fg_ppi <- get_gene_enrichment(mode = "ppi", hits = hits_final) lengths(list(di = fg_di, domain = fg_domain, ppi = fg_ppi)) intersect(fg_di, fg_ppi) ## ----run-get-enrichment------------------------------------------------------- needed <- c("clusterProfiler", "msigdbr", "org.Hs.eg.db") has_needed <- all(vapply( needed, requireNamespace, FUN.VALUE = logical(1), quietly = TRUE )) if (has_needed) { enrichment_di <- get_enrichment( foreground = fg_domain, background = bg$gene_id, species = "human", gene_id_type = "ensembl", sources = "GO:BP", p_adjust_cutoff = 1, ## CHANGE FOR NON-TOY DATA USAGE, eg: 0.05 top_n_plot = 1 ## CHANGE FOR NON-TOY DATA USAGE, eg: NULL or 10 ) enrichment_di$plot } else { message( "Skipping get_enrichment(): install ", paste(needed, collapse = ", ") ) } ## ----transcript-protein-views, fig.cap="Transcript and protein views for one paired event."---- viz_pair <- hits_final[ !is.na(transcript_id_case) & !is.na(transcript_id_control) & transcript_id_case != "" & transcript_id_control != "" ][1] if (nrow(viz_pair) == 0) { stop("No transcript pairs available for visualization.") } tx_pair <- c(viz_pair$transcript_id_case, viz_pair$transcript_id_control) transcript_centric <- plot_two_transcripts_with_domains_unified( transcripts = tx_pair, gtf_df = annotation_df$annotations, protein_features = protein_feature_total, feature_db = c("interpro"), combine_domains = TRUE, view = "transcript" ) protein_centric <- plot_two_transcripts_with_domains_unified( transcripts = tx_pair, gtf_df = annotation_df$annotations, protein_features = protein_feature_total, feature_db = c("interpro"), combine_domains = TRUE, view = "protein" ) transcript_centric protein_centric ## ----probe-event, fig.cap="PSI distribution for one selected event."---------- event_to_probe <- res$event_id[1] probe <- probe_individual_event(data, event = event_to_probe) probe$plot ## ----probe-table-------------------------------------------------------------- head(probe$data) ## ----integrated-summary, fig.cap="Integrated multi-layer summary.", fig.width=14, fig.height=10---- int_summary <- integrated_event_summary(hits_final, res) int_summary$plot ## ----integrated-summary-table------------------------------------------------- int_summary$summaries$relative_use ## ----s4-init------------------------------------------------------------------ obj <- as_splice_impact_result( data = data, res = res, res_di = res_di, matched = hits_sequences, sample_frame = sample_frame, hits_final = hits_final ) ## ----s4-accessors------------------------------------------------------------- raw_dt <- as_dt_from_s4(obj, "raw_events") di_dt <- as_dt_from_s4(obj, "di_events") hits_dt <- as_dt_from_s4(obj, "paired_hits") hits_core <- get_hits_core(obj) hits_domain_view <- get_hits_domain(obj) hits_ppi_view <- get_hits_ppi(obj) hits_sequence_view <- get_hits_sequence(obj) c( raw_rows = nrow(raw_dt), di_rows = nrow(di_dt), hits_rows = nrow(hits_dt), hits_core_rows = nrow(hits_core), hits_domain_rows = nrow(hits_domain_view), hits_ppi_rows = nrow(hits_ppi_view), hits_sequence_rows = nrow(hits_sequence_view) ) ## ----s4-filtering------------------------------------------------------------- obj_focus <- filter_spliceimpact_hits( obj, diff_n > 0, n_ppi > 0 ) focus_hits <- as_dt_from_s4(obj_focus, "paired_hits") focus_hits[, .N, by = event_type][order(-N)] ## ----s4-enrichment------------------------------------------------------------ fg_di_s4 <- get_gene_enrichment( mode = "di", x = obj, padj_threshold = 0.05, delta_psi_threshold = 0.1 ) fg_ppi_s4 <- get_gene_enrichment(mode = "ppi", x = obj) fg_ppi_focus <- get_gene_enrichment(mode = "ppi", x = obj_focus) list( di_equal_table = identical(sort(fg_di), sort(fg_di_s4)), ppi_equal_table = identical(sort(fg_ppi), sort(fg_ppi_s4)), focused_ppi_genes = head(fg_ppi_focus) ) ## ----s4-schema---------------------------------------------------------------- spliceimpact_s4_schema() ## ----s4-main-workflow-code-only, eval=FALSE----------------------------------- # obj_flow <- as_splice_impact_result( # data = data, # sample_frame = sample_frame # ) # # obj_flow <- get_differential_inclusion( # obj_flow, # min_total_reads = 10, # parallel_glm = TRUE, # BPPARAM = BiocParallel::SerialParam(), # return_class = "S4" # ) # obj_flow <- keep_sig_pairs(obj_flow, return_class = "S4") # # obj_flow <- get_matched_events_chunked( # obj_flow, # annotation_df$annotations, # chunk_size = 2000, # return_class = "S4" # ) # obj_flow <- attach_sequences( # obj_flow, # annotation_df$sequences, # return_class = "S4" # ) # obj_flow <- get_pairs(obj_flow, source = "multi", return_class = "S4") # obj_flow <- compare_sequence_frame( # obj_flow, # annotation_df$annotations, # return_class = "S4" # ) # obj_flow <- get_domains(obj_flow, exon_features, return_class = "S4") # obj_flow <- get_ppi_switches( # obj_flow, # ppi, # protein_feature_total, # return_class = "S4" # ) # # hits_core_flow <- get_hits_core(obj_flow) # hits_domain_flow <- get_hits_domain(obj_flow) # hits_ppi_flow <- get_hits_ppi(obj_flow) # hits_sequence_flow <- get_hits_sequence(obj_flow) ## ----custom-manual-features--------------------------------------------------- ann_dt <- data.table::as.data.table(annotation_df$annotations) coding_tx <- unique(ann_dt[type == "exon" & cds_has == TRUE, transcript_id]) n_manual <- min(3L, length(coding_tx)) if (n_manual < 1L) { stop("No coding transcripts found for manual feature example.") } manual_df <- data.frame( ensembl_transcript_id = coding_tx[seq_len(n_manual)], ensembl_peptide_id = rep(NA_character_, n_manual), name = paste0("demo_feature_", seq_len(n_manual)), start = c(20L, 45L, 80L)[seq_len(n_manual)], stop = c(35L, 58L, 92L)[seq_len(n_manual)], database = rep("manual", n_manual), alt_name = rep(NA_character_, n_manual), feature_id = rep(NA_character_, n_manual), stringsAsFactors = FALSE ) manual_features <- get_manual_features( manual_features = manual_df, gtf_df = annotation_df$annotations ) head(manual_features) ## ----custom-pre-di------------------------------------------------------------ example_df <- data.frame( event_id = rep("A3SS:1", 8), event_type = "A3SS", form = rep(c("INC", "EXC"), each = 4), gene_id = "ENSG00000158286", chr = "chrX", strand = "-", inc = c(rep("149608626-149608834", 4), rep("149608626-149608829", 4)), exc = c(rep("", 4), rep("149608830-149608834", 4)), inclusion_reads = c(30, 32, 29, 31, 2, 3, 4, 3), exclusion_reads = c(1, 1, 2, 1, 28, 27, 26, 30), sample = c("S1", "S2", "S3", "S4", "S1", "S2", "S3", "S4"), condition = rep(c("case", "case", "control", "control"), 2), stringsAsFactors = FALSE ) example_df$psi <- example_df$inclusion_reads / (example_df$inclusion_reads + example_df$exclusion_reads) user_data <- get_user_data(example_df) head(user_data) ## ----custom-post-di----------------------------------------------------------- example_user_data <- data.frame( event_id = rep("A3SS:1", 8), event_type = "A3SS", gene_id = "ENSG00000158286", chr = "chrX", strand = "-", form = rep(c("INC", "EXC"), each = 4), inc = c(rep("149608626-149608834", 4), rep("149608626-149608829", 4)), exc = c(rep("", 4), rep("149608830-149608834", 4)), stringsAsFactors = FALSE ) user_res <- get_user_data_post_di(example_user_data) head(user_res) ## ----custom-rmats-post-di----------------------------------------------------- rmats_df <- data.frame( ID = 1L, GeneID = "ENSG00000182871", geneSymbol = "COL18A1", chr = "chr21", strand = "+", longExonStart_0base = 45505834L, longExonEnd = 45505966L, shortES = 45505837L, shortEE = 45505966L, flankingES = 45505357L, flankingEE = 45505431L, ID.2 = 2L, IJC_SAMPLE_1 = "1,1,1", SJC_SAMPLE_1 = "1,1,1", IJC_SAMPLE_2 = "1,1,1", SJC_SAMPLE_2 = "1,1,1", IncFormLen = 52L, SkipFormLen = 49L, PValue = 0.6967562, FDR = 1, IncLevel1 = "0.0,0.0,0.0", IncLevel2 = "1.0,1.0,1.0", IncLevelDifference = 1.0, stringsAsFactors = FALSE ) user_res_rmats <- get_rmats_post_di(rmats_df, event_type = "A3SS") head(user_res_rmats) ## ----custom-transcript-pairs-------------------------------------------------- tx_ids <- unique(annotation_df$annotations$transcript_id) tx_ids <- tx_ids[!is.na(tx_ids) & tx_ids != ""] if (length(tx_ids) < 4L) { stop("Need at least four transcripts for transcript-pair example.") } transcript_pairs <- data.frame( transcript1 = tx_ids[1:2], transcript2 = tx_ids[3:4], stringsAsFactors = FALSE ) user_matched <- compare_transcript_pairs( transcript_pairs = transcript_pairs, annotations = annotation_df$annotations ) head(user_matched) ## ----------------------------------------------------------------------------- sessionInfo()