## ----setup, include=TRUE------------------------------------------------------ # Global knitr configuration knitr::opts_chunk$set(echo = TRUE, collapse = TRUE, message = FALSE, warning = FALSE) # Load libraries required for the pipeline workflow library(lncRna) library(rtracklayer) library(GenomicRanges) library(IRanges) library(gprofiler2) library(plotly) library(fmsb) library(patchwork) ## ----create-mock-data--------------------------------------------------------- # 1. Sample GRanges object (mimicking an imported GTF file) mockGtf <- GRanges( seqnames = "chr1", ranges = IRanges(start = c(100, 200, 400, 500), width = c(50, 60, 100, 20)), strand = "+", type = c("exon", "exon", "transcript", "exon"), gene_id = c("G1", "G1", "G2", "G2"), gene_biotype = c("protein_coding", "protein_coding", "lncRNA", "lncRNA"), transcript_id = c("tx1", "tx1", "tx2", "tx2"), transcript_biotype = c("protein_coding", "protein_coding", "lncRNA", "lncRNA"), exon_number = c("1", "2", NA, "1") ) # 2. Partitioning sequences using createTrainTestSets # This ensures we have ground truth sets for performance evaluation all_ids <- c("tx1", "tx2", "tx3", "tx4", "tx5", "tx6") set.seed(123) sets <- createTrainTestSets(sequences = all_ids, percentTrain = 0.5, prefix = "demo") # Define independent test sets (nc = non-coding, cds = coding) ncTestSet <- sets$demo.test cdsTestSet <- sets$demo.train ## ----extract-stats------------------------------------------------------------ # Calculate transcript length and exon count transcriptStats <- getGtfStats(gtfObject = mockGtf) # Extract biotype information at the gene level geneBiotypes <- getBiotypes(refGtf = mockGtf, level = "gene") print(head(transcriptStats)) ## ----aggregate-and-venn------------------------------------------------------- # Mocking aggregated results from CPC2, PLEK, and CPAT mockCodPotList <- list( seqIDs = c("tx1", "tx2", "tx3", "tx4", "tx5", "tx6"), tools = list( CPC2 = c(0, 1, 1, 0, 1, 0), PLEK = c(0, 1, 0, 0, 1, 1), CPAT = c(0, 1, 1, 1, 0, 0) ) ) # Visualize tool agreement (Venn) if (requireNamespace("venn", quietly = TRUE)) { plotVennCodPot(codPot = mockCodPotList) } ## ----evaluation-pipeline------------------------------------------------------ # 1. Prepare evaluation sets by annotating predictions with ground truth evaluationSummary <- prepareEvaluationSets( codPotList = mockCodPotList, ncTest = ncTestSet, cdsTest = cdsTestSet ) # 2. Evaluate performance for each tool combination (e.g., "CPC2+PLEK") combSummary <- evaluateToolCombinations(summaryList = evaluationSummary) # 3. Calculate summary metrics combStats <- bestToolCombination(combinationSummaryList = combSummary) print(combStats) ## ----calculate-cm------------------------------------------------------------- # Filter for methods with Accuracy >= 0.5 (low threshold for demo purposes) # Use metricsData to provide values for filtering allCms <- calculateCM( combinationSummaryList = combSummary, metricsData = combStats, threshold = 0.5, returnOnlyHigh = TRUE, metricToExtract = "Accuracy", printMetricThresholds = TRUE ) ## ----viz-performance, fig.width=10, fig.height=6------------------------------ # Comparison of top combinations (Radar Plot) methodsToPlot <- names(allCms)[1:min(3, length(allCms))] plotRadarMetrics(cmList = allCms, methods = methodsToPlot, plotTitle = "Top Methods Comparison") # Detailed metrics view (Clock Plot) in 'multiple' layout plotClockMetrics(cmList = allCms, plotTitle = "Performance Metrics Breakdown", layout = "multiple") ## ----interactions------------------------------------------------------------- # Identify Cis-interactions (genomic proximity) feelncFile <- tempfile() write.table(data.frame(isBest=1, lncRNA_gene="LNC1", partnerRNA_gene="GENE_A", distance=500), feelncFile, sep="\t", row.names=FALSE) cisInteractions <- findCisInteractions(FEELncClassesFile = feelncFile, maxDist = 1000) # Identify Trans-interactions via expression correlation mockExpr <- matrix(rnorm(50), nrow = 5, dimnames = list(c("LNC1", "LNC2", "GENE_A", "GENE_B", "GENE_C"), paste0("S", 1:10))) mockExpr["LNC1",] <- mockExpr["GENE_A",] + rnorm(10, 0, 0.1) transInteractions <- findTransInteractions(exprMatrix = mockExpr, rval = 0.8, pval = 0.05) ## ----sankey-viz--------------------------------------------------------------- # Sample enrichment result (mimicking gprofiler2::gost) mockGost <- data.frame(term_id="GO:01", term_name="stress response", intersection="GENE_A") # Annotate interactions with functional terms interactionsProcessed <- annotateInteractions(gostResult = mockGost, interactionTable = transInteractions, type = "trans") # Generate interactive Sankey diagram plotSankeyInteractions(interactionData = interactionsProcessed, groupBy = "term", selectIds = "stress response", title = "Functional Interaction Flow") ## ----session-info------------------------------------------------------------- sessionInfo()