## ----knitr options, echo = FALSE, include = FALSE----------------------------- knitr::opts_chunk[["set"]]( collapse = TRUE, comment = "#>", dev = "png", fig.width = 6L, fig.height = 4L, dpi = 72L ) ## ----Preamble, message=FALSE, warning=FALSE----------------------------------- library(COTAN) library(zeallot) # necessary to solve precedence of overloads conflicted::conflict_prefer("%<-%", "zeallot") prevOptState <- options(parallelly.fork.enable = TRUE) ## ----Download data, eval=TRUE, include=TRUE----------------------------------- dataDir <- tempdir() print(dataDir) GEO <- "GSM2861514" fName <- "GSM2861514_E175_Only_Cortical_Cells_DGE.txt.gz" dataSetFile <- file.path(dataDir, GEO, fName) dir.create(file.path(dataDir, GEO), showWarnings = FALSE) if (!file.exists(dataSetFile)) { GEOquery::getGEOSuppFiles( GEO, makeDirectory = TRUE, baseDir = dataDir, fetch_files = TRUE, filter_regex = fName ) } sampleDataset <- read.csv(dataSetFile, sep = "\t", row.names = 1L) ## ----Setup log and check for GPU---------------------------------------------- outDir <- dataDir # Log-level 2 was chosen to showcase better how the package works # In normal usage a level of 0 or 1 is more appropriate setLoggingLevel(2L) # This file will contain all the logs produced by the package # as if at the highest logging level setLoggingFile(file.path(outDir, "vignette_v2.log")) message("COTAN uses the `torch` library when asked to `optimizeForSpeed`") message("Run the command 'options(COTAN.UseTorch = FALSE)'", " in your session to disable `torch` completely!") # this command does check whether the torch library is properly installed c(useTorch, deviceStr) %<-% COTAN::canUseTorch(TRUE, "cuda") if (useTorch) { message("The `torch` library is available and ready to use") if (deviceStr == "cuda") { message("The `torch` library can use the `CUDA` GPU") } else { message("The `torch` library can only use the CPU") message("Please ensure you have the `OpenBLAS` libraries", " installed on the system") } } rm(useTorch, deviceStr) ## ----Create the COTAN object-------------------------------------------------- cond <- "mouse_cortex_E17.5" obj <- COTAN(raw = sampleDataset) obj <- initializeMetaDataset( obj, GEO = GEO, sequencingMethod = "Drop_seq", sampleCondition = cond ) logThis(paste0("Condition ", getMetadataElement(obj, datasetTags()[["cond"]])), logLevel = 1L) ## ----ECD plot----------------------------------------------------------------- plot(ECDPlot(obj)) ## ----Cells size plot---------------------------------------------------------- plot(cellSizePlot(obj)) ## ----Number of detected genes------------------------------------------------- plot(genesSizePlot(obj)) ## ----Scatter plot: Cells size vs Number of detected genes--------------------- plot(scatterPlot(obj)) ## ----Cull probable doublets or multiplets------------------------------------- cellsSizeThr <- 6000L obj <- addElementToMetaDataset(obj, "Cells size threshold", cellsSizeThr) cellsToRem <- getCells(obj)[getCellsSize(obj) > cellsSizeThr] obj <- dropGenesCells(obj, cells = cellsToRem) plot(cellSizePlot(obj)) ## ----Cull probable doublets or multiplets - 2--------------------------------- genesSizeHighThr <- 3000L obj <- addElementToMetaDataset(obj, "Num genes high threshold", genesSizeHighThr) cellsToRem <- getCells(obj)[getNumExpressedGenes(obj) > genesSizeHighThr] obj <- dropGenesCells(obj, cells = cellsToRem) plot(genesSizePlot(obj)) ## ----Cull probable dead cells------------------------------------------------- genesSizeLowThr <- 500L obj <- addElementToMetaDataset(obj, "Num genes low threshold", genesSizeLowThr) cellsToRem <- getCells(obj)[getNumExpressedGenes(obj) < genesSizeLowThr] obj <- dropGenesCells(obj, cells = cellsToRem) plot(genesSizePlot(obj)) ## ----Calculate percentage of mitochondrial genes------------------------------ c(mitPlot, mitSizes) %<-% mitochondrialPercentagePlot(obj, genePrefix = "^Mt") plot(mitPlot) ## ----Cull probable dead cells - 2--------------------------------------------- mitPercThr <- 1.0 obj <- addElementToMetaDataset(obj, "Mitoc. perc. threshold", mitPercThr) cellsToRem <- rownames(mitSizes)[mitSizes[["mit.percentage"]] > mitPercThr] obj <- dropGenesCells(obj, cells = cellsToRem) c(mitPlot, mitSizes) %<-% mitochondrialPercentagePlot(obj, genePrefix = "^Mt") plot(mitPlot) ## ----Drop the mitochondrial genes--------------------------------------------- genesToRem <- getGenes(obj)[grep("^Mt", getGenes(obj))] obj <- dropGenesCells(obj, genes = genesToRem) cellsToRem <- getCells(obj)[getCellsSize(obj) == 0L] obj <- dropGenesCells(obj, cells = cellsToRem) ## ----Log current dataset status----------------------------------------------- logThis(paste("n cells", getNumCells(obj)), logLevel = 1L) ## ----Calculate B group for spurious cells according to PCA-------------------- obj <- addElementToMetaDataset(obj, "Num drop B group", 0L) obj <- clean(obj) c(pcaCellsPlot, pcaCellsData, genesPlot, UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj) plot(pcaCellsPlot) ## ----Show most expressed genes of B group------------------------------------- plot(genesPlot) ## ----Cull B group cells and re-calculate, eval=TRUE, include=TRUE------------- cellsToRem <- rownames(pcaCellsData)[pcaCellsData[["groups"]] == "B"] obj <- dropGenesCells(obj, cells = cellsToRem) obj <- addElementToMetaDataset(obj, "Num drop B group", 1L) obj <- clean(obj) c(pcaCellsPlot, pcaCellsData, genesPlot, UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj) plot(pcaCellsPlot) ## ----Cells efficiency PCA plot------------------------------------------------ plot(UDEPlot) ## ----Cells efficiency plot---------------------------------------------------- plot(nuPlot) ## ----Cells efficiency plot (low tail)----------------------------------------- plot(zoomedNuPlot) ## ----Cull cells with too low efficiency--------------------------------------- udeLowThr <- 0.30 obj <- addElementToMetaDataset(obj, "Low UDE cells' threshold", udeLowThr) obj <- addElementToMetaDataset(obj, "Num drop B group", 2L) obj <- estimateNuLinear(obj) cellsToRem <- getCells(obj)[getNu(obj) < udeLowThr] obj <- dropGenesCells(obj, cells = cellsToRem) ## ----Re-calculate clean-plots------------------------------------------------- obj <- clean(obj) c(pcaCellsPlot, pcaCellsData, genesPlot, UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj) plot(pcaCellsPlot) ## ----Updated scatter plot----------------------------------------------------- plot(scatterPlot(obj)) ## ----Log updated dataset status----------------------------------------------- logThis(paste("n cells", getNumCells(obj)), logLevel = 1L) ## ----Calculate and store COEX matrix------------------------------------------ obj <- proceedToCoex( obj, calcCoex = TRUE, optimizeForSpeed = TRUE, cores = 6L, deviceStr = "cuda", saveObj = FALSE, outDir = outDir ) ## ----Save COTAN obj to file, eval=FALSE, include=TRUE------------------------- # # saving the structure # saveRDS(obj, file = file.path(outDir, paste0(cond, ".cotan.RDS"))) ## ----COTAN object construction for no cleaning case, eval=FALSE, include=TRUE---- # obj2 <- # automaticCOTANObjectCreation( # raw = sampleDataset, # GEO = GEO, # sequencingMethod = "Drop_seq", # sampleCondition = cond, # calcCoex = TRUE, # cores = 6L, # optimizeForSpeed = TRUE, # saveObj = TRUE, # outDir = outDir # ) ## ----Calculate and store GDI-------------------------------------------------- gdiDF <- calculateGDI(obj) head(gdiDF) # This will store only the $GDI column obj <- storeGDI(obj, genesGDI = gdiDF) ## ----GDI plot with marked genes----------------------------------------------- genesList <- list( "NPGs" = c("Nes", "Vim", "Sox2", "Sox1", "Notch1", "Hes1", "Hes5", "Pax6"), "PNGs" = c("Map2", "Tubb3", "Neurod1", "Nefm", "Nefl", "Dcx", "Tbr1"), "hk" = c("Calm1", "Cox6b1", "Ppia", "Rpl18", "Cox7c", "Erh", "H3f3a", "Taf1", "Taf2", "Gapdh", "Actb", "Golph3", "Zfr", "Sub1", "Tars", "Amacr") ) GDIPlot(obj, cond = cond, genes = genesList, GDIThreshold = 1.40) ## ----Heatmap plot of coex for selected genes---------------------------------- plot(heatmapPlot(obj, genesLists = genesList)) ## ----Heatmap plot of coex given primary marker genes, eval=TRUE, include=TRUE---- invisible( genesHeatmapPlot( obj, primaryMarkers = c("Satb2", "Bcl11b", "Vim", "Hes1"), pValueThreshold = 0.001, symmetric = TRUE ) ) ## ----Heatmap plot of coex given two genes sets, eval=FALSE, include=TRUE------ # invisible( # genesHeatmapPlot( # obj, # primaryMarkers = c("Satb2", "Bcl11b", "Fezf2"), # secondaryMarkers = c("Gabra3", "Meg3", "Cux1", "Neurod6"), # pValueThreshold = 0.001, # symmetric = FALSE # ) # ) ## ----Get the contingency tables and COEX for given gene pair------------------ cat("Contingency Tables:") contingencyTables(obj, g1 = "Satb2", g2 = "Bcl11b") cat("Corresponding COEX") getGenesCoex(obj)["Satb2", "Bcl11b"] ## ----Display small chunk of the COEX matrix----------------------------------- # For the whole matrix coex <- getGenesCoex(obj, zeroDiagonal = FALSE) coex[1000L:1005L, 1000L:1005L] ## ----Get sub-matrix of the COEX----------------------------------------------- # For a partial matrix coex <- getGenesCoex(obj, genes = c("Satb2", "Bcl11b", "Fezf2")) coex[1000L:1005L, ] ## ----Establish genes clusters, eval=TRUE, include=TRUE------------------------ layersGenes <- list( "L1" = c("Reln", "Lhx5"), "L2/3" = c("Satb2", "Cux1"), "L4" = c("Rorb", "Sox5"), "L5/6" = c("Bcl11b", "Fezf2"), "Prog" = c("Vim", "Hes1", "Dummy") ) c(gSpace, eigPlot, pcaGenesClDF, treePlot) %<-% establishGenesClusters( obj, groupMarkers = layersGenes, numGenesPerMarker = 25L, kCuts = 5L ) plot(eigPlot) ## ----Genes tree plot, eval=TRUE, include=TRUE--------------------------------- plot(treePlot) ## ----Genes UMAP plot, eval=TRUE, include=TRUE--------------------------------- colSelection <- vapply(pcaGenesClDF, is.numeric, logical(1L)) genesUmapPl <- UMAPPlot( pcaGenesClDF[, colSelection, drop = FALSE], clusters = getColumnFromDF(pcaGenesClDF, "hclust"), elements = layersGenes, title = "Genes' clusters UMAP Plot", numNeighbors = 32L, minPointsDist = 0.25 ) plot(genesUmapPl) ## ----Create Uniform-Transcript clusterization, eval=FALSE, include=TRUE------- # # This code is a little too computationally heavy to be used in an example # # So we stored the result and we can load it in the next section # # # default constructed checker is OK # advChecker <- new("AdvancedGDIUniformityCheck") # # c(splitClusters, splitCoexDF) %<-% # cellsUniformClustering( # obj, # initialResolution = 0.8, # checker = advChecker, # optimizeForSpeed = TRUE, # deviceStr = "cuda", # cores = 6L, # genesSel = "HGDI", # saveObj = TRUE, # outDir = outDir # ) # # obj <- # addClusterization( # obj, # clName = "split", # clusters = splitClusters, # coexDF = splitCoexDF # ) # # table(splitClusters) ## ----Load pre-calculated clusterization, eval=TRUE, include=TRUE-------------- data("vignette.split.clusters", package = "COTAN") splitClusters <- vignette.split.clusters[getCells(obj)] splitCoexDF <- DEAOnClusters(obj, clusters = splitClusters) obj <- addClusterization( obj, clName = "split", clusters = splitClusters, coexDF = splitCoexDF, override = FALSE ) ## ----Extract clusterization summary, eval=TRUE, include=TRUE------------------ c(summaryData, summaryPlot) %<-% clustersSummaryPlot( obj, clName = "split", plotTitle = "split summary" ) summaryData ## ----Summary as plot, eval=TRUE, include=TRUE--------------------------------- plot(summaryPlot) ## ----More complete clusterization data visualization, eval=TRUE, include=TRUE---- c(splitHeatmap, scoreDF, pValueDF) %<-% clustersMarkersHeatmapPlot( obj, groupMarkers = layersGenes, clName = "split", kCuts = 5L, adjustmentMethod = "holm" ) ComplexHeatmap::draw(splitHeatmap) ## ----Merge potentially Uniform-Transcript clusters, eval=FALSE, include=TRUE---- # # default constructed checker is OK # advChecker <- new("AdvancedGDIUniformityCheck") # # c(mergedClusters, mergedCoexDF) %<-% # mergeUniformCellsClusters( # obj, # clusters = splitClusters, # checkers = advChecker, # optimizeForSpeed = TRUE, # deviceStr = "cuda", # cores = 6L, # saveObj = TRUE, # outDir = outDir # ) # # # merges are: # # 1 <- 06 + 07 # # 2 <- '-1' + 08 + 11 # # 3 <- 09 # # 4 <- 01 # # 5 <- 02 # # 6 <- 12 # # 7 <- 03 + 04 + 10 + 13 # # 8 <- 05 # # obj <- # addClusterization( # obj, # clName = "merge", # override = TRUE, # clusters = mergedClusters, # coexDF = mergedCoexDF # ) # # table(mergedClusters) ## ----Load pre-calculated merged clusterization, eval=FALSE, include=TRUE------ # # checkersList <- # list( # advChecker, # shiftCheckerThresholds(advChecker, 0.01), # shiftCheckerThresholds(advChecker, 0.03) # ) # # prevCheckRes <- data.frame() # # # In this case we want to re-use the already calculated merge checks # # Se we reload them from the output files. This, along a similar facility for # # the split method, is also useful in the cases the execution is interrupted # # prematurely... # # # if (TRUE) { # # read from the last file among those named all_check_results_XX.csv # mergeDir <- file.path(outDir, cond, "leafs_merge") # resFiles <- # list.files( # path = mergeDir, # pattern = "all_check_results_.*csv", # full.names = TRUE # ) # # prevCheckRes <- # read.csv(resFiles[length(resFiles)], header = TRUE, row.names = 1L) # } # # c(mergedClusters2, mergedCoexDF2) %<-% # mergeUniformCellsClusters( # obj, # clusters = splitClusters, # checkers = checkersList, # allCheckResults = prevCheckRes, # optimizeForSpeed = TRUE, # deviceStr = "cuda", # cores = 6L, # saveObj = TRUE, # outDir = outDir # ) # # # merges are: # # 1 <- '-1' + 06 + 09 # # 2 <- 07 # # 3 <- 03 + 04 + 10 + 13 # # 4 <- 12 # # 5 <- 01 + 08 + 11 # # 6 <- 02 + 05 # # obj <- # addClusterization( # obj, # clName = "merge2", # override = TRUE, # clusters = mergedClusters2, # coexDF = mergedCoexDF2 # ) # # table(mergedClusters2) ## ----Calculate and store DEA using clusterizations, eval=TRUE, include=TRUE---- data("vignette.merge.clusters", package = "COTAN") mergedClusters <- vignette.merge.clusters[getCells(obj)] mergedCoexDF <- DEAOnClusters(obj, clusters = mergedClusters) obj <- addClusterization( obj, clName = "merge", clusters = mergedClusters, coexDF = mergedCoexDF, override = FALSE ) data("vignette.merge2.clusters", package = "COTAN") mergedClusters2 <- vignette.merge2.clusters[getCells(obj)] mergedCoexDF2 <- DEAOnClusters(obj, clusters = mergedClusters2) obj <- addClusterization( obj, clName = "merge2", clusters = mergedClusters2, coexDF = mergedCoexDF2, override = FALSE ) table(mergedClusters2, mergedClusters) ## ----Clusterizations data visualization, eval=TRUE, include=TRUE-------------- c(mergeHeatmap, mergeScoresDF, mergePValuesDF) %<-% clustersMarkersHeatmapPlot( obj, clName = "merge", condNameList = "split", conditionsList = list(splitClusters) ) ComplexHeatmap::draw(mergeHeatmap) c(merge2Heatmap, merge2ScoresDF, merge2PValuesDF) %<-% clustersMarkersHeatmapPlot( obj, clName = "merge2", condNameList = "split", conditionsList = list(splitClusters) ) ComplexHeatmap::draw(merge2Heatmap) ## ----Clusterization UMAP plot using Dimensionality Reduction------------------ c(umapPlot, cellsRDM) %<-% cellsUMAPPlot( obj, clName = "merge2", clusters = NULL, useCoexEigen = TRUE, dataMethod = "LogLikelihood", numComp = 50L, genesSel = "HGDI", numGenes = 200L, colors = NULL, numNeighbors = 30L, minPointsDist = 0.3 ) plot(umapPlot) ## ----Clean-up----------------------------------------------------------------- if (file.exists(file.path(outDir, paste0(cond, ".cotan.RDS")))) { #Delete file if it exists file.remove(file.path(outDir, paste0(cond, ".cotan.RDS"))) } if (file.exists(file.path(outDir, paste0(cond, "_times.csv")))) { #Delete file if it exists file.remove(file.path(outDir, paste0(cond, "_times.csv"))) } if (dir.exists(file.path(outDir, cond))) { unlink(file.path(outDir, cond), recursive = TRUE) } if (dir.exists(file.path(outDir, GEO))) { unlink(file.path(outDir, GEO), recursive = TRUE) } # stop logging to file setLoggingFile("") file.remove(file.path(outDir, "vignette_v2.log")) options(prevOptState) ## ----Closing------------------------------------------------------------------ Sys.time() sessionInfo()