## ----echo=FALSE, message=FALSE, warning=FALSE------------------------------ suppressPackageStartupMessages({ library(AUCell) #library(Biobase) library(GSEABase) library(data.table) library(DT) library(NMF) library(plotly) library(GEOquery) # library(doMC);library(doRNG) # Loaded by AUCell, to avoid messages. Not available in windows? }) # To build a personalized report, update this working directory: dir.create("AUCell_tutorial") knitr::opts_knit$set(root.dir = 'AUCell_tutorial') ## ----Overview, eval=FALSE-------------------------------------------------- # library(AUCell) # cells_rankings <- AUCell_buildRankings(exprMatrix) # # genes <- c("gene1", "gene2", "gene3") # geneSets <- list(geneSet1=genes) # # geneSets <- GSEABase::GeneSet(genes, setName="geneSet1") # alternative # cells_AUC <- AUCell_calcAUC(geneSets, cells_rankings, aucMaxRank=nrow(cells_rankings)*0.05) # # par(mfrow=c(3,3)) # set.seed(123) # cells_assignment <- AUCell_exploreThresholds(cells_AUC, plotHist=TRUE, nCores=1, assign=TRUE) ## ----citation, echo=FALSE-------------------------------------------------- print(citation("AUCell")[1], style="textVersion") ## ----setup, eval=FALSE----------------------------------------------------- # if (!requireNamespace("BiocManager", quietly=TRUE)) # install.packages("BiocManager") # # To support paralell execution: # BiocManager::install(c("doMC", "doRNG")) # # For the main example: # BiocManager::install(c("mixtools", "GEOquery", "SummarizedExperiment")) # # For the examples in the follow-up section of the tutorial: # BiocManager::install(c("DT", "plotly", "NMF", "d3heatmap", "shiny", "rbokeh")) ## ----vignette, eval=FALSE-------------------------------------------------- # # Explore tutorials in the web browser: # browseVignettes(package="AUCell") # # # Commnad line-based: # vignette(package="AUCell") # list # vignette("AUCell") # open ## ----editRmd, eval=FALSE--------------------------------------------------- # vignetteFile <- paste(file.path(system.file('doc', package='AUCell')), "AUCell.Rmd", sep="/") # # Copy to edit as markdown # file.copy(vignetteFile, ".") # # Alternative: extract R code # Stangle(vignetteFile) ## ----setwd, eval=FALSE----------------------------------------------------- # dir.create("AUCell_tutorial") # setwd("AUCell_tutorial") # or in the first code chunk (kntr options), if running as Notebook ## ----loadingExprMat, eval=FALSE-------------------------------------------- # # i.e. Reading from a text file # exprMatrix <- read.table("myCountsMatrix.tsv") # exprMatrix <- as.matrix(exprMatrix) # # # or using an expression set # exprMatrix <- exprs(myExpressionSet) ## ----GEOdataset, cache=TRUE, results='hide', message=FALSE, eval=TRUE------ # (This may take a few minutes) library(GEOquery) attemptsLeft <- 20 while(attemptsLeft>0) { geoFile <- tryCatch(getGEOSuppFiles("GSE60361", makeDirectory=FALSE), error=identity) if("error" %in% class(geoFile)) { attemptsLeft <- attemptsLeft-1 Sys.sleep(5) } else attemptsLeft <- 0 } gzFile <- grep(".txt.gz", basename(rownames(geoFile)), fixed=TRUE, value=TRUE) message(gzFile) txtFile <- gsub(".gz", "", gzFile, fixed=TRUE) message(txtFile) gunzip(filename=gzFile, destname=txtFile, remove=TRUE) library(data.table) geoData <- fread(txtFile, sep="\t") geneNames <- unname(unlist(geoData[,1, with=FALSE])) exprMatrix <- as.matrix(geoData[,-1, with=FALSE]) rm(geoData) dim(exprMatrix) rownames(exprMatrix) <- geneNames exprMatrix[1:5,1:4] # Remove file file.remove(txtFile) # Save for future use mouseBrainExprMatrix <- exprMatrix save(mouseBrainExprMatrix, file="exprMatrix_AUCellVignette_MouseBrain.RData") ## ----randomSamples--------------------------------------------------------- # load("exprMatrix_AUCellVignette_MouseBrain.RData") set.seed(333) exprMatrix <- mouseBrainExprMatrix[sample(rownames(mouseBrainExprMatrix), 5000),] ## ----dimExprMat------------------------------------------------------------ dim(exprMatrix) ## ----geneSetsFake---------------------------------------------------------- library(GSEABase) genes <- c("gene1", "gene2", "gene3") geneSets <- GeneSet(genes, setName="geneSet1") geneSets ## ----geneSets-------------------------------------------------------------- library(AUCell) library(GSEABase) gmtFile <- paste(file.path(system.file('examples', package='AUCell')), "geneSignatures.gmt", sep="/") geneSets <- getGmt(gmtFile) ## ----geneSetsNgenes-------------------------------------------------------- geneSets <- subsetGeneSets(geneSets, rownames(exprMatrix)) cbind(nGenes(geneSets)) ## ----geneSetsRename-------------------------------------------------------- geneSets <- setGeneSetNames(geneSets, newNames=paste(names(geneSets), " (", nGenes(geneSets) ,"g)", sep="")) ## ----hkGs------------------------------------------------------------------ # Random set.seed(321) extraGeneSets <- c( GeneSet(sample(rownames(exprMatrix), 50), setName="Random (50g)"), GeneSet(sample(rownames(exprMatrix), 500), setName="Random (500g)")) countsPerGene <- apply(exprMatrix, 1, function(x) sum(x>0)) # Housekeeping-like extraGeneSets <- c(extraGeneSets, GeneSet(sample(names(countsPerGene)[which(countsPerGene>quantile(countsPerGene, probs=.95))], 100), setName="HK-like (100g)")) geneSets <- GeneSetCollection(c(geneSets,extraGeneSets)) names(geneSets) ## ----buildRankings, cache=TRUE, fig.width=5, fig.height=5------------------ cells_rankings <- AUCell_buildRankings(exprMatrix, nCores=1, plotStats=TRUE) ## -------------------------------------------------------------------------- cells_rankings ## ----saveRankings, eval=FALSE---------------------------------------------- # save(cells_rankings, file="cells_rankings.RData") ## ----explainAUC, echo=FALSE------------------------------------------------ geneSet <- geneSets[[1]] gSetRanks <- cells_rankings[geneIds(geneSet),] par(mfrow=c(1,2)) set.seed(222) aucMaxRank <- nrow(cells_rankings)*0.05 na <- sapply(sample(1:3005, 2), function(i){ x <- sort(getRanking(gSetRanks[,i])) aucCurve <- cbind(c(0, x, nrow(cells_rankings)), c(0:length(x), length(x))) op <- par(mar=c(5, 6, 4, 2) + 0.1) plot(aucCurve, type="s", col="darkblue", lwd=1, xlab="Gene rank", ylab="# genes in the gene set \n Gene set: Astrocyte markers", xlim=c(0, aucMaxRank*2), ylim=c(0, nGenes(geneSet)*.20), main="Recovery curve", sub=paste("Cell:", colnames(gSetRanks)[i])) aucShade <- aucCurve[which(aucCurve[,1] < aucMaxRank),] aucShade <- rbind(aucShade, c(aucMaxRank, nrow(aucShade))) aucShade[,1] <- aucShade[,1]-1 aucShade <- rbind(aucShade, c(max(aucShade),0)) polygon(aucShade, col="#0066aa40", border=FALSE) abline(v=aucMaxRank, lty=2) text(aucMaxRank-50, 5, "AUC") }) ## ----calcAUC, cache=TRUE, warning=FALSE------------------------------------ cells_AUC <- AUCell_calcAUC(geneSets, cells_rankings) save(cells_AUC, file="cells_AUC.RData") ## ----exploreThresholds, warning=FALSE, fig.width=7, fig.height=7----------- set.seed(123) par(mfrow=c(3,3)) cells_assignment <- AUCell_exploreThresholds(cells_AUC, plotHist=TRUE, assign=TRUE) warningMsg <- sapply(cells_assignment, function(x) x$aucThr$comment) warningMsg[which(warningMsg!="")] ## ----explThr1-------------------------------------------------------------- cells_assignment$Oligodendrocyte_Cahoy$aucThr$thresholds ## ----explThr2-------------------------------------------------------------- cells_assignment$Oligodendrocyte_Cahoy$aucThr$selected # getThresholdSelected(cells_assignment) ## ----cellsAssigned--------------------------------------------------------- oligodencrocytesAssigned <- cells_assignment$Oligodendrocyte_Cahoy$assignment length(oligodencrocytesAssigned) head(oligodencrocytesAssigned) ## ----AUCell_plot----------------------------------------------------------- geneSetName <- rownames(cells_AUC)[grep("Oligodendrocyte_Cahoy", rownames(cells_AUC))] AUCell_plotHist(cells_AUC[geneSetName,], aucThr=0.25) abline(v=0.25) ## ----explThr3-------------------------------------------------------------- newSelectedCells <- names(which(getAUC(cells_AUC)[geneSetName,]>0.08)) length(newSelectedCells) head(newSelectedCells) ## ----explAssignment-------------------------------------------------------- cellsAssigned <- lapply(cells_assignment, function(x) x$assignment) assignmentTable <- reshape2::melt(cellsAssigned, value.name="cell") colnames(assignmentTable)[2] <- "geneSet" head(assignmentTable) ## ----assignmentMat--------------------------------------------------------- assignmentMat <- table(assignmentTable[,"geneSet"], assignmentTable[,"cell"]) assignmentMat[,1:2] ## ----assignHeatmap--------------------------------------------------------- set.seed(123) miniAssigMat <- assignmentMat[,sample(1:ncol(assignmentMat),100)] library(NMF) aheatmap(miniAssigMat, scale="none", color="black", legend=FALSE) ## ----assignHeatmap_interactive, eval=FALSE--------------------------------- # library(d3heatmap) # d3heatmap(matrix(miniAssigMat), scale="none", colors=c("white", "black")) # # library(DT) # datatable(assignmentTable, options = list(pageLength = 10), filter="top") ## ----loadtSNE, fig.width=4, fig.height=4----------------------------------- # Load the tSNE (included in the package) load(paste(file.path(system.file('examples', package='AUCell')), "cellsTsne.RData", sep="/")) cellsTsne <- cellsTsne$Y plot(cellsTsne, pch=16, cex=.3) ## ----runTsne, eval=FALSE--------------------------------------------------- # load("exprMatrix_AUCellVignette_MouseBrain.RData") # sumByGene <- apply(mouseBrainExprMatrix, 1, sum) # exprMatSubset <- mouseBrainExprMatrix[which(sumByGene>0),] # logMatrix <- log2(exprMatSubset+1) # # library(Rtsne) # set.seed(123) # cellsTsne <- Rtsne(t(logMatrix)) # rownames(cellsTsne$Y) <- colnames(logMatrix) # colnames(cellsTsne$Y) <- c("tsne1", "tsne2") # save(cellsTsne, file="cellsTsne.RData") ## ----plotTsneCode, fig.width=7, fig.height=6------------------------------- selectedThresholds <- getThresholdSelected(cells_assignment) par(mfrow=c(2,3)) # Splits the plot into two rows and three columns for(geneSetName in names(selectedThresholds)) { nBreaks <- 5 # Number of levels in the color palettes # Color palette for the cells that do not pass the threshold colorPal_Neg <- grDevices::colorRampPalette(c("black","blue", "skyblue"))(nBreaks) # Color palette for the cells that pass the threshold colorPal_Pos <- grDevices::colorRampPalette(c("pink", "magenta", "red"))(nBreaks) # Split cells according to their AUC value for the gene set passThreshold <- getAUC(cells_AUC)[geneSetName,] > selectedThresholds[geneSetName] if(sum(passThreshold) >0 ) { aucSplit <- split(getAUC(cells_AUC)[geneSetName,], passThreshold) # Assign cell color cellColor <- c(setNames(colorPal_Neg[cut(aucSplit[[1]], breaks=nBreaks)], names(aucSplit[[1]])), setNames(colorPal_Pos[cut(aucSplit[[2]], breaks=nBreaks)], names(aucSplit[[2]]))) # Plot plot(cellsTsne, main=geneSetName, sub="Pink/red cells pass the threshold", col=cellColor[rownames(cellsTsne)], pch=16) } } ## ----tsneThreshold--------------------------------------------------------- selectedThresholds[2] <- 0.25 par(mfrow=c(2,3)) AUCell_plotTSNE(tSNE=cellsTsne, exprMat=exprMatrix, cellsAUC=cells_AUC[1:2,], thresholds=selectedThresholds) ## ----eval=FALSE------------------------------------------------------------ # library(shiny); library(rbokeh) # exprMat <- log2(mouseBrainExprMatrix+1) # (back to the full matrix) # # # Create app # aucellApp <- AUCell_createViewerApp(auc=cells_AUC, thresholds=selectedThresholds, # tSNE=cellsTsne, exprMat=exprMat) # # # Run (the exact commands depend on the R settings, see Shiny's doc for help) # options(shiny.host="0.0.0.0") # savedSelections <- runApp(aucellApp) # # # Other common settings: # # host = getOption("shiny.host", "127.0.0.1") # # shinyApp(ui=aucellApp$ui, server=aucellApp$server) # # # Optional: Visualize extra information about the cells (categorical variable) # cellInfoFile <- paste(file.path(system.file('examples', package='AUCell')), # "mouseBrain_cellLabels.tsv", sep="/") # cellInfo <- read.table(cellInfoFile, row.names=1, header=TRUE, sep="\t") # head(cellInfo) ## ----tSNE_interactive, eval=FALSE------------------------------------------ # geneSetName <- "Astrocyte_Cahoy (555g)" # # library(rbokeh) # tSNE.df <- data.frame(cellsTsne, cell=rownames(cellsTsne)) # figure() %>% # ly_points(tsne1, tsne2, data=tSNE.df, size=1, legend = FALSE, # hover=cell, color=getAUC(cells_AUC)[geneSetName,rownames(tSNE.df)]) %>% # set_palette(continuous_color = pal_gradient(c("lightgrey", "pink", "red"))) ## -------------------------------------------------------------------------- logMat <- log2(exprMatrix+2) meanByGs <- t(sapply(geneSets, function(gs) colMeans(logMat[geneIds(gs),]))) rownames(meanByGs) <- names(geneSets) ## ----meanTsne, fig.width=7, fig.height=8----------------------------------- colorPal <- grDevices::colorRampPalette(c("black", "red"))(nBreaks) par(mfrow=c(3,3)) for(geneSetName in names(geneSets)) { cellColor <- setNames(colorPal[cut(meanByGs[geneSetName,], breaks=nBreaks)], names(meanByGs[geneSetName,])) plot(cellsTsne, main=geneSetName, axes=FALSE, xlab="", ylab="", sub="Expression mean", col=cellColor[rownames(cellsTsne)], pch=16) } AUCell_plotTSNE(tSNE=cellsTsne, exprMat=exprMatrix, plots = "AUC", cellsAUC=cells_AUC[1:9,], thresholds=selectedThresholds) ## ---- fig.height=4, fig.width=3.5------------------------------------------ nGenesPerCell <- apply(exprMatrix, 2, function(x) sum(x>0)) colorPal <- grDevices::colorRampPalette(c("darkgreen", "yellow","red")) cellColorNgenes <- setNames(adjustcolor(colorPal(10), alpha.f=.8)[as.numeric(cut(nGenesPerCell,breaks=10, right=FALSE,include.lowest=TRUE))], names(nGenesPerCell)) plot(cellsTsne, axes=FALSE, xlab="", ylab="", sub="Number of detected genes", col=cellColorNgenes[rownames(cellsTsne)], pch=16) ## -------------------------------------------------------------------------- # "Real" cell type (e.g. provided in the publication) cellLabels <- paste(file.path(system.file('examples', package='AUCell')), "mouseBrain_cellLabels.tsv", sep="/") cellLabels <- read.table(cellLabels, row.names=1, header=TRUE, sep="\t") ## -------------------------------------------------------------------------- # Confusion matrix: cellTypeNames <- unique(cellLabels[,"level1class"]) confMatrix <- t(sapply(cells_assignment[c(1,4,2,5,3,6:9)], function(x) table(cellLabels[x$assignment,])[cellTypeNames])) colnames(confMatrix) <- cellTypeNames confMatrix[which(is.na(confMatrix), arr.ind=TRUE)] <- 0 confMatrix ## -------------------------------------------------------------------------- date() sessionInfo()