## ----style, echo=FALSE, results='hide', message=FALSE---------------------- library(BiocStyle) library(knitr) opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE) opts_chunk$set(fig.asp=1) ## -------------------------------------------------------------------------- library(BiocFileCache) bfc <- BiocFileCache("raw_data", ask = FALSE) grun.fname <- bfcrpath(bfc, file.path("ftp://ftp.ncbi.nlm.nih.gov/geo/series", "GSE81nnn/GSE81076/suppl/GSE81076%5FD2%5F3%5F7%5F10%5F17%2Etxt%2Egz")) ## -------------------------------------------------------------------------- gse81076.df <- read.table(grun.fname, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1) dim(gse81076.df) ## -------------------------------------------------------------------------- donor.names <- sub("^(D[0-9]+).*", "\\1", colnames(gse81076.df)) table(donor.names) plate.id <- sub("^D[0-9]+(.*)_.*", "\\1", colnames(gse81076.df)) table(plate.id) ## -------------------------------------------------------------------------- gene.symb <- gsub("__chr.*$", "", rownames(gse81076.df)) is.spike <- grepl("^ERCC-", gene.symb) table(is.spike) library(org.Hs.eg.db) gene.ids <- mapIds(org.Hs.eg.db, keys=gene.symb, keytype="SYMBOL", column="ENSEMBL") gene.ids[is.spike] <- gene.symb[is.spike] keep <- !is.na(gene.ids) & !duplicated(gene.ids) gse81076.df <- gse81076.df[keep,] rownames(gse81076.df) <- gene.ids[keep] summary(keep) ## -------------------------------------------------------------------------- library(SingleCellExperiment) sce.gse81076 <- SingleCellExperiment(list(counts=as.matrix(gse81076.df)), colData=DataFrame(Donor=donor.names, Plate=plate.id), rowData=DataFrame(Symbol=gene.symb[keep])) isSpike(sce.gse81076, "ERCC") <- grepl("^ERCC-", rownames(gse81076.df)) sce.gse81076 ## -------------------------------------------------------------------------- library(scater) sce.gse81076 <- calculateQCMetrics(sce.gse81076, compact=TRUE) QC <- sce.gse81076$scater_qc low.lib <- isOutlier(QC$all$log10_total_counts, type="lower", nmad=3) low.genes <- isOutlier(QC$all$log10_total_features_by_counts, type="lower", nmad=3) high.spike <- isOutlier(QC$feature_control_ERCC$pct_counts, type="higher", nmad=3) data.frame(LowLib=sum(low.lib), LowNgenes=sum(low.genes), HighSpike=sum(high.spike, na.rm=TRUE)) ## -------------------------------------------------------------------------- discard <- low.lib | low.genes | high.spike sce.gse81076 <- sce.gse81076[,!discard] summary(discard) ## -------------------------------------------------------------------------- library(scran) set.seed(1000) clusters <- quickCluster(sce.gse81076, method="igraph", min.mean=0.1) table(clusters) sce.gse81076 <- computeSumFactors(sce.gse81076, min.mean=0.1, clusters=clusters) summary(sizeFactors(sce.gse81076)) ## -------------------------------------------------------------------------- sce.gse81076 <- computeSpikeFactors(sce.gse81076, general.use=FALSE) summary(sizeFactors(sce.gse81076, "ERCC")) ## -------------------------------------------------------------------------- sce.gse81076 <- normalize(sce.gse81076) ## ----var-gse81076, fig.cap="Variance of normalized log-expression values for each gene in the GSE81076 dataset, plotted against the mean log-expression. The blue line represents the mean-dependent trend fitted to the variances of the spike-in transcripts (red)."---- block <- paste0(sce.gse81076$Plate, "_", sce.gse81076$Donor) fit <- trendVar(sce.gse81076, block=block, parametric=TRUE) dec <- decomposeVar(sce.gse81076, fit) plot(dec$mean, dec$total, xlab="Mean log-expression", ylab="Variance of log-expression", pch=16) is.spike <- isSpike(sce.gse81076) points(dec$mean[is.spike], dec$total[is.spike], col="red", pch=16) curve(fit$trend(x), col="dodgerblue", add=TRUE) ## -------------------------------------------------------------------------- dec.gse81076 <- dec dec.gse81076$Symbol <- rowData(sce.gse81076)$Symbol dec.gse81076 <- dec.gse81076[order(dec.gse81076$bio, decreasing=TRUE),] head(dec.gse81076) ## ---- echo=FALSE, results="hide"------------------------------------------- rm(gse81076.df) gc() ## -------------------------------------------------------------------------- muraro.fname <- bfcrpath(bfc, file.path("ftp://ftp.ncbi.nlm.nih.gov/geo/series", "GSE85nnn/GSE85241/suppl", "GSE85241%5Fcellsystems%5Fdataset%5F4donors%5Fupdated%2Ecsv%2Egz")) ## -------------------------------------------------------------------------- gse85241.df <- read.table(muraro.fname, sep='\t', header=TRUE, row.names=1, stringsAsFactors=FALSE) dim(gse85241.df) ## -------------------------------------------------------------------------- donor.names <- sub("^(D[0-9]+).*", "\\1", colnames(gse85241.df)) table(donor.names) plate.id <- sub("^D[0-9]+\\.([0-9]+)_.*", "\\1", colnames(gse85241.df)) table(plate.id) ## -------------------------------------------------------------------------- gene.symb <- gsub("__chr.*$", "", rownames(gse85241.df)) is.spike <- grepl("^ERCC-", gene.symb) table(is.spike) library(org.Hs.eg.db) gene.ids <- mapIds(org.Hs.eg.db, keys=gene.symb, keytype="SYMBOL", column="ENSEMBL") gene.ids[is.spike] <- gene.symb[is.spike] keep <- !is.na(gene.ids) & !duplicated(gene.ids) gse85241.df <- gse85241.df[keep,] rownames(gse85241.df) <- gene.ids[keep] summary(keep) ## -------------------------------------------------------------------------- sce.gse85241 <- SingleCellExperiment(list(counts=as.matrix(gse85241.df)), colData=DataFrame(Donor=donor.names, Plate=plate.id), rowData=DataFrame(Symbol=gene.symb[keep])) isSpike(sce.gse85241, "ERCC") <- grepl("^ERCC-", rownames(gse85241.df)) sce.gse85241 ## -------------------------------------------------------------------------- sce.gse85241 <- calculateQCMetrics(sce.gse85241, compact=TRUE) QC <- sce.gse85241$scater_qc low.lib <- isOutlier(QC$all$log10_total_counts, type="lower", nmad=3) low.genes <- isOutlier(QC$all$log10_total_features_by_counts, type="lower", nmad=3) high.spike <- isOutlier(QC$feature_control_ERCC$pct_counts, type="higher", nmad=3) data.frame(LowLib=sum(low.lib), LowNgenes=sum(low.genes), HighSpike=sum(high.spike, na.rm=TRUE)) ## -------------------------------------------------------------------------- discard <- low.lib | low.genes | high.spike sce.gse85241 <- sce.gse85241[,!discard] summary(discard) ## -------------------------------------------------------------------------- set.seed(1000) clusters <- quickCluster(sce.gse85241, min.mean=0.1, method="igraph") table(clusters) sce.gse85241 <- computeSumFactors(sce.gse85241, min.mean=0.1, clusters=clusters) summary(sizeFactors(sce.gse85241)) sce.gse85241 <- computeSpikeFactors(sce.gse85241, general.use=FALSE) summary(sizeFactors(sce.gse85241, "ERCC")) sce.gse85241 <- normalize(sce.gse85241) ## ----var-gse85241, fig.cap="Variance of normalized log-expression values for each gene in the GSE85241 dataset, plotted against the mean log-expression. The blue line represents the mean-dependent trend fitted to the variances of the spike-in transcripts (red)."---- block <- paste0(sce.gse85241$Plate, "_", sce.gse85241$Donor) fit <- trendVar(sce.gse85241, block=block, parametric=TRUE) dec <- decomposeVar(sce.gse85241, fit) plot(dec$mean, dec$total, xlab="Mean log-expression", ylab="Variance of log-expression", pch=16) is.spike <- isSpike(sce.gse85241) points(dec$mean[is.spike], dec$total[is.spike], col="red", pch=16) curve(fit$trend(x), col="dodgerblue", add=TRUE) ## -------------------------------------------------------------------------- dec.gse85241 <- dec dec.gse85241$Symbol <- rowData(sce.gse85241)$Symbol dec.gse85241 <- dec.gse85241[order(dec.gse85241$bio, decreasing=TRUE),] head(dec.gse85241) ## ---- echo=FALSE, results="hide"------------------------------------------- rm(gse85241.df) gc() ## -------------------------------------------------------------------------- universe <- intersect(rownames(dec.gse85241), rownames(dec.gse81076)) mean.bio <- (dec.gse85241[universe,"bio"] + dec.gse81076[universe,"bio"])/2 chosen <- universe[mean.bio > 0] length(chosen) ## -------------------------------------------------------------------------- rescaled <- multiBatchNorm(sce.gse85241[universe,], sce.gse81076[universe,]) rescaled.gse85241 <- rescaled[[1]] rescaled.gse81076 <- rescaled[[2]] ## -------------------------------------------------------------------------- set.seed(100) original <- list( GSE81076=logcounts(rescaled.gse81076)[chosen,], GSE85241=logcounts(rescaled.gse85241)[chosen,] ) # Slightly convoluted call to avoid re-writing code later. # Equivalent to fastMNN(GSE81076, GSE85241, k=20, d=50, approximate=TRUE) mnn.out <- do.call(fastMNN, c(original, list(k=20, d=50, approximate=TRUE))) dim(mnn.out$corrected) ## -------------------------------------------------------------------------- mnn.out$batch ## -------------------------------------------------------------------------- mnn.out$pairs ## -------------------------------------------------------------------------- omat <- do.call(cbind, original) sce <- SingleCellExperiment(list(logcounts=omat)) reducedDim(sce, "MNN") <- mnn.out$corrected sce$Batch <- as.character(mnn.out$batch) sce ## ----tsne-batch, fig.width=10, fig.asp=0.6, fig.cap="t-SNE plots of the pancreas datasets, before and after MNN correction. Each point represents a cell and is coloured by the batch of origin."---- set.seed(100) # Using irlba to set up the t-SNE, for speed. osce <- runPCA(sce, ntop=Inf, method="irlba") osce <- runTSNE(osce, use_dimred="PCA") ot <- plotTSNE(osce, colour_by="Batch") + ggtitle("Original") set.seed(100) csce <- runTSNE(sce, use_dimred="MNN") ct <- plotTSNE(csce, colour_by="Batch") + ggtitle("Corrected") multiplot(ot, ct, cols=2) ## ----tsne-markers, fig.width=10, fig.height=10, fig.cap="t-SNE plots after MNN correction, where each point represents a cell and is coloured by its corrected expression of key marker genes for known cell types in the pancreas."---- ct.gcg <- plotTSNE(csce, colour_by="ENSG00000115263") + ggtitle("Alpha cells (GCG)") ct.ins <- plotTSNE(csce, colour_by="ENSG00000254647") + ggtitle("Beta cells (INS)") ct.sst <- plotTSNE(csce, colour_by="ENSG00000157005") + ggtitle("Delta cells (SST)") ct.ppy <- plotTSNE(csce, colour_by="ENSG00000108849") + ggtitle("PP cells (PPY)") multiplot(ct.gcg, ct.ins, ct.sst, ct.ppy, cols=2) ## -------------------------------------------------------------------------- snn.gr <- buildSNNGraph(sce, use.dimred="MNN") clusters <- igraph::cluster_walktrap(snn.gr) table(clusters$membership, sce$Batch) ## ----tsne-cluster, fig.cap="t-SNE plot after MMN correction, where each point represents a cell and is coloured by its cluster identity."---- csce$Cluster <- factor(clusters$membership) plotTSNE(csce, colour_by="Cluster") ## -------------------------------------------------------------------------- m.out <- findMarkers(sce, clusters$membership, block=sce$Batch, direction="up") demo <- m.out[["4"]] # looking at cluster 4 (probably alpha cells). demo <- demo[demo$Top <= 5,] library(org.Hs.eg.db) data.frame(row.names=rownames(demo), Symbol=mapIds(org.Hs.eg.db, keytype="ENSEMBL", keys=rownames(demo), column="SYMBOL"), Top=demo$Top, FDR=demo$FDR) ## ---- echo=FALSE, results="hide"------------------------------------------- # Checking that cluster 4 is what we think it is. stopifnot(all(sapply(demo["ENSG00000115263",-(1:3)], sign)==1)) ## -------------------------------------------------------------------------- # Setting up the design matrix (we remove intercept for full rank # in the final design matrix with the cluster-specific terms). design <- model.matrix(~sce$Batch) design <- design[,-1,drop=FALSE] m.alt <- findMarkers(sce, clusters$membership, design=design, direction="up") demo <- m.alt[["4"]] demo <- demo[demo$Top <= 5,] data.frame(row.names=rownames(demo), Symbol=mapIds(org.Hs.eg.db, keytype="ENSEMBL", keys=rownames(demo), column="SYMBOL"), Top=demo$Top, FDR=demo$FDR) ## -------------------------------------------------------------------------- sessionInfo()