## ---- echo=TRUE---------------------------------------------------------- library(Linnorm) data(Islam2011) #Do not filter gene list: Transformed <- Linnorm(Islam2011) #Filter low count genes FTransformed <- Linnorm(Islam2011, keepAll=FALSE) ## ---- echo=TRUE---------------------------------------------------------- #You can use this file with Excel. #This file includes all genes. write.table(Transformed, "Transformed.txt", quote=FALSE, sep="\t", col.names=TRUE, row.names=TRUE) #This file filtered low count genes. write.table(FTransformed, "Transformed.txt", quote=FALSE, sep="\t", col.names=TRUE, row.names=TRUE) ## ---- echo=TRUE---------------------------------------------------------- library(Linnorm) data(LIHC) datamatrix <- LIHC ## ---- echo=TRUE---------------------------------------------------------- #10 samples for condition 1 and 10 samples for condition 2. #You might need to edit this line design <- c(rep(1,10),rep(2,10)) #There lines can be copied directly. design <- model.matrix(~ 0+factor(design)) colnames(design) <- c("group1", "group2") rownames(design) <- colnames(datamatrix) ## ---- echo=TRUE---------------------------------------------------------- library(Linnorm) #The Linnorm-limma pipeline only consists of one line. DEG_Results <- Linnorm.limma(datamatrix,design) #The DEG_Results matrix contains your DEG analysis results. ## ---- echo=TRUE---------------------------------------------------------- library(Linnorm) #Just add output="Both" into the argument list BothResults <- Linnorm.limma(datamatrix,design,output="Both") #To separate results into two matrices: DEG_Results <- BothResults$DEResults TransformedMatrix <- BothResults$Linnorm #The DEG_Results matrix now contains DEG analysis results. #The TransformedMatrix matrix now contains a Linnorm #Transformed dataset. ## ------------------------------------------------------------------------ write.table(DEG_Results, "DEG_Results.txt", quote=FALSE, sep="\t", col.names=TRUE,row.names=TRUE) ## ---- echo=TRUE---------------------------------------------------------- Genes10 <- DEG_Results[order(DEG_Results[,"adj.P.Val"]),][1:10,] #Users can print the gene list by the following command: #print(Genes10) #logFC: log 2 fold change of the gene. #XPM: If input is raw count or CPM, XPM is CPM. # If input is RPKM, FPKM or TPM, XPM is TPM. #t: moderated t statistic. #P.Value: p value. #adj.P.Val: Adjusted p value. This is also called False Discovery Rate or q value.} #B: log odds that the feature is differential. ## ----kable1, echo=FALSE-------------------------------------------------- library(knitr) kable(Genes10, digits=4) ## ---- echo=TRUE---------------------------------------------------------- NoINF <- DEG_Results[which(!is.infinite(DEG_Results[,1])),] ## ---- echo=TRUE---------------------------------------------------------- SignificantGenes <- NoINF[NoINF[,5] <= 0.05,1] ## ---- echo=TRUE, fig.height=6, fig.width=6------------------------------- plot(x=NoINF[,1], y=NoINF[,5], col=ifelse(NoINF[,1] %in% SignificantGenes, "red", "green"),log="y", ylim = rev(range(NoINF[,5])), main="Volcano Plot", xlab="log2 Fold Change", ylab="q values", cex = 0.5) ## ---- echo=TRUE---------------------------------------------------------- library(Linnorm) data(Islam2011) IslamData <- Islam2011[,1:92] ## ---- echo=TRUE---------------------------------------------------------- #48 samples for condition 1 and 44 samples for condition 2. #You might need to edit this line design <- c(rep(1,48),rep(2,44)) #There lines can be copied directly. design <- model.matrix(~ 0+factor(design)) colnames(design) <- c("group1", "group2") rownames(design) <- colnames(IslamData) ## ---- echo=TRUE---------------------------------------------------------- #Example 1: Filter low count genes. #and genes with high technical noise. scRNAseqResults <- Linnorm.limma(IslamData,design,keepAll=FALSE) #Example 2: Do not filter gene list. scRNAseqResults <- Linnorm.limma(IslamData,design) ## ---- echo=TRUE---------------------------------------------------------- library(Linnorm) data(Islam2011) ## ---- echo=TRUE---------------------------------------------------------- PCA.results <- Linnorm.PCA(Islam2011[,1:92]) ## ---- echo=TRUE, fig.height=6, fig.width=6------------------------------- #Here, we can see multiple clusters. print(PCA.results$plot$plot) ## ---- echo=TRUE---------------------------------------------------------- #The first 48 samples belong to mouse embryonic stem cells. Groups <- rep("ES_Cells",48) #The next 44 samples are mouse embryonic fibroblasts. Groups <- c(Groups, rep("EF_Cells",44)) ## ---- echo=TRUE---------------------------------------------------------- #Useful arguments: #Group: #allows user to provide each sample's information. #num_center: #how many clusters are supposed to be there. #num_PC #how many principal components should be used in k-means #clustering. PCA.results <- Linnorm.PCA(Islam2011[,1:92], Group=Groups, num_center=2, num_PC=3) ## ---- echo=TRUE, fig.height=6, fig.width=6------------------------------- #Here, we can see two clusters. print(PCA.results$plot$plot) ## ---- echo=TRUE---------------------------------------------------------- data(Islam2011) #Obtain embryonic stem cell data datamatrix <- Islam2011[,1:48] ## ---- echo=TRUE---------------------------------------------------------- #Setting plotNetwork to TRUE will create a figure file in your current directory. #Setting it to FALSE will stop it from creating a figure file, but users can plot it #manually later using the igraph object in the output. #For this vignette, we will plot it manually in step 4. results <- Linnorm.Cor(datamatrix, plotNetwork=FALSE, #Edge color when correlation is positive plot.Pos.cor.col="red", #Edge color when correlation is negative plot.Neg.cor.col="green") ## ---- echo=TRUE---------------------------------------------------------- Genes10 <- results$Results[order(results$Results[,5]),][1:10,] #Users can print the gene list by the following command: #print(Genes10) ## ----kable2, echo=FALSE-------------------------------------------------- library(knitr) kable(Genes10, digits=4, row.names=FALSE) ## ---- echo=TRUE, fig.height=6, fig.width=6------------------------------- library(igraph) Thislayout <- layout_with_kk(results$igraph) plot(results$igraph, #Vertex size vertex.size=2, #Vertex color, based on clustering results vertex.color=results$Cluster$Cluster, #Vertex frame color vertex.frame.color="transparent", #Vertex label color (the gene names) vertex.label.color="black", #Vertex label size vertex.label.cex=0.05, #This is how much the edges should be curved. edge.curved=0.1, #Edge width edge.width=0.05, #Use the layout created previously layout=Thislayout ) ## ---- echo=TRUE---------------------------------------------------------- TheCluster <- which(results$Cluster[,1] == "Mmp2") ## ---- echo=TRUE---------------------------------------------------------- #Index of the genes ListOfGenes <- which(results$Cluster[,2] == TheCluster) #Names of the genes GeneNames <- results$Cluster[ListOfGenes,1] #Users can write these genes into a file for further analysis. ## ---- echo=TRUE---------------------------------------------------------- top100 <- results$Results[order(results$Results[,4],decreasing=FALSE)[1:100],1] ## ---- echo=TRUE---------------------------------------------------------- Top100.Cor.Matrix <- results$Cor.Matrix[top100,top100] ## ---- echo=TRUE, fig.height=6, fig.width=6------------------------------- library(RColorBrewer) library(gplots) heatmap.2(as.matrix(Top100.Cor.Matrix), #Hierarchical clustering on both row and column Rowv=TRUE, Colv=TRUE, #Center white color at correlation 0 symbreaks=TRUE, #Turn off level trace trace="none", #x and y axis labels xlab = 'Genes', ylab = "Genes", #Turn off density info density.info='none', #Control color key.ylab=NA, col=colorRampPalette(c("blue", "white", "yellow"))(n = 1000), lmat=rbind(c(4, 3), c(2, 1)), #Gene name font size cexRow=0.3, cexCol=0.3, #Margins margins = c(8, 8) ) ## ---- echo=TRUE---------------------------------------------------------- data(Islam2011) #Identify spike in genes: SPIKEIN <- rownames(Islam2011)[c(grep("Ppia",rownames(Islam2011)), grep("H2afz",rownames(Islam2011)),grep("Hprt1",rownames(Islam2011)))] ## ---- echo=TRUE---------------------------------------------------------- #The first 48 columns are the embryonic stem cells. results <- Linnorm.HVar(Islam2011[,1:48], spikein=SPIKEIN) ## ---- echo=TRUE---------------------------------------------------------- resultsdata <- results$Results Genes10 <- resultsdata[order(resultsdata[,"q.value"]),][1:10,3:5] #Users can print the gene list by the following command: #print(Genes10) ## ----kable3, echo=FALSE-------------------------------------------------- library(knitr) kable(Genes10, digits=4) ## ---- echo=TRUE, fig.height=6, fig.width=6------------------------------- print(results$plot$plot) ## ---- echo=TRUE---------------------------------------------------------- data(Islam2011) Islam <- Islam2011[,1:92] ## ---- echo=TRUE---------------------------------------------------------- #48 ESC, 44 EF, and 4 NegCtrl Group <- c(rep("ESC",48),rep("EF",44)) colnames(Islam) <- paste(colnames(Islam),Group,sep="_") ## ---- echo=TRUE---------------------------------------------------------- #Note that there are 3 known clusters. HClust.Results <- Linnorm.HClust(Islam,Group=Group, num_Clust=2, fontsize=1.5) ## ---- echo=TRUE, fig.height=6, fig.width=6------------------------------- print(HClust.Results$plot$plot) ## ---- echo=TRUE---------------------------------------------------------- library(Linnorm) data(LIHC) ## ---- echo=TRUE---------------------------------------------------------- #Now, we can calculate fold changes between #sample set 1 and sample set 2. #Index of sample set 1 set1 <- 1:10 #Index of sample set 2 set2 <- 11:20 #Define a function that calculates log 2 fold change from TPM + 1: log2fc <- function(x) { return(log(mean(x[set1] + 1)/mean(x[set2] + 1),2)) } #Calculate log 2 fold change of each gene in the dataset: foldchanges <- unlist(apply(LIHC,1,log2fc)) #Put resulting data into a matrix FCMatrix <- matrix(foldchanges, ncol=1) rownames(FCMatrix) <- rownames(LIHC) colnames(FCMatrix) <- c("Log 2 Fold Change") #Remove Infinite values. FCMatrix <- FCMatrix[!is.infinite(foldchanges),,drop=FALSE] #Now FCMatrix contains fold change results. ## ---- echo=TRUE, fig.height=6, fig.width=6------------------------------- Density <- density(FCMatrix[,1]) plot(Density$x,Density$y,type="n",xlab="Log 2 Fold Change", ylab="Probability Density",) lines(Density$x,Density$y, lwd=1.5, col="blue") title("Probability Density of Fold Change.\nTCGA Partial LIHC data Paired Tumor vs Adjacent Normal") legend("topright",legend=paste("mean = ", round(mean(FCMatrix[,1]),2), "\nStdev = ", round(sd(FCMatrix[,1]),2))) ## ---- echo=TRUE---------------------------------------------------------- data(SEQC) SampleA <- SEQC ## ---- echo=TRUE---------------------------------------------------------- library(Linnorm) #This will generate two sets of RNA-seq data with 5 replicates each. #It will have 20000 genes totally with 5000 genes being differentially #expressed. It has the Negative Binomial distribution. SimulatedData <- RnaXSim(SampleA) ## ---- echo=TRUE---------------------------------------------------------- #Index of differentially expressed genes. DE_Index <- SimulatedData[[2]] #Expression Matrix ExpMatrix <- SimulatedData[[1]] #Sample Set 1 Sample1 <- ExpMatrix[,1:3] #Sample Set 2 Sample2 <- ExpMatrix[,4:6] ## ---- echo=TRUE---------------------------------------------------------- data(SEQC) SampleA <- SEQC ## ---- echo=TRUE---------------------------------------------------------- library(Linnorm) SimulatedData <- RnaXSim(SampleA, distribution="Gamma", #Distribution in the simulated dataset. #Put "NB" for Negative Binomial, "Gamma" for Gamma, #"Poisson" for Poisson and "LogNorm" for Log Normal distribution. NumRep=5, #Number of replicates in each sample set. #5 will generate 10 samples in total. NumDiff = 1000, #Number of differentially expressed genes. NumFea = 5000 #Total number of genes in the dataset ) ## ---- echo=TRUE---------------------------------------------------------- #Index of differentially expressed genes. DE_Index <- SimulatedData[[2]] #Expression Matrix ExpMatrix <- SimulatedData[[1]] #Simulated Sample Set 1 Sample1 <- ExpMatrix[,1:3] #Simulated Sample Set 2 Sample2 <- ExpMatrix[,4:6]