%\VignetteIndexEntry{MultiRNAflow: A R package for analysing RNA-seq raw counts with different time points and several biological conditions.} %\VignettePackage{MultiRNAflow} %\VignetteEngine{knitr::knitr} \documentclass{article} <>= BiocStyle::latex(use.unsrturl=FALSE) @ %\usepackage{booktabs} % book-quality tables \usepackage{hyperref} \setcounter{secnumdepth}{4} \setcounter{tocdepth}{3} \newcommand{\subsubsubsection}[1]{\paragraph{#1}\mbox{}\\} \newcommand\BiocStyle{\Rpackage{BiocStyle}} \newcommand\knitr{\Rpackage{knitr}} \newcommand\rmarkdown{\Rpackage{rmarkdown}} \newcommand\Sweave{\software{Sweave}} \newcommand\latex[1]{{\ttfamily #1}} \makeatletter \def\thefpsfigure{\fps@figure} \makeatother \newcommand{\exitem}[3]{% \item \latex{\textbackslash#1\{#2\}} #3 \csname#1\endcsname{#2}.% } %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \title{MultiRNAflow: An R package for integrated analysis of temporal RNA-seq data with multiple biological conditions} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \author[1]{Loubaton Rodolphe} \author[1]{Champagnat Nicolas} \author[1]{Vallois Pierre} \author[2,3]{Vallat Laurent} \affil[1]{Universit\'e de Lorraine, CNRS, Inria, IECL, F-54000 Nancy, France} \affil[2]{Universit\'e de Strasbourg, CNRS, UMR-7242 Biotechnology and cell signaling, F-67400 Illkirch, France} \affil[3]{Department of molecular genetic of cancers, Strasbourg University Hospital, F-67200 Strasbourg, France} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{document} %%%% %{vignettes%/MultiRNAflowImage%/ %%%% %{MultiRNAflowImage%/ <>= library(knitr) opts_chunk$set( concordance=TRUE ) @ \maketitle %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{abstract} The dynamic transcriptional mechanisms that govern eukaryotic cell function can now be analyzed by RNA sequencing (RNAseq). However, the packages currently available for the analysis of raw sequencing data do not provide automatic analysis of complex experimental designs with multiple biological conditions and multiple analysis time-points.\newline The MultiRNAflow suite combines several packages in a unified framework allowing exploratory and supervised statistical analysis of temporal data for multiple biological conditions. \end{abstract} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \renewcommand{\abstractname}{To cite this package} \begin{abstract} Rodolphe Loubaton, Nicolas Champagnat, Pierre Vallois and Laurent Vallat, MultiRNAflow: integrated analysis of temporal RNA-seq data with multiple biological conditions. In revision (2024). \end{abstract} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \newpage \tableofcontents \newpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% <>= options(prompt=" ", continue=" ") @ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \section{Introduction}\label{sec:introduction} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Context}\label{sec:context} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% In eukaryotic cells, genes are expressed in the form of RNA molecules during the transcriptional process which are then translated into proteins with a cellular function. In resting cells, at the steady state, transcription is affected by stochastic phenomena generating a transcriptional noise within cells. After modification of the cellular environment (cellular stress, receptor activation), hundreds of genes are activated, inducing a dynamic temporal transcriptional response allowing an adapted response of the cells to the initial modification of the environment \cite{Yosef2013GRNdynamicPerturbation}. Alterations in these temporal transcriptional responses are at the origin of pathologies (\textit{e.g.} cancer) and are extensively studied by biologists \cite{BarJoseph2012DynamicBiologicalProcesses} through sometimes complex experimental designs. Recent technological developments now make it possible to quantify the transcription of all genes in the genome by sequencing RNA molecules (RNAseq). These analysis generate raw count data whose properties (discrete data) are different from the fluorescence intensity data (continuous data) generated by previous microarray techniques. These data are presented as a transcriptome table reporting, for each sample, the number of reads for each gene, obtained from the alignment of collected RNA transcripts to a reference genome. The raw count of a gene (or transcript) corresponds to the number of reads mapped to the RNA sequence of this gene (or transcript). The number of reads of a gene depends on the length of the gene and experimental or systematic errors may occur during sequencing (uncertainty on sequencing depth, effective library sizes...), which require the transformation of the original raw counts.\newline The first method developed consists to compute count per million (CPM). Although the previous method corrects the effective library sizes problem, it should not be used for comparison between samples~\cite{Zhao2020MisuseTPMRPKM, Wagner2012TPM}, particularly if samples belong to different biological conditions and/or time points.\newline New methods of normalization have been developed in order to be able to compare gene expression between samples which belong to different biological conditions and/or time points. These methods of normalization, ensure that differences between samples are only due to their membership to different biological conditions and/or time points. The most used R packages for normalization are DESeq2 \cite{Love2014DESeq2} and EdgeR \cite{Robinson2010edgeR}.\newline Another motivation of normalization is to deduce a number of RNA transcripts of genes. This requires the so called reads per kilobase of transcripts per million reads mapped (RPKM) \cite{Mortazavi2008QuantifyingTranscriptomesRNAseq} or transcripts per million (TPM) \cite{Li2010TPM,Wagner2012TPM}, as these methods correct both the effective library sizes and the dependence of the number of reads of a gene with its length. The two goals of normalization of raw counts data are 1) to allow for unsupervised analysis of data (within samples or between samples) and 2) to compare gene reads in different biological condition and/or time points, to detect so-called \textbf{Differentially Expressed} (DE) genes. \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Context}\label{sec:stateart} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Several R packages propose tools to normalize data, realize unsupervised analysis and find DE genes, such as IDEAL~\cite{Marini2020IDEAL}, RNASeqR~\cite{Chao2021RNASeqR}, SeqGSEA\newline \cite{wang_seqgsea_2014} and RNAflow~\cite{Lataretu2020RNAflow}. These packages use DESeq2~\cite{Love2014DESeq2} and/or EdgedR~\cite{Robinson2010edgeR} in order to realize the normalization and DE analysis. All of them can detect DE genes in samples belonging to different biological conditions, although RNASeqR is limited to only two biological conditions. Some of them also perform GO enrichment analysis. However, these packages were not designed to deal with temporal data, although they could be adapted to this situation. None of them offer a unified and automatized framework to analyze RNA-seq data with both several time points and more than two biological conditions. Furthermore, these packages do not allow to automatically select subsets of genes that can be relevant for GO enrichment analysis, such as genes which are specific to a given biological condition and/or to a given time, or genes with particular DE patterns. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Our R package MultiRNAflow}\label{sec:DescriptionPackage} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The MultiRNAflow suite gathers in a unified framework methodological tools found in various existing packages allowing to perform: \begin{enumerate} \item Exploratory (unsupervised) analysis of the data. \item Statistical (supervised) analysis of dynamic transcriptional expression (DE genes), based on DESeq2 package~\cite{Love2014DESeq2}. \item Functional and GO analysis of subsets of genes automatically selected by the package, such as specific genes or genes with a given DE temporal pattern. \end{enumerate} The package automates a commonly used workflow of analysis for studying complex biological phenomena (used \textit{e.g.} in~\cite{Schleiss2021LeukProliferation}). %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Supported dataset}\label{sec:dataset} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The package supports transcriptional RNAseq raw count data (and can be adapted to single cell RNAseq) from an experimental design with multiple conditions and/or multiple times. The experimental design supported by our packages assumes that there is a reference time noted $t_0$, distinct from the other times noted $t_1$ to $t_n$, which corresponds to a set of reference measurements to which the others are to be compared (\textit{e.g.} as in~\cite{Schleiss2021LeukProliferation}, where $t_0$ corresponds to the basal state of the cell before activation of a cell receptor, and the experiments at times $t_1$ to $t_n$ measure gene expression at different times after activation of the receptor). Our package is not designed to analyze experimental designs requiring to compare measurements between any pairs of times.\newline %%%%%%%%%% %%%%%%%%%% The package provides numerous graphical outputs that can be selected by the user. To illustrate these outputs, we gather in Figure~\ref{fig:ApllicationNoteImage} a selection of graphics obtained from the dataset of~\cite{Weger2021TemporalMouseLiver}, which analyzes the role of invalidation of Bmal1 and Cry1/2 genes on murine transcriptional dynamics. The experimental map contains 4 biological conditions (Bmal1 wild type (wt), Bmal1 knock-out (ko), Cry1/2 wt and Cry1/2 ko) and 6 time points each ($t_0=0$h, $t_1=4$h, $t_2=8$h, $t_3=12$h, $t_4=16$h and $t_5=20$h), with 4 replicates (Figure~\ref{fig:ApllicationNoteImage}.A).\newline The dataset is a table of raw counts where lines correspond to genes and columns correspond to samples. Each sample shows the raw counts of an individual sequencing, corresponding to a biological condition, an individual sampling in this biological condition and a time. In this document, we illustrate the use of our package on four examples of datasets (see section \nameref{sec:DatasetPackage}) corresponding to four cases: \begin{itemize} \item \textbf{Case 1}. Samples belong to different biological conditions. \item \textbf{Case 2}. Measures were realized at different time points. \item \textbf{Case 3}. Samples belong to two biological conditions and measures were realized at different time points. \item \textbf{Case 4}. Samples belong to different biological conditions and measures were realized at different time points. \end{itemize} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Steps of the algorithm}\label{sec:stepsalgo} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The package MultiRNAflow realizes the following steps: \begin{itemize} \item Normalization, realized with the R package DESeq2 \cite{Love2014DESeq2}. \item Exploratory data unsupervised analysis which includes \begin{itemize} \item Visualization of individual patterns using factorial analysis with the R package FactoMineR \cite{Le2008FactoMineR}. \item Visualization of biological conditions and/or temporal clusters with the R package ComplexHeatmap \cite{Gu2016ComplexHeatmaps} \item Visualization of groups of genes with similar temporal behavior with the R package Mfuzz \cite{Futschik2005SoftClusteringMicroarray,Kumar2007Mfuzz} \end{itemize} \item Statistical supervised analysis of the transcriptional response of different groups of individuals over time with the R package DESeq2 \cite{Love2014DESeq2}, which includes \begin{itemize} \item Temporal statistical DE analysis \item Statistical DE analysis by biological condition \item Combination of temporal and biological condition statistical DE analysis \item Gene Ontology (GO) enrichment analysis using the R package gprofiler2\newline \cite{Kolberg2020gprofiler2} (Figure~\ref{fig:ApllicationNoteImage}.K), and automatic generation of outputs that can be implemented in DAVID~\cite{Sherman2021DAVID}, Webgestalt~\cite{Liao2019WebGestalt} (Figure~\ref{fig:ApllicationNoteImage}.L), GSEA~\cite{Subramanian2005GSEA} (Figure~\ref{fig:ApllicationNoteImage}.M), gProfiler~\cite{Raudvere2019gProfiler}, Panther~\cite{Thomas2020PANTHER}, ShinyGO~\cite{GE2020ShinyGO}, Enrichr~\cite{Kuleshov2016Enrichr} and GOrilla~\cite{Eden2009GOrilla}. for further analysis using these databases. \end{itemize} \end{itemize} %% , gProfiler, Panther, ShinyGO, Enrichr and GOrilla. %%% Il y en a d'autres Below, we give a short description of each of these steps before a full description of the package outputs for four examples of datasets. Figure\ref{fig:ApllicationNoteImage} illustrates the short description below. It gathers a selection of graphs produced by our package for the \nameref{sec:dataWeger2021} \cite{Weger2021TemporalMouseLiver} corresponding to \textbf{case 4}. \clearpage %% [width=0.75\linewidth,height=0.75\textheight] \begin{figure} \centering \includegraphics[width=1.2\linewidth,height=1.2\textheight]{MultiRNAflowImage/MultiRNAflow_Figure.pdf} \caption{Outputs from the package MultiRNAflow with a dataset containing several biological conditions and several time points (experimental design shown in (A)). %%%%% Exploratory analysis includes 3D PCA (B), temporal clustering of expression (C) and detailed temporal gene expression (D). %%%%% Supervised statistical analysis (experimental map shown in (E)) includes DE genes between each time and the reference time for each condition (F and G); specific DE genes for each condition at each time (H) or at least at one time point (I); signature DE genes of each condition and each time (J). %%%%% GO enrichment analysis is realized with the R package gprofiler2 (K) or by generating input files for several GO software programs, such as Webgestalt (L) or GSEA (M).} \label{fig:ApllicationNoteImage} \end{figure} \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Normalization}\label{sec:stepsnormalization} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The function \textbf{DATAnormalization()} of our package allows to realize the three methods of normalization proposed in DESeq2 and RPKM, which must be performed on raw counts for analysis (like DE analysis): \begin{itemize} \item Relative log expression (rle) \cite{Anders2010SizeFactorVST}. Each column of the raw counts is scaled by a specific scalar called size factors, estimated using the "median ratio method". \item Regularized logarithm (rlog) \cite{Love2014DESeq2}. This method of normalization transforms the count data to the log2 scale in a way that minimizes differences between samples for rows (so genes) with small counts. This transformation removes the dependence of the variance on the mean, particularly the high variance of the logarithm of count data when the mean is low. This method of normalization is realized by the R function \texttt{rlog()} of the package DESeq2. \item Variance Stabilizing Transformation (vst) \cite{Anders2010SizeFactorVST}. This method of normalization is similar to the rlog normalization. The vst normalization is faster but the rlog normalization is more robust in the case when the size factors vary widely. This method of normalization is realized by the R function \texttt{vst()} of the package DESeq2. \end{itemize} As mentioned in the DESeq2 manual, the rle transformation is used to realize DE analysis and the rlog and vst transformations are advised for unsupervised analysis (factorial analysis, clustering) or other machine learning methods. In addition, the function \textbf{DATAnormalization()} allows to plot the distribution of normalized read counts for each sample using boxplots. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Exploratory data analysis (unsupervised analysis)} \label{sec:stepsPCAhcpc} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsubsection{Factorial analysis}\label{sec:stepsPCA} Factorial analysis is realized by the two functions \textbf{PCAanalysis()} and \textbf{HCPCanalysis()} which implement two methods: Principal Component Analysis (PCA) and Hierarchical Clustering on Principle Components (HCPC). The two methods allow to visualize the temporal evolution of the transcription within each group of individuals and similarities or differences in transcriptional behaviors between groups. Of note, the PCA visualization is optimized thanks to the dynamic 3D PCA (see Figure~\ref{fig:ApllicationNoteImage}.B) allowing several viewing angles. \begin{itemize} \item \textbf{Case 1}. In the PCA graphs, samples are colored according to biological condition. \item \textbf{Case 2}. In the PCA graphs, consecutive time points for a same sample are linked to help visualization of temporal patterns. \item \textbf{Cases 3 and 4}. The PCA graphs combine the two previous displaying. \end{itemize} \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsubsection{Visualization of groups of genes with similar temporal behavior} \label{sec:stepsMFUZZ} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The R function \textbf{MFUZZanalysis()} allows, in \textbf{case 2}, \textbf{case 3} and \textbf{case 4}, to find the most common temporal behavior among all genes and all individuals in a given biological condition. This is done using the R package Mfuzz \cite{Futschik2005SoftClusteringMicroarray, Kumar2007Mfuzz} based on soft clustering. When there are several replicates per time, the Mfuzz package realizes soft clustering from the mean expression per time for each gene (see Figure~\ref{fig:ApllicationNoteImage}.C). When there are several biological conditions, the algorithm realizes the Mfuzz analysis for each biological condition. As with most clustering method, we need to find out the optimal number of clusters. Although a method is already implemented in the Mfuzz package, this method seems to fail when the number of genes is too big. Our function \textbf{MFUZZclusternumber()} finds the optimal number of cluster using kmeans() from the R package stats \cite{Rteam2021R} or HCPC() from the R package FactoMineR \cite{Le2008FactoMineR}. Among the outputs, \textbf{MFUZZclusternumber()} returns \begin{itemize} \item a graph indicating the selected number of clusters for each biological condition \item the results of the soft clustering for each biological condition (see Figure~\ref{fig:ApllicationNoteImage}.C) \end{itemize} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsubsection{Visualization of the data}\label{sec:stepsTemporalExpr} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The R function \textbf{DATAplotExpressionGenes()} allows to plot gene expression profiles of a selection of genes according to time and/or biological conditions (see Figure~\ref{fig:ApllicationNoteImage}.D). The user can either use raw counts data or normalized data. \begin{itemize} \item \textbf{Case 1}. The output is a graph where are plotted: a box plot, a violin plot, and error bars (standard deviation) for each biological condition. \item \textbf{Case 2}. The output is a graph where are plotted: the evolution of the expression of each replicate across time (red lines) and the evolution of the mean and the standard deviation of the expression across time (black lines). \item \textbf{Cases 3 and 4}. The output is a graph where are plotted: the evolution of the mean and the standard deviation of the expression across time for each biological condition. \end{itemize} \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Statistical analysis of the transcriptional response of different biological conditions of individuals over time} \label{sec:SupervisedStatisticalAnalysis} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsubsection{Differentially expressed (DE) genes with DESeq2} \label{sec:stepsDE} The function \textbf{DEanalysisGlobal()} search for DE genes. \textbf{Case 1}. Our algorithm searches for differentially expressed (DE) genes between all pairs of biological conditions. This allows in particular to determine which genes are specific to each biological condition. A gene is called specific to a biological condition A, if the gene is DE between A and any other biological conditions, but not DE between any pairs of other biological conditions.\newline Among the outputs, \textbf{DEanalysisGlobal()} returns %%%% \begin{itemize} \item a Venn barplot which gives the number of genes for each possible intersection. We consider that a set of pairs of biological conditions forms an intersection if there is at least one gene which is DE for each of these pairs of biological conditions, but not for the others. \item a barplot which gives the number of specific (and over- and under-expressed, also often called up- and down-regulated) genes per biological condition. \end{itemize} \textbf{Case 2}. Our algorithm looks for differentially expressed genes between each time \(t_i\) (\(1\leq i\leq n\)) and the reference time \(t_0\).\newline Among the outputs, \textbf{DEanalysisGlobal()} returns \begin{itemize} \item an alluvial graph of differentially expressed (DE) genes. \item a Venn barplot which gives the number of genes per temporal pattern. By temporal pattern, we mean the set of times \(t_i\) such that the gene is DE between \(t_i\) and the reference time \(t_0\). \end{itemize} \textbf{Case 3} and \textbf{Case 4}. Our algorithm realizes a mix of the two previous cases. First, for each biological condition, the algorithm realizes \textbf{Case 2}. Then for each time, the algorithm realizes \textbf{Case 1}. We then can find specific genes. A gene is called specific to a biological condition A at a time \(t_i\), if the gene is DE between A and any other biological conditions at time \(t_i\), but not DE between any pairs of other biological conditions at time \(t_i\). The algorithm also finds signature genes: a gene is called signature of a biological condition A at a given time \(t_i\) if the gene is specific for A at time \(t_i\) and DE between \(t_i\) versus \(t_0\) for A. Among the outputs, \textbf{DEanalysisGlobal()} returns \begin{itemize} \item An alluvial graph of differentially expressed (DE) genes, for each biological condition (see Figure~\ref{fig:ApllicationNoteImage}.G). \item A barplot which gives the number of DE genes per time, for each biological condition (see Figure~\ref{fig:ApllicationNoteImage}.F). \item A barplot which gives the number of specific genes for each biological condition, one per time (see Figure~\ref{fig:ApllicationNoteImage}.H). \item An alluvial graph of genes which are specific at least at one time, for each biological condition (see Figure~\ref{fig:ApllicationNoteImage}.I). \item A graph which gives for each biological condition, the number of signature genes and non signature genes per time \(t_i\) versus the reference time \(t_0\) (see Figure~\ref{fig:ApllicationNoteImage}.J). \end{itemize} \subsubsubsection{Heatmaps, ratio intensity (MA) plots and volcano plots} \label{sec:stepsVolMAheat} Clustering of samples versus genes allows visualization of correlations between gene expressions according to biological conditions or times. Clustering of samples versus samples allows visualization of correlations between individuals and groups. Given the high number of genes in a dataset, the heatmaps are realized after the supervised analysis in order to reduce the number of genes. \subsubsubsection{Gene ontology and gene enrichment}\label{sec:stepsGO} Gene Ontology (GO) enrichment analysis search for a functional profile of DE genes and better understand the underlying biological processes. We recommend the most used online tools and software: GSEA \cite{Subramanian2005GSEA} (see Figure~\ref{fig:ApllicationNoteImage}.M), DAVID \cite{Sherman2021DAVID}, WebGestalt \cite{Liao2019WebGestalt} (see Figure~\ref{fig:ApllicationNoteImage}.L), g:Profiler \cite{Raudvere2019gProfiler}, Panther \cite{Thomas2020PANTHER}, ShinyGO \cite{GE2020ShinyGO}, Enrichr \cite{Kuleshov2016Enrichr} and GOrilla \cite{Eden2009GOrilla}. Each of these softwares and online tools requires specific input files in order to realize their analysis. The R function \textbf{GSEApreprocessing()} automatically creates all required files. Alternatively, the function \textbf{GSEAquickAnalysis()} provides a GSEA analysis with the R package gprofiler2 \cite{Kolberg2020gprofiler2}. Among the ouputs, \textbf{GSEAquickAnalysis()} returns %%%%%%%%%%%%%%%%%%%%%%%% \begin{itemize} %% \item a data.frame giving information about all detected gene ontologies %% in the list of associated genes. %%(output \texttt{SEresGprofiler2Fission\$GSEAresults}) \item a lollipop graph (Figure~\ref{fig:FissionLollipop}) showing the most important Gene Ontologies ranked by their $-\log_{10}(\mbox{pvalue})$. The y-axis indicates the \texttt{MaxNumberGO} most significant gene ontologies and pathways associated to the selected DE genes. The gene ontologies and pathways are sorted into descending order. The x-axis indicates the \(-log10(pvalues)\). The higher is a lollipop the more significant is a gene ontology or pathway. A lollipop is yellow if the pvalues is smaller than 0.05 (significant) and blue otherwise. \item A Manhattan plot (Figure~\ref{fig:FissionManhattan}) ranking all genes ontologies according to the functional database (G0::BP, G0::CC, G0::MF and KEGG) \end{itemize} \clearpage %% [width=0.75\linewidth,height=0.75\textheight] %% [scale=4.0] \begin{figure} \centering \includegraphics[width=1.2\linewidth,height=1.2\textheight]{MultiRNAflowImage/Fission_2-5LollipopChart.pdf} \caption{Lollipop chart giving the most significant gene ontologies} \label{fig:FissionLollipop} \end{figure} \vfill \begin{figure} \centering \includegraphics[width=1.2\linewidth,height=1.2\textheight]{MultiRNAflowImage/Fission_2-5ManhattanPlot.pdf} \caption{Mahattan plot indicating all genes ontologies} \label{fig:FissionManhattan} \end{figure} \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Dataset used as examples in the package} \label{sec:DatasetPackage} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% In order to explain the use of each function in our package, we use four datasets. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Example of MultiRNAflow in case 1, several biological conditions: Mouse dataset 1} \label{sec:dataAntoszewski2022} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The \textbf{Mouse dataset 1} \cite{Antoszewski2022TcellNotch1} is accessible on the Gene Expression Omnibus (GEO) database with the accession number GSE169116. This dataset contains the transcription profile of 12 mice belonging to 4 biological conditions: \begin{itemize} \item 3 mice with wild type Notch1 and wild type Tcf1 \item 3 mice with wild type Notch1 and Tcf1 knocked-down \item 3 mice with Notch1 induced and wild type Tcf1 \item 3 mice with Notch1 induced and Tcf1 knocked-down. \end{itemize} The dataset contains temporal expression data of 39017 genes \cite{Antoszewski2022TcellNotch1}. %% Notch1 is a well-established lineage specifier for T cells and among the most %% frequently mutated genes throughout all subclasses of T cell acute %% lymphoblastic leukemia \cite{Antoszewski2022TcellNotch1}. To illustrate the use of our package in \textbf{case 1}, we selected 500 genes giving a representative sample of each DE profile across biological conditions, in particular genes that are specific to each biological condition.\newline This sub dataset is saved in the file \textbf{RawCounts\_Antoszewski2022\_MOUSEsub500}. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Example of MultiRNAflow in case 2, several time points: Fission dataset} \label{sec:dataLeong2014} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The \textbf{Fission dataset} \cite{Leong2014FissionOsmoticStress} is accessible on the Gene Expression Omnibus (GEO) database with the accession number GSE56761. The dataset can also be obtained with the R package "fission" \cite{Leong2014FissionOsmoticStress}. This dataset contains the temporal transcription profile of 18 wild type fission yeasts (wt) and 18 fission yeasts where atf1 is knocked-out (mut), hence 36 samples. The dataset contains temporal expression data of 7039 genes. Data were collected 0, 15, 30, 60, 120 and 180 minutes after an osmotic stress. The gene atf1 codes for a transcription factor which alters sensitivity to oxidative stress. To illustrate the use of our package in \textbf{case 2}, we focus on the biological condition wt and select 500 genes giving a representative sample of each temporal DE profile in this biological condition. This sub dataset is saved in the file \textbf{RawCounts\_Leong2014\_FISSIONsub500wt}. \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Example of MultiRNAflow in case 3, several time points and two biological conditions: Leukemia dataset} \label{sec:dataSchleiss2021} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The \textbf{Leukemia dataset} \cite{Schleiss2021LeukProliferation} is accessible on the Gene Expression Omnibus (GEO) database with the accession number GSE130385. This dataset contains the temporal transcription profile of 3 Proliferating (P) and 3 Non Proliferating (NP) primary chronic lymphocytic leukemia (CLL) B-cells samples. Data were collected at 0, 1h, 1h30, 3h30, 6h30, 12h, 24h, 48h and 96h after cell stimulation (so $(3+3)\times 9=54$ samples in total). The latest time point corresponds to the emergence of the proliferation clusters. The dataset contains temporal expression data of 25369 genes. To illustrate the use of our package in \textbf{case 3} with two biological conditions, we selected 500 genes giving a representative sample of each DE profile across time and biological conditions, in particular genes that are signature genes of each biological condition. This sub dataset is saved in the file \textbf{RawCounts\_Schleiss2021\_CLLsub500}. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Example of MultiRNAflow in case 4, several time points and more than two biological conditions: Mouse dataset 2} \label{sec:dataWeger2021} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The \textbf{Mouse dataset 2} \cite{Weger2021TemporalMouseLiver} is accessible on the Gene Expression Omnibus (GEO) database with the accession number GSE135898. This dataset contains the temporal transcription profile of 16 mice with Bmal1 and Cry1{\slash}2 knocked-down under an ad libitum (AL) or night restricted feeding (RF) regimen. Data were collected at 0h, 4h, 8h, 12h, 16h and 20h. Therefore, there are six time points and eight biological conditions. As there are only two mice per biological condition, we decided not to take into account the effect of the regimen. This leads to 4 biological conditions with 4 mice in each. The dataset contains temporal expression data of 40327 genes. To illustrate the use of our package in \textbf{case 3} with more than two biological conditions, we take 500 genes, over the global 40327 genes in the original dataset. This sub dataset is saved in the file \textbf{RawCounts\_Weger2021\_MOUSEsub500}. \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \section{Preamble}\label{sec:preamble} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{R version and R packages to install}\label{sec:Rversionpackage} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Before installing the necessary packages, you must install (or update) the R software in a version superior or equal to 4.2.1 "Funny-Looking Kid" (released on 2022/06/23) from \href{https://cran.r-project.org}{CRAN (Comprehensive R Archive Network)}. Then, in order to use the MultiRNAflow package, the following R packages must be installed: %%% \begin{itemize} \item From \href{https://cran.r-project.org}{CRAN}: \href{https://cran.r-project.org/web/packages/reshape2/index.html}{reshape2} (>= 1.4.4), \href{https://ggplot2.tidyverse.org}{ggplot2} (>= 3.4.0), \href{https://cran.r-project.org/web/packages/ggalluvial/index.html}{ggalluvial} (>= 0.12.3), \href{https://cran.r-project.org/web/packages/ggrepel/index.html}{ggrepel} (>= 0.9.2), \href{https://cran.r-project.org/web/packages/FactoMineR/index.html}{FactoMineR} (>= 2.6), \href{https://cran.r-project.org/web/packages/factoextra/index.html}{factoextra} (>= 1.0.7), \href{https://cran.r-project.org/web/packages/plot3D/index.html}{plot3D} (>= 1.4), \href{https://cran.r-project.org/web/packages/plot3Drgl/index.html}{plot3Drgl} (>= 1.0.3), \href{https://cran.r-project.org/web/packages/ggplotify/index.html}{ggplotify} (>= 0.1.2), \href{https://cran.r-project.org/web/packages/UpSetR/index.html}{UpSetR} (>= 1.4.0), \href{https://cran.r-project.org/web/packages/gprofiler2/index.html}{gprofiler2} (>= 0.2.1). % \item From \href{https://cran.r-project.org}{CRAN} and usually already included by default in R: graphics (>= 4.2.2), grDevices (>= 4.2.2), grid (>= 4.2.2), utils (>= 4.2.2), stats (>= 4.2.2). % \item From \href{https://bioconductor.org}{Bioconductor}: \href{https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html}{SummarizedExperiment} (>= 1.28.0), \href{https://bioconductor.org/packages/release/bioc/html/DESeq2.html}{DESeq2} (>= 1.38.1), \href{https://bioconductor.org/packages/release/bioc/html/ComplexHeatmap.html}{ComplexHeatmap} (>= 2.14.0), \href{https://www.bioconductor.org/packages/release/bioc/html/Mfuzz.html}{Mfuzz} (>= 2.58.0). % \end{itemize} Before installing a package, for instance the package FactoMineR, the user must check if the package is already installed with the command \texttt{library(FactoMineR)}. If the package has not been previously installed, the user must use the command \texttt{install.packages("FactoMineR")} (packages from CRAN). For beginners in programming, we recommend to follow the steps below for importing CRAN and Bioconductor packages. For the packages which must be download from CRAN, <>= Cran.pck <- c("reshape2", "ggplot2", "ggrepel", "ggalluvial", "FactoMineR", "factoextra", "plot3D", "plot3Drgl", "ggplotify", "UpSetR", "gprofiler2") @ the user can copy and paste the following lines of code for each package in order to download the missing packages. <>= Select.package.CRAN <- "FactoMineR" if (!require(package=Select.package.CRAN, quietly=TRUE, character.only=TRUE, warn.conflicts=FALSE)) { install.packages(pkgs=Select.package.CRAN, dependencies=TRUE) }# if(!require(package=Cran.pck[i], quietly=TRUE, character.only=TRUE)) @ If the package is already installed (for instance here "FactoMineR"), the previous lines of code will return nothing. For the packages which must be download from Bioconductor, <>= Bioconductor.pck <- c("SummarizedExperiment", "S4Vectors", "DESeq2", "Mfuzz", "ComplexHeatmap") @ the user must first copy and paste the following lines of code in order to install "BiocManager" <>= if (!require(package="BiocManager", quietly=TRUE, character.only=TRUE, warn.conflicts=FALSE)) { install.packages("BiocManager") }# if(!require(package="BiocManager", quietly=TRUE, character.only=TRUE)) @ then copy and paste the following lines of code in order to install the version 3.16 of bioconductor (it works with R version 4.2.0) <>= BiocManager::install(version="3.18") @ and then copy and paste the following lines of code for each package in order to download the missing packages. <>= Select.package.Bioc <- "DESeq2" if(!require(package=Select.package.Bioc, quietly=TRUE, character.only=TRUE, warn.conflicts=FALSE)){ BiocManager::install(pkgs=Select.package.Bioc) }## if(!require(package=Select.package.Bioc, quietly=TRUE, character.only=TRUE)) @ If the package is already installed (for instance here "DESeq2"), the previous lines of code will return nothing. Once all packages have been installed, the user may load our package. <>= library(MultiRNAflow) @ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Main functions}\label{sec:MainFunctions} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Our package contains 38 functions among which 11 are main functions. The user should only use \begin{itemize} \item \textbf{DATAprepSE()} to store all information about the dataset in a standardized way (SummarizedExperiment class object) \item these main functions for exploratory data analysis (unsupervised analysis) % \begin{itemize} \item \textbf{DATAnormalization()}. This function allows to normalize raw counts data and the results will be used by the functions \textbf{PCAanalysis()}, \textbf{HCPCanalysis()},\newline \textbf{MFUZZanalysis()} and \textbf{DATAplotExpressionGenes()}. \item \textbf{PCAanalysis()}. The function realizes the PCA analysis. \item \textbf{HCPCanalysis()}. The function realizes the clustering analysis with the R package HCPC. \item \textbf{MFUZZanalysis()}. The function realizes the temporal clustering analysis with the R package Mfuzz \item \textbf{DATAplotExpressionGenes()}. This function allows to plot the profile expression of all chosen genes. \end{itemize} \end{itemize} \clearpage \begin{itemize} \item these main functions for supervised analysis % \begin{itemize} \item \textbf{DEanalysisGlobal()}. This function realizes the differential expression analysis. \item \textbf{DEplotVolcanoMA()}. The function plots and save all Volcano and MA plots from the output of \textbf{DEanalysisGlobal()}. \item \textbf{DEplotHeatmaps()}. The function plots a correlation heatmap and a heatmap of the normalized data for a selection of DE genes. \item \textbf{GSEAQuickAnalysis()}. The function realizes the GSEA analysis with the R package gprofiler2. \item \textbf{GSEApreprocessing()}. The function saves files to be used by 8 GSEA software and online tools. \end{itemize} % \end{itemize} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Load of the dataset}\label{sec:LoadDataset} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% If the user wants to use our package with one of the dataset included in MultiRNAflow, he must first write in the R console either <>= data("RawCounts_Antoszewski2022_MOUSEsub500") @ in order to load the \nameref{sec:dataAntoszewski2022}, either <>= data("RawCounts_Leong2014_FISSIONsub500wt") @ in order to load the \nameref{sec:dataLeong2014}, either <>= data("RawCounts_Schleiss2021_CLLsub500") @ in order to load the \nameref{sec:dataSchleiss2021}, or <>= data("RawCounts_Weger2021_MOUSEsub500") @ in order to load the \nameref{sec:dataWeger2021}. \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Structure of the dataset}\label{sec:StructureDataset} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The dataset must be a data.frame containing raw counts data. If it is not the case, the function \textbf{DATAprepSE()} will stop and print an error. Each line should correspond to a gene, each column to a sample, except a particular column that may contain strings of characters describing the names of the genes. The first line of the data.frame should contain the names of the columns (strings of characters) that must have the following structure. <>= data("RawCounts_Leong2014_FISSIONsub500wt") colnames(RawCounts_Leong2014_FISSIONsub500wt) @ In this example, "Gene" indicates the column which contains the names of the different genes. The other column names contain all kind of information about the sample, including the biological condition, the time of measurement and the name of the individual (e.g patient, replicate, mouse, yeasts culture...). Other kinds of information can be stored in the column names (such as patient information), but they will not be used by the package. The various information in the column names must be separated by underscores. The order of these information is arbitrary but must be the same for all columns. For instance, the sample "wt\_t0\_r1" corresponds to the first replicate (r1) of the wild type yeast (wt) at time \(t_0\) (reference time).\newline The information located to the left of the first underscore will be considered to be in position 1, the information located between the first underscore and the second one will be considered to be in position 2, and so on. In the previous example, the biological condition is in position 1, the time is in position 2 and the replicate is in position 3.\newline In most of the functions of our package, the order of the previous information in the column names will be indicated with the inputs \texttt{Group.position}, \texttt{Time.position} and \texttt{Individual.position}. Similarly the input \texttt{Column.gene} will indicate the number of the column containing gene names. For example, in the previous dataset, the function \textbf{DATAprepSE()} must be called with the following arguments: <>= data("RawCounts_Leong2014_FISSIONsub500wt") @ <>= resSEexample <- DATAprepSE(RawCounts=RawCounts_Leong2014_FISSIONsub500wt, Column.gene=1, Group.position=NULL, Time.position=2, Individual.position=3) @ Here, the argument \texttt{Column.gene=1} means that the first column of the dataset contain genes name, \texttt{Time.position=2} means that the time of measurements is between the first and the second underscores in the columns names, \texttt{Individual.position=3} means that the name of the individual is between the second and the third underscores in the columns names and \texttt{Group.position=NULL} means that there is only one biological condition in the dataset (corresponding to \textbf{case 2}). Similarly, \texttt{Time.position=NULL} would mean that there is only one time of measurement for each individual (corresponding to \textbf{case 2}) and \texttt{Column.gene=NULL} would mean that there is no column containing gene names. \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Structure of the output}\label{sec:StructureOutput} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The output of the main functions (see Section \ref{sec:MainFunctions}) is a \Rclass{SummarizedExperiment} class object. The following lines of this section and Figure~\ref{fig:SEchart} come from the vignette of the bioconductor package \href{https://bioconductor.org/packages/release/bioc/vignettes/SummarizedExperiment/inst/doc/SummarizedExperiment.html}{SummarizedExperiment}. The \Rclass{SummarizedExperiment} class is used to store rectangular matrices of experimental results, which are commonly produced by sequencing experiments such as RNA-Seq. Each \Rclass{SummarizedExperiment} object stores %%% \begin{itemize} \item the experiment data \texttt{SummarizedExperiment} (RNAseq raw counts data, normalized data ...) which can be retrieved with the R function \texttt{SummarizedExperiment::assays()} \item information and features of samples (phenotypes for instance) which can be retrieved with the R function \texttt{SummarizedExperiment::colData()} \item information and features of genes (length of genes, gene ontology, DE genes ...) which can be retrieved with the R function \texttt{SummarizedExperiment::rowData()} \item Meta-data (any other kind of information). \texttt{S4Vectors::metadata()} is just a simple list, so it is appropriate for any experiment wide metadata the user wishes to save, such as storing model formulas, description of the experimental methods, publication references ... \end{itemize} %%% We illustrate the main functions of the package and how to recover the different outputs of the package from the \Rclass{SummarizedExperiment} object int the following sections. %%% \begin{figure} \centering \includegraphics[width=0.75\linewidth,height=0.75\textheight]{MultiRNAflowImage/SE.pdf} \caption{Summarized Experiment structure} \label{fig:SEchart} \end{figure} \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \section{Detailed analysis of a dataset with several times point and two biological conditions (case 3)} \label{sec:DetailedAnalysis} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% In this section we use the Chronic lymphocytic leukemia (CLL) subdataset\newline \textbf{RawCounts\_Schleiss2021\_CLLsub500} (see subsection \nameref{sec:dataSchleiss2021}) in order to explain the use of our package in \textbf{case 3} when there are only two biological conditions. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Preprocessing step with DATAprepSE()}\label{sec:DATAprepSEleuk} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The preprocessing step is realized by our R function \textbf{DATAprepSE()} to store all information about the dataset in a standardized way (\Rclass{SummarizedExperiment} class object). The user must realize this step in order to realize exploratory data analysis (unsupervised analysis, section \ref{sec:ExploratoryAnalysis_Leuk}) or statistical analysis of the transcriptional response (supervised analysis, section \ref{sec:SupervisedAnalysis_leuk}). The following lines of code realize the preprocessing step. <>= SEresleuk500 <- DATAprepSE(RawCounts=RawCounts_Schleiss2021_CLLsub500, Column.gene=1, Group.position=2, Time.position=4, Individual.position=3, VARfilter=0, SUMfilter=0, RNAlength=NULL) @ The inputs \Rcode{VARfilter} and \Rcode{SUMfilter} allow to filter the dataset by keeping only rows (i.e. genes) such as the sum or the variances of counts is greater than the selected threshold. The user can also filter genes by keeping only those which have a known transcript length with the input \texttt{RNAlength}. The function returns a \Rclass{SummarizedExperiment} class object containing %%% \begin{itemize} \item general information about the dataset \item information to be used for exploratory data analysis \item a \Rclass{DESeqDataSet} class object (\texttt{DESeq2obj}) to be used for statistical analysis of the transcriptional response. \end{itemize} <>= names(S4Vectors::metadata(SEresleuk500)) str(S4Vectors::metadata(SEresleuk500)$Results) @ All results of the different analysis presented below will be stored in \texttt{Results} of \texttt{S4Vectors::metadata()}. We will show in each section how to retrieve information and plots. Write \texttt{?DATAprepSE} in your console for more information about the function. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Exploratory data analysis (unsupervised analysis)} \label{sec:ExploratoryAnalysis_Leuk} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Normalization with DATAnormalization()} \label{sec:DATAnormalizationLeuk} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The following lines of code realize the normalization step from the results of the function \textbf{DATAprepSE()} (subsection \ref{sec:DATAprepSEleuk}) <>= SEresNORMleuk500 <- DATAnormalization(SEres=SEresleuk500, Normalization="vst", Blind.rlog.vst=FALSE, Plot.Boxplot=FALSE, Colored.By.Factors=TRUE, Color.Group=NULL, path.result=NULL) @ If \texttt{Plot.Boxplot=TRUE} a boxplot showing the distribution of the normalized expression\newline (\texttt{Normalization="vst"} means that the vst method is used) of genes for each sample is returned. If the user gives information about transcript length with the input \texttt{RNAlength} of the function \textbf{DATAprepSE} (section~\ref{sec:DATAprepSEleuk}), the user can set \texttt{Normalization="rpkm"} to normalize the dataset with the RPKM formula. %%% If the user wants to see the results of the normalization, he must first executing the following lines of code. <>= ## Save 'Results' of the metadata in an object resleuk500 <- S4Vectors::metadata(SEresNORMleuk500)$Results ## Save the results of Normalization in an object resNORMleuk500 <- resleuk500[[1]][[1]] ### names(S4Vectors::metadata(SEresNORMleuk500)) str(resleuk500, max.level=3) @ The user can see that \texttt{UnsupervisedAnalysis\$Normalization} is no longer \texttt{NULL} and now contains two elements : the method of normalization used (\texttt{resNORMleuk500\$normMethod}) and the boxplot (\texttt{resNORMleuk500\$normBoxplot}). Boxplots showing the results of normalization can be plotted as follows. <>= resNORMleuk500$normMethod print(resNORMleuk500$normBoxplot) @ The user can now have access to the normalized data. <>= normData <- SummarizedExperiment::assays(SEresNORMleuk500) names(SummarizedExperiment::assays(SEresNORMleuk500)) @ "counts" represents the raw counts data and "vst" the normalized data. The user can look at the normalized data by executing the line \texttt{normData\$vst} or \texttt{normData[[2]]}. If \texttt{Colored.By.Factors=TRUE}, the color of the boxplots would be different for different biological conditions. By default (if \texttt{Color.Group=NULL}), a color will be automatically applied for each biological condition. You can change the colors by creating the following data.frame <>= colorLeuk <- data.frame(Name=c("NP", "P"), Col=c("black", "red")) @ and setting \texttt{Color.Group=colorLeuk}. The x-labels give biological information, time information and individual information separated by dots. If the user wants to see the 6th first rows of the normalized data, he can write in his console\newline \texttt{head(SEresNORMleuk500\$NormalizedData, n=6)}. The user can save the graph in a folder thanks to the input path.result. If \texttt{path.result=NULL} the results will still be plotted but not saved in a folder. Write \texttt{?DATAnormalization} in your console for more information about the function. \noindent \textbf{Interpretation of the results}: When data are not normalized, boxplots of unnormalized log expression data show large differences in distribution between different samples.\newline The figure shows that the normalized gene expression distribution of the different samples is similar. Normalization has therefore been successful, and we can now begin exploratory analysis of the dataset. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Factorial analysis: PCA with PCAanalysis() and clustering with HCPCanalysis()} \label{sec:FactorialLeuk} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsubsubsection{PCA (case 3)}\label{sec:PCALeuk} %%---------------------------------------------------------------------------%% When samples belong to different biological conditions and different time points, the following lines of code return from the results of the function \textbf{DATAnormalization()} (Section~\ref{sec:DATAnormalizationLeuk}): %%%%%%%%%%% \begin{itemize} \item The results of the R function \Rfunction{PCA()} from the package FactoMineR. \item one 2D PCA graph, one 3D PCA graph and the same 3D PCA graph in a rgl window (only if \texttt{motion3D=FALSE}) where samples are colored with different colors for different biological conditions. Furthermore, lines are drawn between each pair of consecutive points for each individual (if \texttt{Mean.Accross.Time=FALSE}, otherwise lines will be drawn only between mean values of all individuals for each time point and biological conditions). \item one 2D PCA graph, one 3D PCA graph and the same 3D PCA graph in a rgl window (only if \texttt{motion3D=FALSE}) for each biological condition, where samples are colored with different colors for different time points. Furthermore, lines are drawn between each pair of consecutive points for each sample (if \texttt{Mean.Accross.Time=FALSE}, otherwise lines will be drawn only between mean values of all individuals for each time point and biological conditions). \item the same graphs described above but without lines. \end{itemize} %%%%%%%%%%% <>= SEresPCALeuk500 <- PCAanalysis(SEresNorm=SEresNORMleuk500, gene.deletion=NULL, sample.deletion=NULL, Plot.PCA=FALSE, Mean.Accross.Time=FALSE, Color.Group=NULL, Cex.label=0.9, Cex.point=0.8, epsilon=0.2, Phi=25, Theta=140, motion3D=FALSE, path.result=NULL, Name.folder.pca=NULL) @ The graphs are %%%%%%%%%%% \begin{itemize} \item stored in an SE object (see below how to visualize the elements) \item displayed if \texttt{Plot.PCA=TRUE} \item saved in a folder if the user selects a folder path in \texttt{path.result}. If \texttt{path.result=NULL} the results will not be saved in a folder. \end{itemize} %%%%%%%%%%% If the user wants to see the results of the PCA analysis, he must first execute the following lines of code. <>= ## Save 'Results' of the metadata in an object resleuk500 <- S4Vectors::metadata(SEresPCALeuk500)$Results ## Save the results of normalization in an object resPCALeuk500 <- resleuk500[[1]][[2]] ### names(S4Vectors::metadata(SEresPCALeuk500)) str(resleuk500, max.level=3, give.attr=FALSE) names(resPCALeuk500) @ The user can see that \texttt{UnsupervisedAnalysis\$PCA} is no longer \texttt{NULL} and now contains 9 elements : %%%%%%%%%%% \begin{itemize} \item The results of PCA (\texttt{PCAresults}) which can be retrieved by executing the following line of code in your console \texttt{resPCALeuk500\$PCAresults} \item \texttt{resPCALeuk500\$PCA\_2D} and \texttt{resPCALeuk500\$PCA\_2DtemporalLinks} contain respectively 2D PCA with all biological conditions without and with temporal links. The following lines of codes plot the 2D PCA with temporal links \end{itemize} %%%%%%%%%%% <>= print(resPCALeuk500$PCA_2DtemporalLinks) @ \begin{itemize} \item \texttt{resPCALeuk500\$PCA\_3D} and \texttt{resPCALeuk500\$PCA\_3DtemporalLinks} contain respectively 3D PCA with all biological conditions without and with temporal links. The following lines of codes plot the 3D PCA with temporal links \end{itemize} <>= print(resPCALeuk500$PCA_3DtemporalLinks) @ \begin{itemize} \item \texttt{resPCALeuk500\$PCA\_BiologicalCondition\_NP} and \texttt{resPCALeuk500\$PCA\_BiologicalCondition\_P} contain the different 2D and 3D PCA plots for each biological condition. The following lines of codes plot the 3D PCA with temporal links returns the 3D PCA with temporal links for the biological condition P. \end{itemize} <>= print(resPCALeuk500$PCA_BiologicalCondition_P$PCA_3DtemporalLinks) @ By default (if \texttt{Color.Group=NULL}), a color will be automatically assigned to each biological condition. The user can change the colors by creating the following data.frame <>= colorLeuk <- data.frame(Name=c("NP","P"), Col=c("black","red")) @ and setting \texttt{Color.Group=colorLeuk}. The user cannot change the color associated to each time point. If the user wants to delete, for instance, the genes 'ABCA7' and 'ADAM28' (respectively the second and sixth gene) and/or delete the samples 'CLL\_P\_r1\_t1' and 'CLL\_P\_r2\_t2', he can set %%% \begin{itemize} \item \texttt{gene.deletion=c("ABCA7","ADAM28")} and/or\newline \texttt{sample.deletion=c("CLL\_P\_r1\_t1", "CLL\_P\_r2\_t2")} \item \texttt{gene.deletion=c(2, 6)} and/or \texttt{sample.deletion=c(3, 13)}.\newline The integers in \texttt{gene.deletion} and \texttt{sample.deletion} represent respectively the row numbers and the column numbers of \texttt{RawCounts} where the selected genes and samples are located. \end{itemize} Write \texttt{?PCAanalysis} in your console for more information about the function. \noindent \textbf{Interpretation of the results}: The three graphs clearly show that PCA has made it possible to discern the temporal evolution of transcription within each biological condition. The first two graphs also clearly distinguishes the samples according to the biological conditions. %%---------------------------------------------------------------------------%% \subsubsubsection{HCPC (case 3)}\label{sec:HCPCLeuk} %%---------------------------------------------------------------------------%% The lines of code below return from the results of the function \textbf{DATAnormalization()} (see Section~\ref{sec:DATAnormalizationLeuk}): %%%%%%%%%%% \begin{itemize} \item The results of the R function \Rfunction{HCPC()} from the package FactoMineR. \item A dendrogram \item A graph indicating by color for each sample, its cluster, the biological condition and the time point associated to the sample. \item One 2D PCA graph, one 3D PCA graph and the same 3D PCA graph in a rgl window (only if \texttt{motion3D=FALSE}). These PCA graphs are identical to the outputs of \texttt{PCAanalysis()} with all samples but samples are colored with different colors for different clusters. \end{itemize} %%%%%%%%%%% The input \texttt{SEresNorm} can be either %%%%%%%%%%% \begin{itemize} \item \texttt{SEresNorm=SEresNORMleuk500} and the results of \texttt{HCPCanalysis()} will be added in the SE object which contains the results of \texttt{DATAnormalization()} \item or \texttt{SEresNorm=SEresPCALeuk500} and the results of \texttt{HCPCanalysis()} will be added in the SE object which contains the results of \texttt{DATAnormalization()} and \texttt{PCAanalysis()}. \end{itemize} %%%%%%%%%%% <>= SEresHCPCLeuk500 <- HCPCanalysis(SEresNorm=SEresPCALeuk500, gene.deletion=NULL, sample.deletion=NULL, Plot.HCPC=FALSE, Phi=25,Theta=140, Cex.point=0.7, epsilon=0.2, Cex.label=0.9, motion3D=FALSE, path.result=NULL, Name.folder.hcpc=NULL) @ The graphs are %%%%%%%%%%% \begin{itemize} \item stored in an SE object (see below how to visualize the elements) \item displayed if \texttt{Plot.HCPC=TRUE} \item saved in a folder if the user selects a folder path in \texttt{path.result}. If \texttt{path.result=NULL} the results will not be saved in a folder. \end{itemize} %%%%%%%%%%% %% The optimal number of clusters is automatically computed by the R function %% \Rfunction{HCPC()}. If the user wants to see the results of the HCPC analysis, he must first execute the following lines of code. <>= ## Save 'Results' of the metadata in an object resleuk500 <- S4Vectors::metadata(SEresHCPCLeuk500)$Results ## Save the results of HCPC in an object resHCPCLeuk500 <- resleuk500[[1]][[3]] ### names(S4Vectors::metadata(SEresHCPCLeuk500)) str(resleuk500, max.level=3, give.attr=FALSE) names(resHCPCLeuk500) @ The user can see that \texttt{UnsupervisedAnalysis\$HCPC} is no longer \texttt{NULL} and now contains 6 elements : %%%%%%%%%%% \begin{itemize} \item The results of HCPC (\texttt{resHCPC}) can be retrieved by executing the following line of code in your console \texttt{resHCPCLeuk500\$resHCPC} \item \texttt{resHCPCLeuk500\$resHCPCLeuk500} contains the dendrogram. The following lines of codes plot the dendrogram. \end{itemize} %%%%%%%%%%% <>= print(resHCPCLeuk500$Dendrogram) @ \begin{itemize} \item \texttt{resHCPCLeuk500\$Cluster\_SampleDistribution} contains the graph indicating for each sample, its cluster, the biological condition and the time point associated to the sample, using a color code. The following lines of codes plot the 3D PCA with temporal links \end{itemize} <>= print(resHCPCLeuk500$Cluster_SampleDistribution) @ \begin{itemize} \item \texttt{resPCALeuk500\$PCA2DclustersHCPC} and \texttt{resPCALeuk500\$PCA3DclustersHCPC} contain the 2D and 3D PCA plots where samples are colored with different colors for different clusters. The following lines of codes plot the 3D PCA. \end{itemize} <>= print(resHCPCLeuk500$PCA3DclustersHCPC) @ Write \texttt{?HCPCanalysis} in your console for more information about the function. \noindent \textbf{Interpretation of the results}: This HCPC analysis shows that %%%%%%%%%%% \begin{itemize} \item cluster 1 contains all samples taken at measurement times \(t_0\), \(t_1\) and \(t_2\) of both biological conditions. \item cluster 2 contains all samples taken at measurement times \(t_3\), \(t_4\), \(t_5\) and \(t_6\) of both biological conditions. \item cluster 3 contains all samples taken at measurement times \(t_7\) and \(t_8\) of both biological conditions. \end{itemize} %%%%%%%%%%% This indicates that the expression of genes at these three groups of times are very different. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Temporal clustering analysis with MFUZZanalysis()} \label{sec:MFUZZleuk} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The following function realizes the temporal clustering analysis. It takes as input, a number of clusters (\texttt{DataNumberCluster}) that can be chosen automatically if \texttt{DataNumberCluster=NULL} and the results of the function \textbf{DATAnormalization()} (see Section~\ref{sec:DATAnormalizationLeuk}). The lines of code below return for each biological condition %%%%%%%%%%% \begin{itemize} \item the summary of the results of the R function \Rfunction{mfuzz()} from the package Mfuzz. \item the scaled height plot, computed with the \Rfunction{HCPC()} function, and shows the number of clusters chosen automatically (if \texttt{DataNumberCluster=NULL}). If \texttt{Method="hcpc"}, the function plots the scaled within-cluster inertia, but if \texttt{Method="kmeans"}, the function plots the scaled within-cluster inertia. As the number of genes can be very high, we recommend to select \texttt{Method="hcpc"} which is by default. \item the output graphs from the R package \texttt{Mfuzz} showing the most common temporal behavior among all genes for each biological condition. The plots below correspond to the biological condition 'P'. \end{itemize} %%%%%%%%%%% The input \texttt{SEresNorm} can be either %%%%%%%%%%% \begin{itemize} \item \texttt{SEresNorm=SEresNORMleuk500} and the results of \texttt{MFUZZanalysis()} will be added in the SE object which contains the results of \texttt{DATAnormalization()} \item either \texttt{SEresNorm=SEresPCALeuk500} and the results of \texttt{MFUZZanalysis()} will be added in the SE object which contains the results of \texttt{DATAnormalization()} and \texttt{PCAanalysis()} \item or \texttt{SEresNorm=SEresHCPCLeuk500} and the results of \texttt{MFUZZanalysis()} will be added in the SE object which contains the results of \texttt{DATAnormalization()}, \texttt{PCAanalysis()} and \texttt{HCPCanalysis()}. \end{itemize} %%%%%%%%%%% <>= SEresMfuzzLeuk500 <- MFUZZanalysis(SEresNorm=SEresHCPCLeuk500, DataNumberCluster=NULL, Method="hcpc", Membership=0.7, Min.std=0.1, Plot.Mfuzz=FALSE, path.result=NULL, Name.folder.mfuzz=NULL) @ The excluded genes are those which standard deviation are under a certain threshold (the R function \Rfunction{mfuzz()} from the package Mfuzz). The graphs are %%%%%%%%%%% \begin{itemize} \item stored in an SE object (see below how to visualize the elements) \item displayed if \texttt{Plot.Mfuzz=TRUE} \item saved in a folder if the user selects a folder path in \texttt{path.result}. If \texttt{path.result=NULL} the results will not be saved in a folder. \end{itemize} %%%%%%%%%%% If the user wants to see the results of the Mfuzz analysis, he must first execute the following lines of code. <>= ## Save 'Results' of the metadata in an object resleuk500 <- S4Vectors::metadata(SEresMfuzzLeuk500)$Results ## Save the results of Mfuzz in an object resMfuzzLeuk500 <- resleuk500[[1]][[4]] ### names(S4Vectors::metadata(SEresMfuzzLeuk500)) str(resleuk500, max.level=3, give.attr=FALSE) names(resMfuzzLeuk500) @ The user can see that \texttt{UnsupervisedAnalysis\$HCPC} is no longer \texttt{NULL} and now contains 6 elements : %%% \begin{itemize} \item The user can execute the command \texttt{resMfuzzLeuk500\$DataClustSel} to see the number of cluster associated to each biological condition. \item The user can execute the command \texttt{head(resMfuzzLeuk500\$Data.Mfuzz)} in order to see the data used for the Mfuzz analysis, and \texttt{head(resMfuzzLeuk500\$Result.Mfuzz)} in order to see for each gene, the temporal cluster associated to each biological condition. \item \texttt{resMfuzzLeuk500\$ClustersNumbers} contains the graph indicating the selected cluster for each biological condition. The following lines of codes plot this graph. \end{itemize} <>= print(resMfuzzLeuk500$ClustersNumbers) @ \begin{itemize} \item \texttt{resMfuzzLeuk500\$Mfuzz.Plots.Group\_NP} and \texttt{resMfuzzLeuk500\$Mfuzz.Plots.Group\_P} contains the different temporal clusters for respectively the biological condition P and NP. The following lines of codes plot the temporal clusters for the biological condition P. \end{itemize} <>= print(resMfuzzLeuk500$Mfuzz.Plots.Group_P) @ Other temporal information are shown in the alluvial graph of the subsection \nameref{sec:DEanalysisLeuk} %% \nameref{sec:DEanalysis} that can be compared with the previous graphs. Write \texttt{?MFUZZanalysis} in your console for more information about the function. \noindent \textbf{Interpretation of the results}: We observe that the biological condition P admits at least 3 important clusters. Clusters 2 and 3 are interesting because they show either a significant peak at time \(t_3\) or an abrupt change in behavior. This suggests that cell activation causes a significant change in gene expression at time \(t_3\). %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Genes expression profile with DATAplotExpressionGenes()} \label{sec:PlotExpressionLeuk} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The lines of code below allow to plot, from the results of the function \textbf{DATAnormalization()} (see Section~\ref{sec:DATAnormalizationLeuk}), for each biological condition: the evolution of the 25th gene expression of the three replicates across time and the evolution of the mean and the standard deviation of the 25th gene expression across time. The color of the different lines are different for different biological conditions. The input \texttt{SEresNorm} can be either %%%%%%%%%%% \begin{itemize} \item \texttt{SEresNorm=SEresNORMleuk500} and the results of \texttt{DATAplotExpressionGenes()} will be added in the SE object which contains the results of \texttt{DATAnormalization()} \item either \texttt{SEresNorm=SEresPCALeuk500} and the results of \texttt{DATAplotExpressionGenes()} will be added in the SE object which contains the results of \texttt{DATAnormalization()} and \texttt{PCAanalysis()} \item either \texttt{SEresNorm=SEresHCPCLeuk500} and the results of \texttt{DATAplotExpressionGenes()} will be added in the SE object which contains the results of \texttt{DATAnormalization()}, \texttt{PCAanalysis()} and \texttt{HCPCanalysis()}. \item or \texttt{SEresNorm=SEresMfuzzLeuk500} and the results of \texttt{DATAplotExpressionGenes()} will be added in the SE object which contains the results of \texttt{DATAnormalization()}, \texttt{PCAanalysis()}, \texttt{HCPCanalysis()} and \texttt{MFUZZanalysis()}. \end{itemize} %%%%%%%%%%% <>= SEresEVOleuk500 <- DATAplotExpressionGenes(SEresNorm=SEresMfuzzLeuk500, Vector.row.gene=c(25, 30), Color.Group=NULL, Plot.Expression=FALSE, path.result=NULL, Name.folder.profile=NULL) @ The graphs are %%%%%%%%%%% \begin{itemize} \item stored in an SE object (see below how to visualize the elements) \item displayed if \texttt{Plot.Expression=TRUE} \item saved in a folder if the user selects a folder path in \texttt{path.result}. If \texttt{path.result=NULL} the results will not be saved in a folder. \end{itemize} %%%%%%%%%%% If the user wants to see the results of the DATAplotExpressionGenes(), he must first execute the following lines of code. <>= ## Save 'Results' of the metadata in an object resleuk500 <- S4Vectors::metadata(SEresEVOleuk500)$Results ## Save the results of DE analysis in an object resEVOleuk500 <- resleuk500[[1]][[5]] ### names(S4Vectors::metadata(SEresEVOleuk500)) str(resleuk500, max.level=3, give.attr=FALSE) names(resEVOleuk500) @ \begin{itemize} \item \texttt{resEVOleuk500\$ARL4C\_profile} or \texttt{resEVOleuk500[[1]]} contains the graph or expression profile of gene ARL4C\_profile, and \texttt{resEVOleuk500\$ARL4C\_profile} or \texttt{resEVOleuk500[[2]]} contains the graph or expression profile of gene ATP5G1\_profile \end{itemize} <>= print(resEVOleuk500$ARL4C_profile) @ By default (if \texttt{Color.Group=NULL}), a color will be automatically assigned to each biological condition. The user can change the colors by creating the following data.frame <>= colorLeuk <- data.frame(Name=c("NP", "P"), Col=c("black", "red")) @ and setting \texttt{Color.Group=colorLeuk}. %%%%%%%%% If the user wants to select several genes, for instance the 97th, the 192th, the 194th and the 494th, he needs to set \texttt{Vector.row.gene=c(97,192,194,494)}.\newline %%%%%%%%% Write \texttt{?DATAplotExpressionGenes} in your console for more information about the function. \noindent \textbf{Interpretation of the results}: The expression profile of the ARL4C gene is interesting, since the behavior of this gene is similar in both biological conditions over the first 4 time points (up to \(t_3\)). From \(t_4\) to \(t_8\), expression of the normalized gene is much lower in the P biological condition. Stimulation of the P cell membrane therefore cascades to modify the expression of several genes, resulting in inhibition of ARL4C gene expression. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Statistical analysis of the transcriptional response (supervised analysis)} \label{sec:SupervisedAnalysis_leuk} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{DE analysis with DEanalysisGlobal()}\label{sec:DEanalysisLeuk} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The lines of code below %%%%%%%%%%% \begin{itemize} \item returns a data.frame. See subsection \nameref{sec:DEsummaryLeuk} \item plots the following graphs %%%%% \begin{itemize} \item \emph{Results from the temporal statistical analysis} (\textbf{case 2} for each biological condition). See subsection \nameref{sec:DEleukGRAPHStemporel} \item \emph{Results from the statistical analysis by biological condition} (\textbf{case 1} for each fixed time). See subsection \nameref{sec:DEleukGRAPHSbiocond}. \item \emph{Results from the combination of temporal and biological statistical analysis}. See subsection \nameref{sec:DEleukGRAPHSboth} \end{itemize} %%%%% \end{itemize} %%%%%%%%%%% The input \texttt{SEres} can be either %%%%%%%%%%% \begin{itemize} \item \texttt{SEres=SEresleuk500}, and the results of \texttt{DEanalysisGlobal()} will be stored in the initial SE object \item either \texttt{SEres=SEresNORMleuk500} and the results of \texttt{DEanalysisGlobal()} will be added in the SE object which contains the results of \texttt{DATAnormalization()} \item either \texttt{SEres=SEresPCALeuk500} and the results of \texttt{DEanalysisGlobal()} will be added in the SE object which contains the results of \texttt{DATAnormalization()} and \texttt{PCAanalysis()} \item either \texttt{SEres=SEresHCPCLeuk500} and the results of \texttt{DEanalysisGlobal()} will be added in the SE object which contains the results of \texttt{DATAnormalization()}, \texttt{PCAanalysis()} and \texttt{HCPCanalysis()}. \item either \texttt{SEres=SEresMfuzzLeuk500} and the results of \texttt{DEanalysisGlobal()} will be added in the SE object which contains the results of \texttt{DATAnormalization()}, \texttt{PCAanalysis()}, \texttt{HCPCanalysis()} and \texttt{MFUZZanalysis()}. \item or \texttt{SEres=SEresEVOleuk500} and the results of \texttt{DEanalysisGlobal()} will be added in the SE object which contains the results of \texttt{DATAnormalization()}, \texttt{PCAanalysis()}, \texttt{HCPCanalysis()}, \texttt{MFUZZanalysis()} and \texttt{DATAplotExpressionGenes()}. \end{itemize} %%%%%%%%%%% <>= SEresDELeuk500 <- DEanalysisGlobal(SEres=SEresEVOleuk500, pval.min=0.05, pval.vect.t=NULL, log.FC.min=1, LRT.supp.info=FALSE, Plot.DE.graph=FALSE, path.result=NULL, Name.folder.DE=NULL) ## data("Results_DEanalysis_sub500") ## SEresDELeuk500 <- Results_DEanalysis_sub500$DE_Schleiss2021_CLLsub500 @ Due to time consuming of the DE analysis, we stored in the object \texttt{Results\_DEanalysis\_sub500} (uncommented lines) a list of three objects %%%%%%%%%%% \begin{itemize} \item \texttt{Results\_DEanalysis\_sub500\$DE\_Schleiss2021\_CLLsub500}, stored the results of \Rfunction{DEanalysisGlobal()} with \texttt{RawCounts\_Schleiss2021\_CLLsub500}. \item \texttt{Results\_DEanalysis\_sub500\$DE\_Antoszewski2022\_MOUSEsub500}, stored the results of \Rfunction{DEanalysisGlobal()} with \texttt{RawCounts\_Antoszewski2022\_MOUSEsub500}. \item \texttt{Results\_DEanalysis\_sub500\$DE\_Leong2014\_FISSIONsub500wt}, stored the results of \Rfunction{DEanalysisGlobal()} with \texttt{RawCounts\_Leong2014\_FISSIONsub500wt}. \end{itemize} %%%%%%%%%%% The graphs are %%%%%%%%%%% \begin{itemize} \item stored in an SE object (see below how to visualize the elements) \item displayed if \texttt{Plot.DE.graph=TRUE} (see the following subsections \ref{sec:DEleukGRAPHStemporel}, \ref{sec:DEleukGRAPHSbiocond} and \ref{sec:DEleukGRAPHSboth}) \item saved in a folder if the user selects a folder path in \texttt{path.result}. If \texttt{path.result=NULL} the results will not be saved in a folder. \end{itemize} %%%%%%%%%%% Write \texttt{?DEanalysisGlobal} in your console for more information about the function. %%---------------------------------------------------------------------------%% \subsubsubsection{Data.frame summarizing all the DE analysis (case 3)} \label{sec:DEsummaryLeuk} %%---------------------------------------------------------------------------%% The output data.frame can be extracted with the following line of code, <>= DEsummaryLeuk <- SummarizedExperiment::rowData(SEresDELeuk500) @ As we use abbreviated column names, we propose a glossary in order to help the user to understand meaning of each column. The glossary of the column names can be extracted with the following lines of code, <>= resDELeuk500 <- S4Vectors::metadata(SEresDELeuk500)$Results[[2]][[2]] resGlossaryLeuk <- resDELeuk500$Glossary @ and then write \texttt{DEsummaryLeuk} and \texttt{resGlossaryLeuk} in the R console. The data.frame \texttt{DEsummaryLeuk} contains %%%%%%% \begin{itemize} \item gene names (column 1) \item \emph{Results from the temporal statistical analysis} (\textbf{case 2} for each biological condition) %%% \begin{itemize} \item pvalues, log2 fold change and DE genes between each time \(t_i\) versus the reference time \(t_0\), for each biological condition (\( 3 \times (T-1) \times N_{bc}=3 \times 8 \times 2=48\) columns). \item \(N_{bc}=2\) binary columns (1 and 0), one per biological condition (with \(N_{bc}\) the number of biological conditions). A '1' in one of these two columns means that the gene is DE at least between one time \(t_i\) versus the reference time \(t_0\), for the biological condition associated to the column (see Section~\ref{sec:DEleukGRAPHStemporel}). \item \(N_{bc}=2\) columns where each element is succession of 0 and 1, one per biological condition. The positions of '1,' in one of these two columns, indicate the set of times \(t_i\) such that the gene is DE between \(t_i\) and the reference time \(t_0\), for the biological condition associated to the column. So each element of the column is what we called previously, a temporal pattern. \end{itemize} %%% \item \emph{Results from the statistical analysis by biological condition} (\textbf{case 1} for each fixed time) %%% \begin{itemize} \item pvalues, log2 fold change and DE genes between each pairs of biological conditions, for each fixed time. (\( 3 \times \frac{N_{bc} \times (N_{bc}-1)}{2} \times T= 3 \times 1 \times 9 = 27\) columns). \item \(T=9\) binary columns (1 and 0), one per time. A '1' in one of these columns, means that the gene is DE between at least one pair of biological conditions, for the fixed time associated to the column. \item \(N_{bc} \times T=2\times 9=18\) binary columns, which give the specific genes for each biological condition at each time \(t_i\). A '1' in one of these columns means that the gene is specific to the biological condition at the time associated to the column. '0' otherwise. A gene is called specific to a given biological condition BC1 at a time \(t_i\), if the gene is DE between BC1 and any other biological conditions at time \(t_i\), but not DE between any pair of other biological conditions at time \(t_i\). \item \(N_{bc} \times T=2\times 9=18\) columns filled with -1, 0 or 1. A '1' in one of these columns means that the gene is up-regulated (or over-expressed) for the biological condition at the time associated to the column. A gene is called up-regulated for a given biological condition BC1 at time \(t_i\) if the gene is specific to the biological condition BC1 at time \(t_i\) and expressions in BC1 at time \(t_i\) are higher than in the other biological conditions at time \(t_i\). A '-1' in one of these columns means that the gene is down-regulated (or under-expressed) for the biological condition at the time associated to the column. A gene is called down-regulated for a given biological condition at a time \(t_i\) BC1 if the gene is specific to the biological condition BC1 at time \(t_i\) and expressions in BC1 at time \(t_i\) are lower than in the other biological conditions at time \(t_i\). A '0' in one of these columns means that the gene is not specific to the biological condition at the time associated to the column. \item \(N_{bc}=2\) binary columns (1 and 0). A '1' in one of these columns means the gene is specific at least at one time \(t_i\), for the biological condition associated to the column. '0' otherwise. \end{itemize} \item \emph{Results from the combination of temporal and biological statistical analysis} %%% \begin{itemize} \item \(N_{bc} \times T=2\times 9=18\) binary columns, which give the signature genes for each biological condition at each time \(t_i\). A '1' in one of these columns means that the gene is a signature gene to the biological condition at the time associated to the column. '0' otherwise. A gene is called signature of a biological condition BC1 at a given time \(t_i\), if the gene is specific to the biological condition BC1 at time \(t_i\) and DE between \(t_i\) versus the reference time \(t_0\) for the biological condition BC1. \item \(N_{bc}=2\) binary columns (1 and 0). A '1' in one of these columns means the gene is signature at least at one time \(t_i\), for the biological condition associated to the column. '0' otherwise. %%% \end{itemize} %%% \end{itemize} %%---------------------------------------------------------------------------%% \subsubsubsection{Graphs from the results of the temporal statistical analysis} \label{sec:DEleukGRAPHStemporel} %%---------------------------------------------------------------------------%% From the temporal statistical analysis, the user can plot the following graphs. \noindent \emph{\(N_{bc}=2\) alluvial graphs}, one per biological condition (with \(N_{bc}\) the number of biological conditions). The code below prints the alluvial graph for the biological condition P. <>= print(resDELeuk500$DEplots_TimePerGroup$Alluvial.graph.Group_P) @ %%%%%%%%%%%%%%%%%%% \begin{description} \item[Description] The x-axis of the graph is labeled with all times except \(t_0\). For each vertical barplot, there are two strata: 1 and 0 whose sizes indicate respectively the number of DE genes and of non DE genes, between the time corresponding to the barplot and the reference time \(t_0\). %%%%%%% The alluvial graph is composed of curves, each corresponding to a single gene, which are gathered in alluvia. An alluvium is composed of all genes having the same curve: for example, an alluvium going from the stratum 0 at time \(t_1\) to the stratum 1 at time \(t_2\) corresponds to the set of genes which are not DE at \(t_1\) and are DE at time \(t_2\). Each alluvium connects pairs of consecutive barplots and its thickness gives the number of genes in the alluvium. %%%%%%% The color of each alluvium indicates the temporal group, defined as the set of genes which are all first DE at the same time with respect to the reference time \(t_0\). \item[Interpretation of the results] The alluvial graph can be used to determine the number of DE genes at least at one time for a given biological condition, since this is the size of each barplot. Using these graphs alone (one for each biological condition), we can compare the number of DE genes at least one time for all biological conditions. This graph also provides information on all temporal patterns and also the number of genes present in each of these patterns. According to the graph, the two most important DE temporal patterns for biological condition P are: "00111111" and "00000011". This allows us to deduce that times \(t_3\) and \(t_7\) are very important. This can also be visualized by the size of each alluvium at time \(t_1\) which gives the number of genes in each time group. \end{description} %%%%%%%%%%%%%%%%%%% \noindent \emph{\(N_{bc}=2\) graphs} showing the number of DE genes as a function of time for each temporal group, one per biological condition. By temporal group, we mean the sets of genes which are first DE at the same time. The code below prints the graph for the biological condition P. <>= print(resDELeuk500$DEplots_TimePerGroup$NumberDEgenes.acrossTime.perTemporalGroup.Group_P) @ %%%%%%%%%%%%%%%%%%% \begin{description} \item[Description] The graphs plot the time evolution of the number of DE genes within each temporal group, one per biological condition. The x-axis labels indicate all times except \(t_0\). %%%%%%% \item[Interpretation of the results] This graph confirms the importance of time groups \(t_3\) and \(t_7\). Genes belonging to time groups \(t_1\) and \(t_2\) seem to correspond to genes upstream of the activation cascade after stimulation of the cell membrane, and have a direct impact on hub genes which activate numerous biological pathways. This would explain the very large number of genes belonging to the time group \(t_3\). Time \(t_7\) corresponds to genes downstream of the activation cascade, and is probably largely made up of genes with a direct or indirect role in cell proliferation. This can be checked using our GO tools (see Section~\ref{sec:GOenrichmentLeuk}). \end{description} %%%%%%%%%%%%%%%%%%% \noindent \emph{One barplot} showing the number of DE genes up-regulated and down-regulated per time, for each biological condition. The graph shows the number of DE genes per time, for each biological condition. The x-axis labels indicate all times except \(t_0\). The code below prints the barplot. <>= print(resDELeuk500$DEplots_TimePerGroup$NumberDEgenes_UpDownRegulated_perTimeperGroup) @ %%%%%%%%%%%%%%%%%%% \begin{description} \item[Description] For each DE gene, we compute the sign of the log2 fold change between time \(t_i\) and time \(t_0\). If the sign is positive (resp. negative), the gene is categorized as up-regulated (resp. down-regulated). In the graph, the up-regulated (resp. down-regulated) genes are indicated in orange (resp. in light blue). %%%%%%% \item[Interpretation of the results] This graph shows that the number of DE genes is greater in biological condition P at all times, especially at times \(t_1\), \(t_3\) and \(t_7\). This means that in many biological pathways, there are far more genes activated or inhibited in the biological condition P, and these are the genes that interest biologists. \end{description} %%%%%%%%%%%%%%%%%%% \noindent \emph{\(2\times N_{bc}=4\) upset graphs}, realized with the R package UpSetR \cite{Conway2017UpSetR}, showing the number of DE genes belonging to each DE temporal pattern, for each biological condition. By temporal pattern, we mean the set of times \(t_i\) such that the gene is DE between \(t_i\) and the reference time \(t_0\). <>= print(resDELeuk500$DEplots_TimePerGroup$VennBarplot.withNumberUpRegulated.Group_P) @ %%%%%%%%%%%%%%%%%%% \begin{description} \item[Description] The graphs plot the number of genes in each DE temporal pattern in a Venn barplot, one per biological condition. By DE temporal pattern, we mean a subset of times in \(t_1, \ldots, t_n\). We say that a gene belongs to a DE temporal pattern if the gene is DE versus \(t_0\) only at the times in this DE temporal patterns. For each gene in a given DE temporal pattern, we compute the number of DE times where it is up-regulated and we use a color code in the Venn barplot to indicate the number of genes in a temporal pattern that are up-regulated a given number of times (dark blue if it is always down-regulated, lighter blue if it is up-regulated only once, \textit{etc}). For example, %%% \begin{itemize} \item the size of the dark blue barplot of the first barplot ("ti.vs.t0\_0up") gives the number of genes which are DE between \(t_3\) versus \(t_0\), \(t_4\) versus \(t_0\), ..., \(t_7\) versus \(t_0\) and \(t_8\) versus \(t_0\) and always down-regulated at each of this six times compared to \(t_0\) \item the size of the light blue barplot of the first ninth ("ti.vs.t0\_2up") gives the number of genes which are DE between \(t_6\) versus \(t_0\), \(t_7\) versus \(t_0\) and \(t_8\) versus \(t_0\) and up-regulated at only two times among \(t_6\), \(t_7\) and \(t_8\) compared to \(t_0\). \end{itemize} %%% The same graph is also given without colors with the R command \texttt{print(resDELeuk500\$DEplots\_TimePerGroup\$VennBarplot.Group\_P)}. %%%%%%% \item[Interpretation of the results] This graph identifies the most important temporal patterns for each biological condition and shows whether the genes in these temporal patterns are over- or under-expressed. In our case, the graph confirms that the two most important temporal patterns for biological condition P are: "00111111" and "00000011". Moreover, whatever the temporal pattern, almost all genes are either always over-expressed compared to \(t_0\) or always under-expressed compared to \(t_0\). \end{description} %%%%%%%%%%%%%%%%%%% \noindent \emph{One alluvial graph}, for DE genes which are DE at least at one time for each biological condition <>= print(resDELeuk500$DEplots_TimePerGroup$AlluvialGraph_DE.1tmin_perGroup) @ %%%%%%%%%%%%%%%%%%% \begin{description} \item[Description] The alluvial graph is quite simple here because there are only 2 biological conditions. It shows common and no common genes between which are DE at least at one time for the two biological conditions. %%%%%%% \item[Interpretation of the results] This graph shows that majority of DE genes are common between both biological condition. \end{description} %%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsubsubsection{Graphs from the results of the biological condition analysis} \label{sec:DEleukGRAPHSbiocond} %%---------------------------------------------------------------------------%% From the statistical analysis by biological condition, the function plots the following graphs. \noindent \emph{One barplot} showing the number of specific genes per biological condition, for each time. <>= print(resDELeuk500$DEplots_GroupPerTime$NumberSpecificGenes_UpDownRegulated_perBiologicalCondition) @ %%%%%%%%%%%%%%%%%%% \begin{description} \item[Description] A gene is called specific to a given biological condition BC1 at a time \(t_i\), if the gene is DE between BC1 and any other biological conditions at time \(t_i\), but not DE between any pair of other biological conditions at time \(t_i\). If the levels expression of a specific genes of the biological condition BC1 ii higher (resp. lower) in BC1 than the other biological conditions then the gene is categorized as specific up-regulated (resp. down-regulated). In the graph, the up-regulated (resp. down-regulated) genes are indicated in red (resp. in blue). As there are only two biological conditions here, the number of DE genes is equal to the number of specific genes and the number of specific up-regulated (resp. down-regulated) genes in condition P at any time \(t_i\) is equal to the number of specific down-regulated (resp. up-regulated) genes in condition NP at the same time \(t_i\). %%%%%%% \item[Interpretation of the results] We can see that the times when the number of specific DE genes is highest are \(t_3\), \(t_4\) and \(t_7\). This means that it is at these times that real differences appear between P and NP cells. What's also interesting is the high proportion of over-expressed genes in the P biological condition at most measurement times, implying that cell proliferation is more impacted by "stimulatory" biological pathways than by "inhibitory" ones. \end{description} %%%%%%%%%%%%%%%%%%% \noindent \emph{alluvial graph} showing the number of DE genes which are specific at least at one time for each group, plotted only if there are more than two biological conditions (which is not the case here). %%%%%%%%%%%%%%%%%%% %% \begin{description} %% \item[Description] The graph shows the number of DE genes which are specific %% at least at one time for each group. %%%%%%% %% \item[Interpretation of the results] The plots allows to detect %% genes that are specific only for one biological condition. %% \end{description} %%%%%%%%%%%%%%%%%%% \noindent \emph{\(\binom{N_{bc}}{2}=\binom{4}{2}=6\) upset graph} showing the number of genes corresponding to each possible intersection in a Venn barplot at a given time, plotted only if there are more than two biological conditions (which is not the case here). We recall that a set of pairs of biological conditions forms an intersection at a given time \(t_i\) when there is at least one gene which is DE for each of these pairs of biological conditions at time \(t_i\), but not for the others at time \(t_i\) %% \begin{itemize} %% \item a gene corresponds to a given intersection if this genes is DE for %% each pair of biological conditions in the intersection at time \(t_i\), %% but not for the others at time \(t_i\). %% \end{itemize} %%%%%%%%%%%%%%%%%%% %% \begin{description} %% \item[Description] The graph are realized with the R %% package UpSetR \cite{Conway2017UpSetR}, which corresponds to Venn diagram %% barplots for each time ti. Each graph plots the number of genes corresponding %% to each possible intersection in a Venn barplot at a given time. %% \end{itemize} %%%%%%% %% \item[Interpretation of the results] We can detect the most important %% intersections (of pairs of biological conditions) and thus identify %% which biological conditions have similar or distinct behaviors. %% \end{description} %%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsubsubsection{Graphs from the results of the combination of temporal and biological statistical analysis} \label{sec:DEleukGRAPHSboth} %%---------------------------------------------------------------------------%% From the combination of temporal and biological statistical analysis, the function plots the following graphs. \noindent \emph{One barplot} showing the number of signature genes and DE genes (but not signature) per time, for each biological condition. <>= print(resDELeuk500$DEplots_TimeAndGroup$Number_DEgenes_SignatureGenes_UpDownRegulated_perTimeperGroup) @ %%%%%%%%%%%%%%%%%%% \begin{description} \item[Description] A gene is called signature of a biological condition BC1 at a given time \(t_i\), if the gene is specific to the biological condition BC1 at time \(t_i\) and DE between \(t_i\) versus the reference time \(t_0\) for the biological condition BC1. For each DE gene, we compute the sign of the log2 fold change between time \(t_i\) and time \(t_0\). %%% \begin{itemize} \item If the sign is positive (resp. negative) for the biological condition P (resp. NP) and the gene is specific to P (resp. NP) then the gene is categorized as signature up-regulated (resp. down-regulated) to the biological condition P (resp. NP). In the graph, the up-regulated (resp. down-regulated) genes are indicated in red (resp. in blue). \item If the sign is positive (resp. negative) for the biological condition P (resp. NP) and the gene is not specific to P (resp. NP) then the gene is categorized as up-regulated (resp. down-regulated) to the biological condition P (resp. NP). In the graph, the up-regulated (resp. down-regulated) genes are indicated in orange (resp. in light blue). \end{itemize} %%%%%%% \item[Interpretation of the results] The times when the number of DE genes (here it is the same as specific) is highest are \(t_3\), \(t_4\) and \(t_7\). This means that it is at these times that real differences appear between P and NP cells. What's also interesting is the high proportion of over-expressed genes in the P biological condition at most measurement times, implying that cell proliferation is more impacted by "stimulatory" biological pathways than by "inhibitory" ones. \end{description} %%%%%%%%%%%%%%%%%%% \noindent \emph{One barplot} showing the number of genes which are DE at least at one time, specific at least at one time and signature at least at one time, for each biological condition. <>= print(resDELeuk500$DEplots_TimeAndGroup$Number_DEgenes1TimeMinimum_Specific1TimeMinimum_Signature1TimeMinimum_perBiologicalCondition) @ %%%%%%%%%%%%%%%%%%% \begin{description} \item[Interpretation of the results] The barplot summarizes well the combination of temporal and biological analysis because the figure gives for each biological condition the number of genes which are DE at least at one time, the number of specific genes at least at one time and the number of signature genes at least at one time. \end{description} %%%%%%%%%%%%%%%%%%% \noindent \emph{One alluvial graph} for DE genes which are signature at least at one time for each biological condition, only if there are more than two biological conditions (which is not the case here). %%%%%%%%%%%%%%%%%%% %% \begin{description} %% \item[Description] The plots allows to detect genes that are signature %% only for one biological condition. %% \end{description} %%%%%%%%%%%%%%%%%%% \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Volcano plots, ratio intensity (MA) plots and Heatmaps with DEplotVolcanoMA() and DEplotHeatmaps()} \label{sec:DEvolcanoMAheatmap_leuk} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsubsubsection{Volcano and MA plots (case 3)} \label{sec:DEvolcanoMALeuk} %%---------------------------------------------------------------------------%% The following lines of code allow to plot %%%%%%%%%%%%%%%%%%% \begin{itemize} \item \(\dbinom{N_{bc}}{2} \times T + (T-1)\times N_{bc} = \frac{N_{bc}(N_{bc}-1)}{2}\times T + (T-1)\times N_{bc}=25\) volcano plots (with \(N_{bc}=2\) the number of biological conditions and \(T=9\) the number of time points). \item \(\dbinom{N_{bc}}{2} \times T + (T-1)\times N_{bc}=25\) MA plots. \end{itemize} %%%%%%%%%%%%%%%%%%% allowing to separate non DE genes, DE genes below a threshold of log2 fold change and DE genes above a threshold of log2 fold change. <>= SEresVolcanoMAleuk <- DEplotVolcanoMA(SEresDE=SEresDELeuk500, NbGene.plotted=2, SizeLabel=3, Display.plots=FALSE, Save.plots=FALSE) @ If the user wants to save the graphs, the input \texttt{Save.plots} must be %%%%%%%%%%%%%%%%%%% \begin{itemize} \item either \texttt{Save.plots=TRUE}, and the graph will be saved in the same location than the input \texttt{path.result} of the function \Rfunction{DEanalysisGlobal()}. \item either a strings of characters giving the path to a folder where the graphs will be saved. The user then chooses the path of the folder where results can be saved. \end{itemize} %%%%%%%%%%%%%%%%%%% If the user wants to display the graph, he must set \texttt{Display.plots=TRUE}. If the user wants to see the results of the \textbf{DEplotVolcanoMA}, he must first execute the following lines of code. <>= ## Save 'Results' of the metadata in an object resleuk500 <- S4Vectors::metadata(SEresVolcanoMAleuk)$Results ## Save the results of DEplotVolcanoMA in an object resVolMAleuk500 <- resleuk500[[2]][[3]] ### names(S4Vectors::metadata(SEresVolcanoMAleuk)) str(resleuk500, max.level=3, give.attr=FALSE) names(resVolMAleuk500) @ \texttt{resVolMAleuk500\$Volcano} and \texttt{resVolMAleuk500\$MAplots} contain the volcano and MA plots %%%%%%%%%%%%%%%%%%% \begin{itemize} \item between each time \(t_i\) versus the reference time \(t_0\) for each biological condition, \item between each pair of biological condition for each time point, \end{itemize} %%%%%%%%%%%%%%%%%%% as can be seen with the following line of code <>= str(resVolMAleuk500$Volcano, max.level=2, give.attr=FALSE) @ and \texttt{resVolMAleuk500\$highest2DEgenes} contains the \texttt{NbGene.plotted} most important genes for each volcano and MA plot. If we call \(PV_g\) and \(FC_g\) respectively the p-value and log fold change of a gene \(g\) for a given volcano plot, then the most important genes for each volcano and MA plot are those which maximize \(PV_g^2 + FC_g^2\). The following lines plot the volcano plot for the biological condition P between \(t_2\) and \(t_0\) <>= print(resVolMAleuk500$Volcano$P$P_t2_vs_t0) @ and the following lines plot the MA plot for the biological condition P between \(t_2\) and \(t_0\). <>= print(resVolMAleuk500$MA$P$P_t2_vs_t0) @ %%%%%%%%%%%%%%%%%%% \begin{description} \item[Interpretation of the results of volcano plot] The most over-expressed (resp. under-expressed) genes are to the right (resp. left) of the volcano plot, and the most significant (resp. least significant) genes are at the top (resp. bottom) of the volcano plot. The volcano plot thus enables rapid visual identification of the "best" DE genes, which should have the lowest possible pvalue and the highest possible absolute log2 fold change. \item[Interpretation of the results of MA plot] Highly expressed genes are to the right of the graph, and genes at the top (resp. bottom) of the graph are the most over-expressed (resp. under-expressed). The "interesting" genes are therefore at the top right and bottom right. \end{description} %%%%%%%%%%%%%%%%%%% Write \texttt{?DEplotVolcanoMA} in your console for more information about the function. %%---------------------------------------------------------------------------%% \subsubsubsection{Heatmaps (case 3)} \label{sec:DEheatmapLeuk} %%---------------------------------------------------------------------------%% The following lines of code allow to plot a correlation heatmap between samples (correlation heatmap) and a heatmap across samples and genes called Zscore heatmap, for a subset of genes that can be selected by the user. The second heatmap is build from the normalized count data after being both centered and scaled (Zscore). The input \texttt{SEresDE} can be either %%%%%%%%%%% \begin{itemize} \item \texttt{SEresDE=SEresDELeuk500}, and the results of \texttt{DEplotHeatmaps()} will be added in the SE object which contains the results of \texttt{DEanalysisGlobal()} \item or \texttt{SEresDE=SEresVolcanoMAleuk} and the results of \texttt{DEplotHeatmaps()} will be added in the SE object which contains the results of \texttt{DEanalysisGlobal()} and \texttt{DEplotVolcanoMA()}. \end{itemize} %%%%%%%%%%% <>= SEresHeatmapLeuk <- DEplotHeatmaps(SEresDE=SEresVolcanoMAleuk, ColumnsCriteria=c(18, 19), Set.Operation="union", NbGene.analysis=20, SizeLabelRows=5, SizeLabelCols=5, Display.plots=FALSE, Save.plots=FALSE) @ For the Zscore heatmap, The subset of genes is selected as followss %%%%%%%%%%%%%%%%%%% \begin{enumerate} \item the user selects one or more binary column of the data.frame \texttt{DEsummaryLeuk} (see Section~\ref{sec:DEsummaryLeuk}) with the input \texttt{ColumnsCriteria} which contains the column numbers of \texttt{DEsummaryLeuk} to be selected. \item Three cases are possible: %%%%%%% \begin{itemize} \item If \texttt{Set.Operation="union"} then the rows extracted from the different datasets (raw counts, normalized data and \texttt{DEsummaryLeuk}) included in the SE object \texttt{SEresDELeuk500} are those such that the sum of the selected columns of \texttt{DEsummaryLeuk} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having at least one '1' in one of the selected columns. \item If \texttt{Set.Operation="intersect"} then the rows extracted from the different datasets (raw counts, normalized data and \texttt{DEsummaryLeuk}) included in the SE object \texttt{SEresDELeuk500} are those such that the product of the selected columns of \texttt{DEsummaryLeuk} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in all of the selected columns. \item If \texttt{Set.Operation="setdiff"} then the rows extracted from the different datasets (raw counts, normalized data and \texttt{DEsummaryLeuk}) included in the SE object \texttt{SEresDELeuk500} are those such that only one element of the selected columns of \texttt{DEsummaryLeuk} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in only one of the selected columns. \end{itemize} %%%%%%% \item Finally, the selected subset of genes will be the \texttt{NbGene.analysis} genes extracted in step 2 above, which have the highest sum of absolute log2 fold change. \end{enumerate} %%%%%%%%%%%%%%%%%%% If the user wants to save the graphs, the input \texttt{Save.plots} must be %%%%%%%%%%%%%%%%%%% \begin{itemize} \item either \texttt{Save.plots=TRUE}, and the graph will be saved in the same location than the input \texttt{path.result} of the function \Rfunction{DEanalysisGlobal()}. \item or a strings of characters giving the path to a folder where the graphs will be saved. The user then chooses the path of the folder where results can be saved. \end{itemize} %%%%%%%%%%%%%%%%%%% If the user wants to display the graph, he must set \texttt{Display.plots=TRUE}. If the user wants to see the results of the \textbf{DEplotHeatmaps}, he must first execute the following lines of code. <>= ## Save 'Results' of the metadata in an object resleuk500 <- S4Vectors::metadata(SEresHeatmapLeuk)$Results ## Save the results of DEplotHeatmaps in an object resHeatmapLeuk <- resleuk500[[2]][[4]] ### names(S4Vectors::metadata(SEresHeatmapLeuk)) str(resleuk500, max.level=3, give.attr=FALSE) names(resHeatmapLeuk) @ %%%%%%%%%%%%%%%%%%% \begin{itemize} \item \texttt{resHeatmapLeuk\$CorrelationMatrix} contains the correlation matrix between samples %%%%%%% \item \texttt{resHeatmapLeuk\$Zscores} contains the matrix of scaled rle normalized data of the \texttt{NbGene.analysis} selected genes. %%%%%%% \item \texttt{resHeatmapLeuk\$Heatmap\_Correlation} contains the heatmap associated to the matrix \texttt{resHeatmapLeuk\$CorrelationMatrix} (\textit{correlation heatmap}). %%%%%%% \item \texttt{resHeatmapLeuk\$Heatmap\_Zscore} contains the heatmap associated to the matrix\newline \texttt{resHeatmapLeuk\$CorrelationMatrix} (\textit{Zscore heatmap}). \end{itemize} %%%%%%%%%%%%%%%%%%% The following lines of code plot the \textit{correlation heatmap} and the \textit{Zscore heatmap} <>= print(resHeatmapLeuk$Heatmap_Correlation) print(resHeatmapLeuk$Heatmap_Zscore) @ Write \texttt{?DEplotHeatmaps} in your console for more information about the function. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Gene Ontology (GO) analysis with GSEAQuickAnalysis() and GSEApreprocessing()} \label{sec:GOenrichmentLeuk} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsubsection{Gene ontology with the R package gprofiler2} \label{sec:GOenrichmentLeukGprofiler2} %%---------------------------------------------------------------------------%% The lines of code below realize an enrichment analysis with the R package gprofiler2 for a selection of genes. Beware, an internet connection is needed. The function returns %%%%%%%%%%%%%%%%% \begin{itemize} \item a data.frame (output \texttt{metadata(SEresgprofiler2Leuk)\$Rgprofiler2\$GSEAresults}) giving information about all detected gene ontologies for the list of associated genes. \item a lollipop graph (see section \nameref{sec:stepsGO}). The y-axis indicates the \texttt{MaxNumberGO} most significant gene ontologies and pathways associated to the selected genes. The gene ontologies and patways are sorted into descending order. The x-axis indicates the \(-log10(pvalues)\). The higher is a lollipop the more significant is a gene ontology or pathway. A lollipop is yellow if the pvalues is smaller than 0.05 (significant) and blue otherwise. \item A Manhattan plot (see section \nameref{sec:stepsGO}) indicating all genes ontologies ordered according to the functional database (G0::BP, G0::CC, G0::MF and KEGG) \end{itemize} %%%%%%%%%%%%%%%%% The input \texttt{SEresDE} can be either %%%%%%%%%%% \begin{itemize} \item \texttt{SEresDE=SEresDELeuk500}, and the results of \texttt{GSEAQuickAnalysis()} will be added in the SE object which contains the results of \texttt{DEanalysisGlobal()} \item either \texttt{SEresDE=SEresVolcanoMAleuk} and the results of \texttt{GSEAQuickAnalysis()} will be added in the SE object which contains the results of \texttt{DEanalysisGlobal()} and \texttt{DEplotVolcanoMA()}. \item or \texttt{SEresDE=SEresHeatmapLeuk} and the results of \texttt{GSEAQuickAnalysis()} will be added in the SE object which contains the results of \texttt{DEanalysisGlobal()}, \texttt{DEplotVolcanoMA()} and \texttt{DEplotHeatmaps()}. \end{itemize} %%%%%%%%%%% <>= SEresgprofiler2Leuk <- GSEAQuickAnalysis(Internet.Connection=FALSE, SEresDE=SEresHeatmapLeuk, ColumnsCriteria=c(18), ColumnsLog2ordered=NULL, Set.Operation="union", Organism="hsapiens", MaxNumberGO=20, Background=FALSE, Display.plots=FALSE, Save.plots=FALSE) ## ## head(SEresgprofiler2Leuk$GSEAresults) @ As \textbf{GSEAQuickAnalysis()} requires an internet connection, we needed to add the input \texttt{Internet.Connection} in order to be sure to pass the tests realized on our package by Bioconductor. The input \texttt{Internet.Connection} is set by default to \texttt{FALSE} and as long as \texttt{Internet.Connection=FALSE}, no enrichment analysis will be done. Once the user is sure to have an internet connection, the user may set \texttt{Internet.Connection=TRUE} in order to realize the enrichment analysis. The subset of genes is selected as follows %%%%%%%%%%%%%%%%%%% \begin{enumerate} \item the user selects one or more binary column of the data.frame \texttt{DEsummaryLeuk} (see Section~\ref{sec:DEsummaryLeuk}) with the input \texttt{ColumnsCriteria} which contains the column numbers of \texttt{DEsummaryLeuk} to be selected. \item Three cases are possible: %%%%%%% \begin{itemize} \item If \texttt{Set.Operation="union"} then the rows extracted from the different datasets (raw counts, normalized data and \texttt{DEsummaryLeuk}) included in the SE object \texttt{SEresDELeuk500} are those such that the sum of the selected columns of \texttt{DEsummaryLeuk} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having at least one '1' in one of the selected columns. \item If \texttt{Set.Operation="intersect"} then the rows extracted from the different datasets (raw counts, normalized data and \texttt{DEsummaryLeuk}) included in the SE object \texttt{SEresDELeuk500} are those such that the product of the selected columns of \texttt{DEsummaryLeuk} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in all of the selected columns. \item If \texttt{Set.Operation="setdiff"} then the rows extracted from the different datasets (raw counts, normalized data and \texttt{DEsummaryLeuk}) included in the SE object \texttt{SEresDELeuk500} are those such that only one element of the selected columns of \texttt{DEsummaryLeuk} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in only one of the selected columns. \end{itemize} %%%%%%% \item Finally, the selected subset of genes will be the \texttt{NbGene.analysis} genes extracted in step 2 above, which have the highest sum of absolute log2 fold change. \end{enumerate} %%%%%%%%%%%%%%%%%%% If \texttt{ColumnsLog2ordered} is a vector of integers, the enrichment analysis will take into account the genes order as the first genes will be considered to have the highest biological importance and the last genes the lowest. It corresponds to the columns number of \texttt{DEsummaryLeuk} (the output of \Rfunction{DEanalysisGlobal()}) which must contains \(log_2\) fold change values. The rows of \texttt{DEsummaryLeuk} (corresponding to genes) will be decreasingly ordered according to the sum of absolute \(log_2\) fold change (the selected columns must contain \(log_2\) fold change values) before the enrichment analysis. If \texttt{ColumnsLog2ordered=NULL}, then the enrichment analysis will not take into account the genes order. If the user wants to save the graphs, the input \texttt{Save.plots} must be %%%%%%%%%%%%%%%%%%% \begin{itemize} \item either \texttt{Save.plots=TRUE}, and the graph will be saved in the same location than the input \texttt{path.result} of the function \Rfunction{DEanalysisGlobal()}. \item or a strings of characters giving the path to a folder where the graphs will be saved. The user then chooses the path of the folder where results can be saved. \end{itemize} %%%%%%%%%%%%%%%%%%% \texttt{SEresgprofiler2Leuk} contains the results of enrichment analysis where selected genes are those DE at time \(t_7\) for the biological condition P. To retrieve the results, the user must first execute\newline \texttt{resgprofiler2Leuk <- S4Vectors::metadata(SEresgprofiler2Leuk)\$Results[[2]][[5]]} %%%%%%%%%%%%%%%%%%% \begin{itemize} \item \texttt{resgprofiler2Leuk\$GSEAresults} contains a matrix which gives the results of the GSEA analysis: GO terms, GO names, p-value, GOparents, genes associated to each GO ... %%%%%%% \item \texttt{resgprofiler2Leuk\$selectedGenes} contains the list of selected genes %%%%%%% \item \texttt{resgprofiler2Leuk\$lollipopChart} contains the lollipop graph of the GSEA analysis (see Figure~\ref{fig:FissionLollipop}) %%%%%%% \item \texttt{resgprofiler2Leuk\$manhattanPlot} contains the Manhattan graph of the GSEA analysis (see Figure~\ref{fig:FissionManhattan}). \end{itemize} %%%%%%%%%%%%%%%%%%% As expected in Section~\ref{sec:DEleukGRAPHStemporel}, DE genes at time \(t_7\) for the biological condition P are linked to the cellular proliferation. Write \texttt{?GSEAQuickAnalysis} in your console for more information about the function. \clearpage %%---------------------------------------------------------------------------%% \subsubsection{Preprocessing for GSEA, DAVID, WebGestalt, gProfiler, Panther, ShinyGO, Enrichr and GOrilla} \label{sec:GOenrichmentLeukpreprocessing} %%---------------------------------------------------------------------------%% The following lines of code will generate all files, for a selection of genes, in order to use the following software and online tools : GSEA, DAVID, WebGestalt, gProfiler, Panther, ShinyGO, Enrichr and GOrilla. <>= SEresPreprocessingLeuk <- GSEApreprocessing(SEresDE=SEresDELeuk500, ColumnsCriteria=c(18, 19), Set.Operation="union", Rnk.files=FALSE, Save.files=FALSE) @ The subset of genes is selected as follows %%%%%%%%%%%%%%%%%%% \begin{enumerate} \item the user selects one or more binary column of the data.frame \texttt{DEsummaryLeuk} (see Section~\ref{sec:DEsummaryLeuk}) with the input \texttt{ColumnsCriteria} which contains the column numbers of \texttt{DEsummaryLeuk} to be selected. \item Three cases are possible: %%%%%%% \begin{itemize} \item If \texttt{Set.Operation="union"} then the rows extracted from the different datasets (raw counts, normalized data and \texttt{DEsummaryLeuk}) included in the SE object \texttt{SEresDELeuk500} are those such that the sum of the selected columns of \texttt{DEsummaryLeuk} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having at least one '1' in one of the selected columns. \item If \texttt{Set.Operation="intersect"} then the rows extracted from the different datasets (raw counts, normalized data and \texttt{DEsummaryLeuk}) included in the SE object \texttt{SEresDELeuk500} are those such that the product of the selected columns of \texttt{DEsummaryLeuk} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in all of the selected columns. \item If \texttt{Set.Operation="setdiff"} then the rows extracted from the different datasets (raw counts, normalized data and \texttt{DEsummaryLeuk}) included in the SE object \texttt{SEresDELeuk500} are those such that only one element of the selected columns of \texttt{DEsummaryLeuk} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in only one of the selected columns. \end{itemize} %%%%%%% \item Finally, the selected subset of genes will be the \texttt{NbGene.analysis} genes extracted in step 2 above, which have the highest sum of absolute log2 fold change. \end{enumerate} %%%%%%%%%%%%%%%%%%% If the user wants to save the files, the input \texttt{Save.plots} must be %%%%%%%%%%%%%%%%%%% \begin{itemize} \item either \texttt{Save.plots=TRUE}, and the graph will be saved in the same location than the input \texttt{path.result} of the function \Rfunction{DEanalysisGlobal()}. \item either a strings of characters giving the path to a folder where the graphs will be saved. The user then chooses the path of the folder where results can be saved. \end{itemize} %%%%%%%%%%%%%%%%%%% Write \texttt{?GSEApreprocessing} in your console for more information about the function. \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \section{Quick description of the analysis of a dataset with several biological conditions (case 1)} \label{sec:Analysis_case1} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% In this section we use the mouse subdataset \textbf{RawCounts\_Antoszewski2022\_MOUSEsub500} (see the subsection \ref{sec:dataAntoszewski2022}) in order to explain the use of our package in \textbf{Case 1} (several biological conditions and a single time).\newline Most of the outputs in \textbf{Case 1} are of a similar form as those shown in \textbf{Case 3} (Section~\ref{sec:DetailedAnalysis}), except for some outputs whose list is precisely given below %%% \begin{itemize} \item Subsection~\ref{sec:PlotExpressionMus1} page \pageref{sec:PlotExpressionMus1} \item Subsection~\ref{sec:DEmus1GRAPHS} page \pageref{sec:DEmus1GRAPHS}. \end{itemize} %%% Furthermore, as there is only one time point, no Mfuzz analysis is realized. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Preprocessing step with DATAprepSE()} \label{sec:DATAprepSEmus1}%%\label{sec:PreprocessingStep} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The preprocessing step is mandatory and is realized by our R function \textbf{DATAprepSE()} to store all information about the dataset in a standardized way (SummarizedExperiment class object, see Section~\ref{sec:StructureOutput}). It can be done using the following lines of code. %% The user must realize this step in order to realize %% exploratory data analysis (unsupervised analysis, %% section \ref{sec:ExploratoryAnalysis_mus1}) or %% statistical analysis of the transcriptional response %% (supervised analysis, section \ref{sec:SupervisedAnalysis_Mus1}). <>= SEresPrepMus1 <- DATAprepSE(RawCounts=RawCounts_Antoszewski2022_MOUSEsub500, Column.gene=1, Group.position=1, Time.position=NULL, Individual.position=2) @ The function returns \begin{itemize} \item A \Rclass{SummarizedExperiment} class object containing all information of the dataset to be used for exploratory data analysis \item A \Rclass{DESeqDataSet} class object to be used for statistical analysis of the transcriptional response. \end{itemize} Write \texttt{?DATAprepSE} in your console for more information about the function. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Exploratory data analysis (unsupervised analysis)} \label{sec:ExploratoryAnalysis_mus1} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Normalization with DATAnormalization()} \label{sec:DATAnormalizationMus1} %%\label{sec:DATAnormalization} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The following lines of code realize the normalization step from the results of the function \textbf{DATAprepSE()} (subsection \ref{sec:DATAprepSEmus1}) <>= SEresNormMus1 <- DATAnormalization(SEres=SEresPrepMus1, Normalization="rle", Blind.rlog.vst=FALSE, Plot.Boxplot=TRUE, Colored.By.Factors=TRUE, Color.Group=NULL, path.result=NULL) @ If \Rcode{Plot.Boxplot=TRUE} a boxplot showing the distribution of the normalized expression (\texttt{Normalization="rle"} means that the rle method is used) of genes for each sample is returned. If \texttt{Colored.By.Factors=TRUE}, the color of the boxplots will be different for different biological conditions. By default (if \texttt{Color.Group=NULL}), a color will be automatically assigned for each biological condition. Colors can be changed by creating the following data.frame <>= colMus1 <- data.frame(Name=c("N1wtT1wt", "N1haT1wt", "N1haT1ko", "N1wtT1ko"), Col=c("black", "red", "green", "blue")) @ and setting \texttt{Color.Group=colMus1}. The x-labels give biological information and individual information separated by dots. If the user wants to see the 6th first rows of the normalized data, he can write in his console \texttt{head(res.Norm.Mus500\$NormalizedData, n=6)}. The user can save the graph in a folder thanks to the input path.result. If \texttt{path.result=NULL} the results will still be plotted but not saved in a folder. Write \texttt{?DATAnormalization} in your console for more information about the function. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Factorial analysis: PCA with PCAanalysis() and clustering with HCPCanalysis()} \label{sec:FactorialMus1} %% \label{sec:Factorial} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsubsubsection{PCA (case 1)}\label{sec:PCAMus1} %%---------------------------------------------------------------------------%% When samples belong only to different biological conditions, the lines of code below return from the results of the function \textbf{DATAnormalization()} %%%%%%%%%%%%% \begin{itemize} \item the results of the R function \Rfunction{PCA()} from the package FactoMineR. \item one 2D PCA graph, one 3D PCA graph and the same 3D PCA graph in a rgl window (only if \texttt{motion3D=FALSE}) where samples are colored with different colors for different biological conditions. \end{itemize} %%%%%%%%%%%% <>= SEresPCAMus1 <- PCAanalysis(SEresNorm=SEresNormMus1, sample.deletion=NULL, gene.deletion=NULL, Plot.PCA=TRUE, Mean.Accross.Time=FALSE, Color.Group=NULL, Cex.label=0.8, Cex.point=0.7, epsilon=0.2, Phi=25,Theta=140, motion3D=FALSE, path.result=NULL) @ The graphs are \begin{itemize} \item stored in an SE object \item displayed if \texttt{Plot.PCA=TRUE} \item saved in a folder if the user selects a folder path in \texttt{path.result}. If \texttt{path.result=NULL} the results will not be saved in a folder. \end{itemize} By default (if \texttt{Color.Group=NULL}), a color will be automatically assigned to each biological condition. The user can change the colors by creating the following data.frame <>= colMus1 <- data.frame(Name=c("N1wtT1wt", "N1haT1wt", "N1haT1ko", "N1wtT1ko"), Col=c("black", "red", "green", "blue")) colMus1 @ and setting \texttt{Color.Group=colMus1}. If the user wants to delete, for instance, the genes 'ENSMUSG00000064842' and 'ENSMUSG00000051951' (respectively the second and forth gene) and/or delete the samples 'N1wtT1wt\_r2' and 'N1haT1wt\_r5', he can set \begin{itemize} \item \texttt{gene.deletion=c("ENSMUSG00000064842","ENSMUSG00000051951")} and/or\newline \texttt{sample.deletion=c("N1wtT1wt\_r2","N1haT1wt\_r5")} \item \texttt{gene.deletion=c(2,4)} and/or \texttt{sample.deletion=c(3,6)}.\newline The integers in \texttt{gene.deletion} and \texttt{sample.deletion} represent respectively the row numbers and the column numbers of \texttt{RawCounts} where the selected genes and samples are located. \end{itemize} Write \texttt{?PCAanalysis} in your console for more information about the function. \clearpage %%---------------------------------------------------------------------------%% \subsubsubsection{HCPC (case 1)}\label{sec:HCPCMus1} %%---------------------------------------------------------------------------%% The user can realize the clustering with HCPC using the function \textbf{HCPCanalysis()} as below. The following lines of code return from the results of the function \textbf{DATAnormalization()} %%%%% \begin{itemize} \item The results of the R function \Rfunction{HCPC()} from the package FactoMineR. \item A dendrogram \item A graph indicating for each sample, its cluster and the biological condition associated to the sample, using a color code \item One 2D PCA graph, one 3D PCA graph and the same 3D PCA graph in a rgl window (only if \texttt{motion3D=FALSE}). These PCA graphs are identical to the outputs of \textbf{PCAanalysis()} but samples are colored with different colors for different clusters. \end{itemize} <>= SEresHCPCMus1 <- HCPCanalysis(SEresNorm=SEresNormMus1, gene.deletion=NULL, sample.deletion=NULL, Plot.HCPC=FALSE, Cex.label=0.8, Cex.point=0.7, epsilon=0.2, Phi=25, Theta=140, motion3D=FALSE, path.result=NULL) @ The graphs are \begin{itemize} \item stored in an SE object \item displayed if \texttt{Plot.HCPC=TRUE} \item saved in a folder if the user selects a folder path in \texttt{path.result}. If \texttt{path.result=NULL} the results will not be saved in a folder. \end{itemize} %% The optimal number of clusters is automatically computed by the R function %% \Rfunction{HCPC()}. Write \texttt{?HCPCanalysis} in your console for more information about the function. \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Genes expression profile with DATAplotExpressionGenes()} \label{sec:PlotExpressionMus1} %% \label{sec:PlotExpression} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The lines of code below allow to plot, from the results of the function \textbf{DATAnormalization()}, the expression profile of the 10th gene showing for each biological condition: a box plot and error bars (standard deviation). Each box plot, violin plot and error bars is associated to the distribution of the expression of the 10th genes in all samples belonging to the corresponding biological condition. <>= resEVOmus1500 <- DATAplotExpressionGenes(SEresNorm=SEresNormMus1, Vector.row.gene=c(10), Color.Group=NULL, Plot.Expression=TRUE, path.result=NULL) @ If the user wants to select several genes , for instance the 28th, the 38th, the 39th and the 50th, he needs to set \texttt{Vector.row.gene=c(28,38:39,50)}. The graphs are \begin{itemize} \item stored in an SE object \item displayed if \texttt{Plot.Expression=TRUE} \item saved in a folder if the user selects a folder path in \texttt{path.result}. If \texttt{path.result=NULL} the results will not be saved in a folder. \end{itemize} By default (if \texttt{Color.Group=NULL}), a color will be automatically assigned to each biological condition. The user can change the colors by creating the following data.frame <>= colMus1 <- data.frame(Name=c("N1wtT1wt", "N1haT1wt", "N1haT1ko", "N1wtT1ko"), Col=c("black", "red", "green", "blue")) @ and setting \texttt{Color.Group=colMus1}. Write \texttt{?DATAplotExpressionGenes} in your console for more information about the function. \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Statistical analysis of the transcriptional response (supervised analysis)} \label{sec:SupervisedAnalysis_Mus1} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{DE analysis with DEanalysisGlobal()} \label{sec:DEanalysisMus1} %% \label{sec:DEanalysis} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The function below %%%%%%%%%%% \begin{itemize} \item returns a data.frame. See subection \nameref{sec:DEsummaryMus1}. \item plots the following graphs %%%%%%% \begin{itemize} \item an upset graph, realized with the R package UpSetR \cite{Conway2017UpSetR}, which corresponds to a Venn diagram barplot. If there are only two biological conditions, the graph will not be plotted. See subsection \nameref{sec:DEmus1GRAPHS} \item a barplot showing the number of specific genes per biological condition. See subection \nameref{sec:DEmus1GRAPHS}. \end{itemize} %%%%%%% \end{itemize} %%%%%%%%%%% Due to time consuming of the DE analysis, we stored in the object \texttt{Results\_DEanalysis\_sub500} (uncommented lines) a list of three objects %%%%%%%%%%% \begin{itemize} \item \texttt{Results\_DEanalysis\_sub500\$DE\_Schleiss2021\_CLLsub500}, stored the results of \Rfunction{DEanalysisGlobal()} with \texttt{RawCounts\_Schleiss2021\_CLLsub500}. \item \texttt{Results\_DEanalysis\_sub500\$DE\_Antoszewski2022\_MOUSEsub500}, stored the results of \Rfunction{DEanalysisGlobal()} with \texttt{RawCounts\_Antoszewski2022\_MOUSEsub500}. \item \texttt{Results\_DEanalysis\_sub500\$DE\_Leong2014\_FISSIONsub500wt}, stored the results of \Rfunction{DEanalysisGlobal()} with \texttt{RawCounts\_Leong2014\_FISSIONsub500wt}. \end{itemize} %%%%%%%%%%% <>= SEresDEMus1 <- DEanalysisGlobal(SEres=SEresPrepMus1, pval.min=0.05, pval.vect.t=NULL, log.FC.min=1, LRT.supp.info=FALSE, Plot.DE.graph=FALSE, path.result=NULL, Name.folder.DE=NULL) ## data("Results_DEanalysis_sub500") ## SEresDEMus1 <- Results_DEanalysis_sub500$DE_Antoszewski2022_MOUSEsub500 @ The graphs are %%%%%%%%%%% \begin{itemize} \item stored in an SE object \item displayed if \texttt{Plot.DE.graph=TRUE} \item saved in a folder if the user selects a folder path in \texttt{path.result}. If \texttt{path.result=NULL} the results will not be saved in a folder. \end{itemize} %%%%%%%%%%% Write \texttt{?DEanalysisGlobal} in your console for more information about the function. %%---------------------------------------------------------------------------%% \subsubsubsection{Data.frame summarising all the DE analysis (case 1)} \label{sec:DEsummaryMus1} %%---------------------------------------------------------------------------%% The output data.frame can be extracted with the following line of code, <>= DEsummaryMus1 <- SummarizedExperiment::rowData(SEresDEMus1) @ As we use abbreviated column names, we propose a glossary in order to help the user to understand meaning of each column. The glossary of the column names can be extracted with the following lines of code, <>= resDEMus1 <- S4Vectors::metadata(SEresDEMus1)$Results[[2]][[2]] resGlossaryMus1 <- resDEMus1$Glossary @ and then write \texttt{DEsummaryMus1} and \texttt{resGlossaryMus1} in the R console. The data.frame contains %%%%%%%%%%% \begin{itemize} \item gene names (column 1) \item pvalues, log2 fold change and DE genes between each pairs of biological conditions (for a total of \( 3 \times \frac{N_{bc} \times (N_{bc}-1)}{2}= 3 \times 6 = 18\) columns). \item a binary column (1 and 0) where 1 means that the gene is DE between at least one pair of biological conditions. \item \(N_{bc}=4\) binary columns (with \(N_{bc}\) the number of biological conditions), which give the specific genes for each biological condition. A '1' in one of these columns means that the gene is specific to the biological condition associated to the column. 0 otherwise. A gene is called specific to a given biological condition BC1, if the gene is DE between BC1 and any other biological conditions, but not DE between any pair of other biological conditions. \item \(N_{bc}=4\) columns filled with -1, 0 and 1, one per biological condition. A '1' in one of these columns means that the gene is up-regulated (or over-expressed) for the biological condition associated to the column. A gene is called up-regulated for a given biological condition BC1 if the gene is specific to the biological condition BC1 and expressions in BC1 are higher than in the other biological conditions. A '-1' in one of these columns means the gene is down-regulated (or under-expressed) for the biological condition associated to the column. A gene is called down-regulated for a given biological condition BC1 if the gene is specific to the biological condition BC1 and expressions in BC1 are lower than in the other biological conditions. A '0' in one of these columns means that the gene is not specific to the biological condition associated to the column. \end{itemize} %%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsubsubsection{Description of graphs}\label{sec:DEmus1GRAPHS} %%---------------------------------------------------------------------------%% The user can plot three graphs. \noindent \emph{One upset graph.} The graph plots the number of genes corresponding to each possible intersection in a Venn barplot. We say that %%%%%%% \begin{itemize} \item a set of pairs of biological conditions forms an intersection when there is at least one gene which is DE for each of these pairs of biological conditions, but not for the others. \item a gene corresponds to a given intersection if this genes is DE for each pair of biological conditions in the intersection, but not for the others. \end{itemize} %%%%%%% The following line of code plots the Venn barplot <>= print(resDEMus1$VennBarplot) @ The second column of Venn barplot the gives the number of genes corresponding to the intersection \[\biggl\{\{\mbox{N1wtT1wt, N1haT1wt}\}, \{\mbox{N1wtT1wt, N1haT1ko}\}, \{\mbox{N1wtT1wt, N1wtT1ko}\}\biggl\} \ .\] In other words, this is the number of genes DE between \begin{itemize} \item N1wtT1wt versus N1haT1wt \item N1wtT1wt versus N1haT1ko \item N1wtT1wt versus N1wtT1ko \end{itemize} and not DE between any other pairs of biological conditions. Note that this columns actually gives the number of specific genes to the biological condition N1wtT1wt. \noindent \emph{One barplot.} The graph plots for each biological condition BC1 %%%%%%%% \begin{itemize} \item The number of up- and down-regulated genes which are specific to the biological condition BC1. \item The number of genes categorized as ``Other.'' A gene belongs to the ``Other'' category if it is DE between BC1 and at least one other biological condition and not specific. \end{itemize} %%%%%%%% The following line of code plots the barplot <>= print(resDEMus1$NumberDEgenes_SpecificAndNoSpecific) @ \noindent \emph{One barplot.} The second barplot is the same graph without the ``Other'' category will be plotted too. If there are only two biological condition, only the second barplot will be plotted. The following line of code plots the second barplot <>= print(resDEMus1$NumberDEgenes_SpecificGenes) @ \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Volcano plots, ratio intensity (MA) plots and Heatmaps with DEplotVolcanoMA() and DEplotHeatmaps()} \label{sec:DEvolcanoMAheatmapMus1} %% \label{sec:DEvolcanoMAheatmap} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsubsubsection{Volcano and MA plots (case 1)}\label{sec:DEvolcanoMAMus1} %%---------------------------------------------------------------------------%% The following lines of code allow to plot \begin{itemize} \item \(\dbinom{N_{bc}}{2}=\frac{N_{bc}(N_{bc}-1)}{2}=6\) volcano plots (with \(N_{bc}=4\) the number of biological conditions). \item \(\frac{N_{bc}(N_{bc}-1)}{2}=6\) MA plots. \end{itemize} allowing to separate non DE genes, DE genes below a threshold of log2 fold change and DE genes above a threshold of log2 fold change (see Section~\ref{sec:DEvolcanoMALeuk} for more details). <>= SEresVolcanoMAMus1 <- DEplotVolcanoMA(SEresDE=SEresDEMus1, NbGene.plotted=2, SizeLabel=3, Display.plots=FALSE, Save.plots=FALSE) @ If the user wants to save the graphs, the input \texttt{Save.plots} must be %%%%%%%%%%%%%%%%%%% \begin{itemize} \item either \texttt{Save.plots=TRUE}, and the graph will be saved in the same location than the input \texttt{path.result} of the function \Rfunction{DEanalysisGlobal()}. \item or a string of characters giving the path to a folder where the graphs will be saved. The user then chooses the path of the folder where results can be saved. \end{itemize} %%%%%%%%%%%%%%%%%%% If the user wants to display the graph, he must set \texttt{Display.plots=TRUE}. Write \texttt{?DEplotVolcanoMA} in your console for more information about the function. %%---------------------------------------------------------------------------%% \subsubsubsection{Heatmaps (case 1)}\label{sec:DEheatmapMus1} %%---------------------------------------------------------------------------%% The following lines of code allow to plot a correlation heatmap between samples (correlation heatmap) and a heatmap across samples and genes called Zscore heatmap, for a subset of genes that can be selected by the user. The second heatmap is build from the normalized count data after being both centered and scaled (Zscore). <>= SEresVolcanoMAMus1 <- DEplotHeatmaps(SEresDE=SEresDEMus1, ColumnsCriteria=2,#c(2,4), Set.Operation="union", NbGene.analysis=20, SizeLabelRows=5, SizeLabelCols=5, Display.plots=FALSE, Save.plots=FALSE) @ For the Zscore heatmap, The subset of genes is selected as followss %%%%%%%%%%%%%%%%%%% \begin{enumerate} \item the user selects one or more binary column of the data.frame \texttt{DEsummaryMus1} (see Section~\ref{sec:DEsummaryMus1}) with the input \texttt{ColumnsCriteria} which contains the column numbers of \texttt{DEsummaryMus1} to be selected. \item Three cases are possible: %%%%%%% \begin{itemize} \item If \texttt{Set.Operation="union"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryMus1}) included in the SE object \texttt{SEresDEMus1} are those such that the sum of the selected columns of \texttt{DEsummaryMus1} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having at least one '1' in one of the selected columns. \item If \texttt{Set.Operation="intersect"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryMus1}) included in the SE object \texttt{SEresDEMus1} are those such that the product of the selected columns of \texttt{DEsummaryMus1} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in all of the selected columns. \item If \texttt{Set.Operation="setdiff"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryMus1}) included in the SE object \texttt{SEresDEMus1} are those such that only one element of the selected columns of \texttt{DEsummaryMus1} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in only one of the selected columns. \end{itemize} %%%%%%% \item Finally, the selected subset of genes will be the \texttt{NbGene.analysis} genes extracted in step 2 above, which have the highest sum of absolute log2 fold change. \end{enumerate} %%%%%%%%%%%%%%%%%%% If the user wants to save the graphs, the input \texttt{Save.plots} must be %%%%%%%%%%%%%%%%%%% \begin{itemize} \item either \texttt{Save.plots=TRUE}, and the graph will be saved in the same location than the input \texttt{path.result} of the function \Rfunction{DEanalysisGlobal()}. \item or a strings of characters giving the path to a folder where the graphs will be saved. The user then chooses the path of the folder where results can be saved. \end{itemize} If the user wants to display the graph, he must set \texttt{Display.plots=TRUE}. Write \texttt{?DEplotHeatmaps} in your console for more information about the function. \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Gene Ontology (GO) analysis with GSEAQuickAnalysis() and GSEApreprocessing()} \label{sec:GOenrichmentMus1} %% \label{sec:GOenrichment} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsubsection{Gene ontology with the R package gprofiler2} \label{sec:GOenrichmentMus1Gprofiler2} %%---------------------------------------------------------------------------%% The lines of code below realize an enrichment analysis with the R package gprofiler2 for a selection of genes. Beware, an internet connection is needed. The function returns %%%%%%%%%%%%%%%%% \begin{itemize} \item a data.frame (output \texttt{metadata(SEresgprofiler2Mus1)\$Rgprofiler2\$GSEAresults}) giving information about all detected gene ontologies for the list of associated genes. \item a lollipop graph (see section \nameref{sec:stepsGO}). The y-axis indicates the \texttt{MaxNumberGO} most significant gene ontologies and pathways associated to the selected genes. The gene ontologies and patways are sorted into descending order. The x-axis indicates the \(-log10(pvalues)\). The higher is a lollipop the more significant is a gene ontology or pathway. A lollipop is yellow if the pvalues is smaller than 0.05 (significant) and blue otherwise. \item A Manhattan plot (see section \nameref{sec:stepsGO}) indicating all genes ontologies ordered according to the functional database (G0::BP, G0::CC, G0::MF and KEGG) \end{itemize} %%%%%%%%%%%%%%%%% <>= SEresgprofiler2Mus1 <- GSEAQuickAnalysis(Internet.Connection=FALSE, SEresDE=SEresDEMus1, ColumnsCriteria=2, ColumnsLog2ordered=NULL, Set.Operation="union", Organism="mmusculus", MaxNumberGO=10, Background=FALSE, Display.plots=FALSE, Save.plots=FALSE) ## ## head(4Vectors::metadata(SEresgprofiler2Mus1)$Rgprofiler2$GSEAresults) @ As \textbf{GSEAQuickAnalysis()} requires an internet connection, we needed to add the input \texttt{Internet.Connection} in order to be sure to pass the tests realized on our package by Bioconductor. The input \texttt{Internet.Connection} is set by default to \texttt{FALSE} and as long as \texttt{Internet.Connection=FALSE}, no enrichment analysis will be done. Once the user is sure to have an internet connection, the user may set \texttt{Internet.Connection=TRUE} in order to realize the enrichment analysis. The subset of genes is selected as follows %%%%%%%%%%%%%%%%%%% \begin{enumerate} \item the user selects one or more binary column of the data.frame \texttt{DEsummaryMus1} (see Section~\ref{sec:DEsummaryMus1}) with the input \texttt{ColumnsCriteria} which contains the column numbers of \texttt{DEsummaryMus1} to be selected. \item Three cases are possible: %%%%%%% \begin{itemize} \item If \texttt{Set.Operation="union"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryMus1}) included in the SE object \texttt{SEresDEMus1} are those such that the sum of the selected columns of \texttt{DEsummaryMus1} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having at least one '1' in one of the selected columns. \item If \texttt{Set.Operation="intersect"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryMus1}) included in the SE object \texttt{SEresDEMus1} are those such that the product of the selected columns of \texttt{DEsummaryMus1} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in all of the selected columns. \item If \texttt{Set.Operation="setdiff"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryMus1}) included in the SE object \texttt{SEresDEMus1} are those such that only one element of the selected columns of \texttt{DEsummaryMus1} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in only one of the selected columns. \end{itemize} %%%%%%% \item Finally, the selected subset of genes will be the \texttt{NbGene.analysis} genes extracted in step 2 above, which have the highest sum of absolute log2 fold change. \end{enumerate} %%%%%%%%%%%%%%%%%%% If \texttt{ColumnsLog2ordered} is a vector of integers, the enrichment analysis will take into account the genes order as the first genes will be considered to have the highest biological importance and the last genes the lowest. It corresponds to the columns number of \texttt{DEsummaryMus1}, the output of \Rfunction{DEanalysisGlobal()}, which must contains \(log_2\) fold change values. The rows of \texttt{DEsummaryMus1} (corresponding to genes) will be decreasingly ordered according to the sum of absolute \(log_2\) fold change (the selected columns must contain \(log_2\) fold change values) before the enrichment analysis. If \texttt{ColumnsLog2ordered=NULL}, then the enrichment analysis will not take into account the genes order. If the user wants to save the graphs, the input \texttt{Save.plots} must be %%%%%%%%%%%%%%%%%%% \begin{itemize} \item either \texttt{Save.plots=TRUE}, and the graph will be saved in the same location than the input \texttt{path.result} of the function \Rfunction{DEanalysisGlobal()}. \item or a strings of characters giving the path to a folder where the graphs will be saved. The user then chooses the path of the folder where results can be saved. \end{itemize} %%%%%%%%%%%%%%%%%%% Write \texttt{?GSEAQuickAnalysis} in your console for more information about the function. \clearpage %%---------------------------------------------------------------------------%% \subsubsection{Preprocessing for GSEA, DAVID, WebGestalt, gProfiler, Panther, ShinyGO, Enrichr and GOrilla} \label{sec:GOenrichmentMus1preprocessing} %%---------------------------------------------------------------------------%% The following lines of code will generate all files, for a selection of genes, in order to use the following software and online tools : GSEA, DAVID, WebGestalt, gProfiler, Panther, ShinyGO, Enrichr and GOrilla. <>= SEresPreprocessingMus1 <- GSEApreprocessing(SEresDE=SEresDEMus1, ColumnsCriteria=2, Set.Operation="union", Rnk.files=FALSE, Save.files=FALSE) @ The subset of genes is selected as follows %%%%%%%%%%%%%%%%%%% \begin{enumerate} \item the user selects one or more binary column with the input \texttt{ColumnsCriteria} which corresponds to the column number of \texttt{rowData(SEresDEMus1)} \item Then the subset of genes will be %%%%%%%%% \begin{itemize} \item If \texttt{Set.Operation="union"} then the rows extracted from the different datasets included in \texttt{SEresDEMus1} are those such that the sum of the selected columns of \texttt{SummarizedExperiment::rowData(SEresDEMus1)} by \texttt{ColumnsCriteria} is >0. \item If \texttt{Set.Operation="intersect"} then the rows extracted from the different datasets included in \texttt{SEresDEMus1} are those such that the product of the selected columns of \texttt{SummarizedExperiment::rowData(SEresDEMus1)} by \texttt{ColumnsCriteria} is >0. \item If \texttt{Set.Operation="setdiff"} then the rows extracted from the different datasets included in \texttt{SEresDEMus1} are those such that only one element of the selected columns of \texttt{SummarizedExperiment::rowData(SEresDEMus1)} by \texttt{ColumnsCriteria} is >0. \end{itemize} %%%%%%%%% \end{enumerate} %%%%%%%%%%%%%%%%%%% If the user wants to save the files, the input \texttt{Save.plots} must be %%%%%%%%%%%%%%%%%%% \begin{itemize} \item either \texttt{Save.plots=TRUE}, and the graph will be saved in the same location than the input \texttt{path.result} of the function \Rfunction{DEanalysisGlobal()}. \item or a strings of characters giving the path to a folder where the graphs will be saved. The user then chooses the path of the folder where results can be saved. \end{itemize} %%%%%%%%%%%%%%%%%%% Write \texttt{?GSEApreprocessing} in your console for more information about the function. \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \section{Quick description of the analysis of a dataset with several time points (case 2)} \label{sec:Analysis_case2} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% In this section we use the fission yeast subdataset \textbf{RawCounts\_Leong2014\_FISSIONsub500wt} (see the subsection \nameref{sec:dataLeong2014}) in order to explain the use of our package in \textbf{Case 2} (several times point and a single biological condiion).\newline Most of the outputs in \textbf{Case 2} are of a similar form as those shown in \textbf{Case 3} (Section~\ref{sec:DetailedAnalysis}), except for the output described in Subsection~\ref{sec:PlotExpressionFission} page \pageref{sec:PlotExpressionFission} which can be considered as slightly different. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Preprocessing step with DATAprepSE()} \label{sec:DATAprepSEfission} %%\label{sec:PreprocessingStep} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The preprocessing step is mandatory and is realized by our R function \textbf{DATAprepSE()} to store all information about the dataset in a standardized way (SummarizedExperiment class object, see Section~\ref{sec:StructureOutput}). It can be done using the following lines of code. <>= SEresPrepFission <- DATAprepSE(RawCounts=RawCounts_Leong2014_FISSIONsub500wt, Column.gene=1, Group.position=NULL, Time.position=2, Individual.position=3) @ The function returns %%%%%%%%%%%%%%%%%%% \begin{itemize} \item A \Rclass{SummarizedExperiment} class object containing all information of the dataset to be used for exploratory data analysis \item A \Rclass{DESeqDataSet} class object to be used for statistical analysis of the transcriptional response. \end{itemize} %%%%%%%%%%%%%%%%%%% Write \texttt{?DATAprepSE} in your console for more information about the function. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Exploratory data analysis (unsupervised analysis)} \label{sec:ExploratoryAnalysis_Fission} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Normalization with DATAnormalization()} \label{sec:DATAnormalizationFission} %% \label{sec:DATAnormalization} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The following lines of code realize the normalization step from the results of the function \textbf{DATAprepSE()} (subsection \ref{sec:DATAprepSEfission}). <>= SEresNormYeast <- DATAnormalization(SEres=SEresPrepFission, Normalization="rlog", Blind.rlog.vst=FALSE, Plot.Boxplot=FALSE, Colored.By.Factors=TRUE, Color.Group=NULL, Plot.genes=FALSE, path.result=NULL) @ If \texttt{Plot.Boxplot=TRUE} a boxplot showing the distribution of the normalized expression\newline (\texttt{Normalization="rlog"} means that the rlog method is used) of genes for each sample is returned. If \texttt{Colored.By.Factors=TRUE}, the color of the boxplots would be different for different time points. In case 2, the option \texttt{Color.Group} is not used. The x-labels indicate time information and individual information separated by a dot. If the user wants to see the 10 first rows of the normalized data, he can write in his console \texttt{head(SEresNormYeast\$NormalizedData, n=10)}. The user chooses the path of the folder where the graph can be saved. If \texttt{path.result=NULL}, results are plotted but not saved. Write \texttt{?DATAnormalization} in your console for more information about the function. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Factorial analysis: PCA with PCAanalysis() and clustering with HCPCanalysis()} \label{sec:FactorialFission} %% \label{sec:Factorial} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsubsubsection{PCA (case 2)}\label{sec:PCAFission} %%---------------------------------------------------------------------------%% When samples belong only to different times of measure, the lines of code below return from the results of the function \textbf{DATAnormalization()} %%%%%%%%%%%%%%%%%%% \begin{itemize} \item the results of the function \Rfunction{PCA()} \item one 2D PCA graph, one 3D PCA graph and the same 3D PCA graph in a rgl window (only if \texttt{motion3D=FALSE}) where samples are colored with different colors for different time points. Furthermore, lines are drawn in gray between each pair of consecutive points for each individual (if \texttt{Mean.Accross.Time=FALSE}, otherwise lines will be drawn only between mean values of all individuals for each time point). \item the same graphs as above but without lines (not shown). \end{itemize} %%%%%%%%%%%%%%%%%%% <>= SEresPCAyeast <- PCAanalysis(SEresNorm=SEresNormYeast, gene.deletion=NULL, sample.deletion=NULL, Plot.PCA=FALSE, Mean.Accross.Time=FALSE, Cex.label=0.8, Cex.point=0.7, epsilon=0.3, Phi=25,Theta=140, motion3D=FALSE, path.result=NULL) @ The graphs are \begin{itemize} \item stored in an SE object \item displayed if \texttt{Plot.PCA=TRUE} \item saved in a folder if the user selects a folder path in \texttt{path.result}. If \texttt{path.result=NULL} the results will not be saved in a folder. \end{itemize} The user cannot select a color for each time. A specific palette is automatically used. These figures show that the temporal behavior is similar between individuals. If the user wants for instance, to perform the PCA analysis without the genes 'SPAC212.11' and 'SPNCRNA.70' (first and third gene) and/or without the samples 'wt\_t0\_r2' and 'wt\_t1\_r1', he can set %%%%%%%%%%%%%%%%%%% \begin{itemize} \item either \texttt{gene.deletion=c("SPAC212.11","SPNCRNA.70")} and/or\newline \texttt{sample.deletion=c("wt\_t0\_r2","wt\_t1\_r1")}, \item or \texttt{gene.deletion=c(1,3)} and/or \texttt{sample.deletion=c(3,5)}.\newline The integers in \texttt{gene.deletion} and \texttt{sample.deletion} represent respectively the row numbers (resp. the column numbers) of \texttt{RawCounts} corresponding to genes (resp. samples) that need to be removed from \texttt{RawCounts}. \end{itemize} %%%%%%%%%%%%%%%%%%% Write \texttt{?PCAanalysis} in your console for more information about the function. %%---------------------------------------------------------------------------%% \subsubsubsection{HCPC (case 2)}\label{sec:HCPCFission} %%---------------------------------------------------------------------------%% The user can realize the clustering with HCPC using the function \textbf{HCPCanalysis()} as below. The following lines of code return from the results of the function \textbf{DATAnormalization()} %%%%% \begin{itemize} \item The results of the R function \Rfunction{HCPC()} from the package FactoMineR. \item A dendrogram \item A graph indicating for each sample, its cluster and the biological condition associated to the sample, using a color code \item One 2D PCA graph, one 3D PCA graph and the same 3D PCA graph in a rgl window (only if \texttt{motion3D=FALSE}). These PCA graphs are identical to the outputs of \textbf{PCAanalysis()} but samples are colored with different colors for different clusters. \end{itemize} <>= SEresHCPCyeast <- HCPCanalysis(SEresNorm=SEresNormYeast, gene.deletion=NULL, sample.deletion=NULL, Plot.HCPC=FALSE, Cex.label=0.9, Cex.point=0.7, epsilon=0.2, Phi=25,Theta=140, motion3D=FALSE, path.result=NULL) @ The graphs are %%%%%%%%%%%%%%%%%%% \begin{itemize} \item stored in an SE object \item displayed if \texttt{Plot.HCPC=TRUE} \item saved in a folder if the user selects a folder path in \texttt{path.result}. If \texttt{path.result=NULL} the results will not be saved in a folder. \end{itemize} %%%%%%%%%%%%%%%%%%% Write \texttt{?HCPCanalysis} in your console for more information about the function. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Temporal clustering analysis with MFUZZanalysis()} \label{sec:MFUZZfission} %% \label{sec:MFUZZanalysis} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The following function realizes the temporal clustering analysis. It takes as input, a number of clusters (\texttt{DataNumberCluster}) that can be chosen automatically if \texttt{DataNumberCluster=NULL} and the results of the function \textbf{DATAnormalization()} (see Section~\ref{sec:DATAnormalizationFission}). The lines of code below return for each biological condition %%%%%%%%%%% \begin{itemize} \item the summary of the results of the R function \Rfunction{mfuzz()} from the package Mfuzz. \item the scaled height plot, computed with the \Rfunction{HCPC()} function, and shows the number of clusters chosen automatically (if \texttt{DataNumberCluster=NULL}). If \texttt{Method="hcpc"}, the function plots the scaled within-cluster inertia, but if \texttt{Method="kmeans"}, the function plots the scaled within-cluster inertia. As the number of genes can be very high, we recommend to select \texttt{Method="hcpc"} which is by default. \item the output graphs from the R package \texttt{Mfuzz} showing the most common temporal behavior among all genes for each biological condition. The plots below correspond to the biological condition 'P'. \end{itemize} %%%%%%%%%%% <>= SEresMfuzzYeast <- MFUZZanalysis(SEresNorm=SEresNormYeast, DataNumberCluster=NULL, Method="hcpc", Membership=0.5, Min.std=0.1, Plot.Mfuzz=FALSE, path.result=NULL) @ The graphs are %%%%%%%%%%%%%%%%%%% \begin{itemize} \item stored in an SE object \item displayed if \texttt{Plot.Mfuzz=TRUE} \item saved in a folder if the user selects a folder path in \texttt{path.result}. If \texttt{path.result=NULL} the results will not be saved in a folder. \end{itemize} %%%%%%%%%%%%%%%%%%% Write \texttt{?MFUZZanalysis} in your console for more information about the function. \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Genes expression profile with DATAplotExpressionGenes()} \label{sec:PlotExpressionFission} %% \label{sec:PlotExpression} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The lines of code below plot, from the results of the function \textbf{DATAnormalization()}, %%%%%%%%%%%%%%%%%%% \begin{itemize} \item the evolution of the 17th gene expression of all three replicates across time (purple lines) \item the evolution of the mean and the standard deviation of the 17th gene expression across time (black lines). \end{itemize} %%%%%%%%%%%%%%%%%%% <>= SEresEVOyeast <- DATAplotExpressionGenes(SEresNorm=SEresNormYeast, Vector.row.gene=c(17), Plot.Expression=TRUE, path.result=NULL) @ The graphs are %%%%%%%%%%%%%%%%%%% \begin{itemize} \item stored in an SE object \item displayed if \texttt{Plot.Expression=TRUE} \item saved in a folder if the user selects a folder path in \texttt{path.result}. If \texttt{path.result=NULL} the results will not be saved in a folder. \end{itemize} %%%%%%%%%%%%%%%%%%% If the user wants to select several genes, for instance the 1st, the 2nd, the 17th and the 19th, he needs to set \texttt{Vector.row.gene=c(1,2,17,19)}. Write \texttt{?DATAplotExpressionGenes} in your console for more information about the function. \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Statistical analysis of the transcriptional response for the four dataset (supervised analysis)} \label{sec:SupervisedAnalysis_Fission} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{DE analysis with DEanalysisGlobal()} \label{sec:DEanalysisFission} %% \label{sec:DEanalysis} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The function below %%%%%%%%%%%%%%%%%%% \begin{itemize} \item returns a data.frame. See subsection \nameref{sec:DEsummaryFission}. \item plots the following graphs %%%%%%%%% \begin{itemize} \item an alluvial graph. See subsection \nameref{sec:DEyeastGRAPHS} \item a graph showing the number of DE genes as a function of time for each temporal group. By temporal group, we mean the sets of genes which are first DE at the same time. See subsection \nameref{sec:DEyeastGRAPHS} \item a barplot showing the number of DE genes (up- or down-regulated) for each time. See subsection \nameref{sec:DEyeastGRAPHS}. \item two upset graphs, realized with the R package UpSetR \cite{Conway2017UpSetR}, showing the number of DE genes belonging to each DE temporal pattern. By temporal pattern, we mean the set of times \(t_i\) such that the gene is DE between \(t_i\) and the reference time \(t_0\). See subsection \nameref{sec:DEyeastGRAPHS}. \end{itemize} %%%%%%%%% \end{itemize} %%%%%%%%%%%%%%%%%%% The commented lines take too much time, uncomment them in order to use the function \Rfunction{DEanalysisGlobal()}. Due to time consuming of the DE analysis, we stored in the object \texttt{Results\_DEanalysis\_sub500} (uncommented lines) a list of three objects %%%%%%%%%%% \begin{itemize} \item \texttt{Results\_DEanalysis\_sub500\$DE\_Schleiss2021\_CLLsub500}, stored the results of \Rfunction{DEanalysisGlobal()} with \texttt{RawCounts\_Schleiss2021\_CLLsub500}. \item \texttt{Results\_DEanalysis\_sub500\$DE\_Antoszewski2022\_MOUSEsub500}, stored the results of \Rfunction{DEanalysisGlobal()} with \texttt{RawCounts\_Antoszewski2022\_MOUSEsub500}. \item \texttt{Results\_DEanalysis\_sub500\$DE\_Leong2014\_FISSIONsub500wt}, stored the results of \Rfunction{DEanalysisGlobal()} with \texttt{RawCounts\_Leong2014\_FISSIONsub500wt}. \end{itemize} %%%%%%%%%%% <>= # DEyeastWt <- DEanalysisGlobal(SEres=SEresPrepFission, log.FC.min=1, # pval.min=0.05, pval.vect.t=NULL, # LRT.supp.info=FALSE, Plot.DE.graph =FALSE, # path.result=NULL, Name.folder.DE=NULL) data("Results_DEanalysis_sub500") DEyeastWt <- Results_DEanalysis_sub500$DE_Leong2014_FISSIONsub500wt @ The graphs are %%%%%%%%%%%%%%%%%%% \begin{itemize} \item stored in an SE object \item displayed if \texttt{Plot.DE.graph=TRUE} \item saved in a folder if the user selects a folder path in \texttt{path.result}. If \texttt{path.result=NULL} the results will not be saved in a folder. \end{itemize} %%%%%%%%%%%%%%%%%%% Write \texttt{?DEanalysisGlobal} in your console for more information about the function. %%---------------------------------------------------------------------------%% \subsubsubsection{Data.frame summarising all the DE analysis (case 2)} \label{sec:DEsummaryFission} %%---------------------------------------------------------------------------%% The output data.frame can be extracted with the following line of code, <>= DEsummaryFission <- SummarizedExperiment::rowData(DEyeastWt) @ As we use abbreviated column names, we propose a glossary in order to help the user to understand meaning of each column. The glossary of the column names can be extracted with the following lines of code, <>= resDEyeast <- S4Vectors::metadata(DEyeastWt)$Results[[2]][[2]] resGlossaryFission <- resDEyeast$Glossary @ and then write \texttt{DEsummaryFission} and \texttt{resGlossaryFission} in the R console. The data.frame contains %%%%%%%%%%%%%%%%%%% \begin{itemize} \item gene names \item pvalues, log2 fold change and DE genes between each time \(t_i\) versus the reference time \(t_0\) (for a total of \(T-1 = 5\) columns). \item a binary column (1 and 0) where 1 means that the gene is DE at least at between one time \(t_i\) versus the reference time \(t_0\). \item a column where each element is succession of 0 and 1. The positions of '1' indicate the set of times \(t_i\) such that the gene is DE between \(t_i\) and the reference time \(t_0\). So each element of the column is what we called previously, a temporal pattern. \end{itemize} %%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsubsubsection{Description of graphs}\label{sec:DEyeastGRAPHS} %%---------------------------------------------------------------------------%% The function returns the following plots %%%%%%%%%%%%%%%%%%% \begin{enumerate} \item An alluvial graph. The x-axis of the alluvial graph is labeled with all times except \(t_0\). For each vertical barplot, there are two strata: 1 and 0 whose sizes indicate respectively the number of DE genes and of non DE genes, between the time corresponding to the barplot and the reference time \(t_0\). %%%%%% \newline The alluvial graph is composed of curves, each corresponding to a single gene, which are gathered in alluvia. An alluvium is composed of all genes having the same curve: for example, an alluvium going from the stratum 0 at time \(t_1\) to the stratum 1 at time \(t_2\) corresponds to the set of genes which are not DE at \(t_1\) and are DE at time \(t_2\). Each alluvium connects pairs of consecutive barplots and its thickness gives the number of genes in the alluvium. %%%%%%%%%%%% The color of each alluvium indicates the temporal group, defined as the set of genes which are all first DE at the same time with respect to the reference time \(t_0\). \item A graph giving the time evolution of the number of DE genes within each temporal group. The x-axis labels indicate all times except \(t_0\). %%%%%%%% \item A barplot showing the number of DE genes per time. The x-axis labels indicate all times except \(t_0\). %%%%%%%%% For each DE gene, we compute the sign of the log2 fold change between time ti and time \(t_0\). If the sign is positive (resp. negative), the gene is categorized as up-regulated (resp. down-regulated). In the graph, the up-regulated (resp. down-regulated) genes are indicated in red (resp. in blue). \item An upset graph, realized with the R package UpSetR \cite{Conway2017UpSetR}, plotting the number of genes in each DE temporal pattern in a Venn barplot. By DE temporal pattern, we mean a subset of times in \(t_1, \ldots, t_n\). We say that a gene belongs to a DE temporal pattern if the gene is DE versus \(t_0\) only at the times in this DE temporal patterns. %%%%%%%%%%%% For each gene in a given DE temporal pattern, we compute the number of DE times where it is up-regulated and we use a color code in the Venn barplot to indicate the number of genes in a temporal pattern that are up-regulated a given number of times. \item The same upset graph is also plotted without colors. \end{enumerate} %%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Volcano plots, ratio intensity (MA) plots and Heatmaps with DEplotVolcanoMA() and DEplotHeatmaps()} \label{sec:DEvolcanoMAheatmapFission} %% \label{sec:DEvolcanoMAheatmap} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsubsubsection{Volcano and MA plots (case 2)}\label{sec:DEvolcanoMAFission} %%---------------------------------------------------------------------------%% The following lines of code allow to plot %%%%%%%%%%%%%%%%%%% \begin{itemize} \item \(T-1=6-1=5\) volcano plots (with \(T=6\) the number time points) \item \(T-1=5\) MA plots \end{itemize} %%%%%%%%%%%%%%%%%%% allowing to separate non DE genes, DE genes below a threshold of log2 fold change and DE genes above a threshold of log2 fold change (see Section~\ref{sec:DEvolcanoMALeuk} for more details). <>= SEresVolcanoMAFission <- DEplotVolcanoMA(SEresDE=DEyeastWt, NbGene.plotted=2, SizeLabel=3, Display.plots=FALSE, Save.plots=TRUE) @ If the user wants to save the graphs, the input \texttt{Save.plots} must be %%%%%%%%%%%%%%%%%%% \begin{itemize} \item either \texttt{Save.plots=TRUE}, and the graph will be saved in the same location than the input \texttt{path.result} of the function \Rfunction{DEanalysisGlobal()}. \item either a strings of characters giving the path to a folder where the graphs will be saved. The user then chooses the path of the folder where results can be saved. \end{itemize} %%%%%%%%%%%%%%%%%%% If the user wants to display the graph, he must set \texttt{Display.plots=TRUE}. Write \texttt{?DEplotVolcanoMA} in your console for more information about the function. \clearpage %%---------------------------------------------------------------------------%% \subsubsubsection{Heatmaps (case 2)}\label{sec:DEheatmapFission} %%---------------------------------------------------------------------------%% The following lines of code allow to plot a correlation heatmap between samples (correlation heatmap) and a heatmap across samples and genes called Zscore heatmap, for a subset of genes that can be selected by the user. The second heatmap is build from the normalized count data after being both centered and scaled (Zscore). <>= SEresHeatmapFission <- DEplotHeatmaps(SEresDE=DEyeastWt, ColumnsCriteria=2, Set.Operation="union", NbGene.analysis=20, Color.Group=NULL, SizeLabelRows=5, SizeLabelCols=5, Display.plots=TRUE, Save.plots=FALSE) @ For the Zscore heatmap, The subset of genes is selected as followss %%%%%%%%%%%%%%%%%%% \begin{enumerate} \item the user selects one or more binary column of the data.frame \texttt{DEsummaryFission} (see Section~\ref{sec:DEsummaryFission}) with the input \texttt{ColumnsCriteria} which contains the column numbers of \texttt{DEsummaryFission} to be selected. \item Three cases are possible: %%%%%%% \begin{itemize} \item If \texttt{Set.Operation="union"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryFission}) included in the SE object \texttt{DEyeastWt} are those such that the sum of the selected columns of \texttt{DEsummaryFission} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having at least one '1' in one of the selected columns. \item If \texttt{Set.Operation="intersect"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryFission}) included in the SE object \texttt{DEyeastWt} are those such that the product of the selected columns of \texttt{DEsummaryFission} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in all of the selected columns. \item If \texttt{Set.Operation="setdiff"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryFission}) included in the SE object \texttt{DEyeastWt} are those such that only one element of the selected columns of \texttt{DEsummaryFission} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in only one of the selected columns. \end{itemize} %%%%%%% \item Finally, the selected subset of genes will be the \texttt{NbGene.analysis} genes extracted in step 2 above, which have the highest sum of absolute log2 fold change. \end{enumerate} %%%%%%%%%%%%%%%%%%% If the user wants to save the graphs, the input \texttt{Save.plots} must be %%%%%%%%%%%%%%%%%%% \begin{itemize} \item either \texttt{Save.plots=TRUE}, and the graph will be saved in the same location than the input \texttt{path.result} of the function \Rfunction{DEanalysisGlobal()}. \item either a strings of characters giving the path to a folder where the graphs will be saved. The user then chooses the path of the folder where results can be saved. \end{itemize} %%%%%%%%%%%%%%%%%%% If the user wants to display the graph, he must set \texttt{Display.plots=TRUE}. Write \texttt{?DEplotHeatmaps} in your console for more information about the function. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Gene Ontology (GO) analysis with GSEAQuickAnalysis() and GSEApreprocessing()} \label{sec:GOenrichmentFission} %% \label{sec:GOenrichment} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsubsection{Gene ontology with the R package gprofiler2} \label{sec:GOenrichmentFissionGprofiler2} %%---------------------------------------------------------------------------%% The lines of code below realize an enrichment analysis with the R package gprofiler2 for a selection of genes. Beware, an internet connection is needed. The function returns %%%%%%%%%%%%%%%%% \begin{itemize} \item a data.frame (output \texttt{metadata(SEresGprofiler2Fission)\$Rgprofiler2\$GSEAresults}) giving information about all detected gene ontologies for the list of associated genes. \item a lollipop graph (see section \nameref{sec:stepsGO}). The y-axis indicates the \texttt{MaxNumberGO} most significant gene ontologies and pathways associated to the selected genes. The gene ontologies and patways are sorted into descending order of \(-log10(pvalues)\). The x-axis indicates the \(-log10(pvalues)\). The higher is a lollipop the more significant is a gene ontology or pathway. A lollipop is yellow if the pvalues is smaller than 0.05 (significant) and blue otherwise. \item A Manhattan plot (see section \nameref{sec:stepsGO}) indicating all genes ontologies ordered according to the functional database (G0::BP, G0::CC, G0::MF and KEGG) \end{itemize} %%%%%%%%%%%%%%%%% <>= SEresGprofiler2Fission <- GSEAQuickAnalysis(Internet.Connection=FALSE, SEresDE=DEyeastWt, ColumnsCriteria=2, ColumnsLog2ordered=NULL, Set.Operation="union", Organism="spombe", MaxNumberGO=20, Background=FALSE, Display.plots=FALSE, Save.plots=FALSE) ## ## head(SEresGprofiler2Fission$GSEAresults) @ As \textbf{GSEAQuickAnalysis()} requires an internet connection, we needed to add the input \texttt{Internet.Connection} in order to be sure to pass the tests realized on our package by Bioconductor. The input \texttt{Internet.Connection} is set by default to \texttt{FALSE} and as long as \texttt{Internet.Connection=FALSE}, no enrichment analysis will be done. Once the user is sure to have an internet connection, the user may set \texttt{Internet.Connection=TRUE} in order to realize the enrichment analysis. The subset of genes is selected as follows %%%%%%%%%%%%%%%%%%% \begin{enumerate} \item the user selects one or more binary column of the data.frame \texttt{DEsummaryFission} (see Section~\ref{sec:DEsummaryFission}) with the input \texttt{ColumnsCriteria} which contains the column numbers of \texttt{DEsummaryFission} to be selected. \item Three cases are possible: %%%%%%% \begin{itemize} \item If \texttt{Set.Operation="union"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryFission}) included in the SE object \texttt{DEyeastWt} are those such that the sum of the selected columns of \texttt{DEsummaryFission} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having at least one '1' in one of the selected columns. \item If \texttt{Set.Operation="intersect"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryFission}) included in the SE object \texttt{DEyeastWt} are those such that the product of the selected columns of \texttt{DEsummaryFission} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in all of the selected columns. \item If \texttt{Set.Operation="setdiff"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryFission}) included in the SE object \texttt{DEyeastWt} are those such that only one element of the selected columns of \texttt{DEsummaryFission} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in only one of the selected columns. \end{itemize} %%%%%%% \item Finally, the selected subset of genes will be the \texttt{NbGene.analysis} genes extracted in step 2 above, which have the highest sum of absolute log2 fold change. \end{enumerate} %%%%%%%%%%%%%%%%%%% If \texttt{ColumnsLog2ordered} is a vector of integers, the enrichment analysis will take into account the genes order as the first genes will be considered to have the highest biological importance and the last genes the lowest. It corresponds to the columns number of \texttt{DEsummaryFission}, the output of \Rfunction{DEanalysisGlobal()}, which must contains \(log_2\) fold change values. The rows of \texttt{DEsummaryFission} (corresponding to genes) will be decreasingly ordered according to the sum of absolute \(log_2\) fold change (the selected columns must contain \(log_2\) fold change values) before the enrichment analysis. If \texttt{ColumnsLog2ordered=NULL}, then the enrichment analysis will not take into account the genes order. If the user wants to save the graphs, the input \texttt{Save.plots} must be %%%%%%%%%%%%%%%%%%% \begin{itemize} \item either \texttt{Save.plots=TRUE}, and the graph will be saved in the same location than the input \texttt{path.result} of the function \texttt{DEanalysisGlobal()}. \item or a strings of characters giving the path to a folder where the graphs will be saved. The user then chooses the path of the folder where results can be saved. \end{itemize} %%%%%%%%%%%%%%%%%%% Write \texttt{?GSEAQuickAnalysis} in your console for more information about the function. \clearpage %%---------------------------------------------------------------------------%% \subsubsection{Preprocessing for GSEA, DAVID, WebGestalt, gProfiler, Panther, ShinyGO, Enrichr and GOrilla} \label{sec:GOenrichmentFissionPreprocessing} %%---------------------------------------------------------------------------%% The following lines of code will generate all files, for a selection of genes, in order to use the following software and online tools : GSEA, DAVID, WebGestalt, gProfiler, Panther, ShinyGO, Enrichr and GOrilla. <>= SEresPreprocessingYeast <- GSEApreprocessing(SEresDE=DEyeastWt, ColumnsCriteria=2, Set.Operation="union", Rnk.files=FALSE, Save.files=FALSE) @ The subset of genes is selected as follows %%%%%%%%%%%%%%%%%%% \begin{enumerate} \item the user selects one or more binary column of the data.frame \texttt{DEsummaryFission} (see Section~\ref{sec:DEsummaryFission}) with the input \texttt{ColumnsCriteria} which contains the column numbers of \texttt{DEsummaryFission} to be selected. \item Three cases are possible: %%%%%%% \begin{itemize} \item If \texttt{Set.Operation="union"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryFission}) included in the SE object \texttt{DEyeastWt} are those such that the sum of the selected columns of \texttt{DEsummaryFission} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having at least one '1' in one of the selected columns. \item If \texttt{Set.Operation="intersect"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryFission}) included in the SE object \texttt{DEyeastWt} are those such that the product of the selected columns of \texttt{DEsummaryFission} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in all of the selected columns. \item If \texttt{Set.Operation="setdiff"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryFission}) included in the SE object \texttt{DEyeastWt} are those such that only one element of the selected columns of \texttt{DEsummaryFission} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in only one of the selected columns. \end{itemize} %%%%%%% \item Finally, the selected subset of genes will be the \texttt{NbGene.analysis} genes extracted in step 2 above, which have the highest sum of absolute log2 fold change. \end{enumerate} %%%%%%%%%%%%%%%%%%% If the user wants to save the files, the input \texttt{Save.plots} must be %%%%%%%%%%%%%%%%%%% \begin{itemize} \item either \texttt{Save.plots=TRUE}, and the graph will be saved in the same location than the input \texttt{path.result} of the function \texttt{DEanalysisGlobal()}. \item or a strings of characters giving the path to a folder where the graphs will be saved. The user then chooses the path of the folder where results can be saved. \end{itemize} %%%%%%%%%%%%%%%%%%% Write \texttt{?GSEApreprocessing} in your console for more information about the function. \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \section{Quick description of the analysis of a dataset with several time points and more than two biological conditions (case 4)} \label{sec:Analysis_case3_sup2BC} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% In this section we use the mouse subdataset \textbf{RawCounts\_Weger2021\_MOUSEsub500} (see Subsection \nameref{sec:dataWeger2021}) in order to explain the use of our package in \textbf{Case 4} (several times point and more than two biological conditions).\newline Most of the outputs in \textbf{Case 4} are of a similar form as those shown in \textbf{Case 3} (Section~\ref{sec:DetailedAnalysis}), except for some outputs whose list is precisely given below %%% \begin{itemize} \item Subsection~\ref{sec:DEmus2GRAPHSbiocond} page \pageref{sec:DEmus2GRAPHSbiocond} \item Subsection~\ref{sec:DEmus2GRAPHSboth} page \pageref{sec:DEmus2GRAPHSboth}. \end{itemize} %%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Preprocessing step with DATAprepSE()} \label{sec:DATAprepSEmus2} %%\label{sec:PreprocessingStep} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The preprocessing step is mandatory and is realized by our R function \textbf{DATAprepSE()} to store all information about the dataset in a standardized way (SummarizedExperiment class object, see Section~\ref{sec:StructureOutput}). It can be done using the following lines of code. <>= SEresPrepMus2 <- DATAprepSE(RawCounts=RawCounts_Weger2021_MOUSEsub500, Column.gene=1, Group.position=1, Time.position=2, Individual.position=3) @ The function returns %%%%%%%%%%%%%%%%%%% \begin{itemize} \item \Rclass{SummarizedExperiment} class object containing all information of the dataset to be used for exploratory data analysis \item \Rclass{DESeqDataSet} class object to be used for statistical analysis of the transcriptional response. \end{itemize} %%%%%%%%%%%%%%%%%%% Write \texttt{?DATAprepSE} in your console for more information about the function. \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Exploratory data analysis (unsupervised analysis)} \label{sec:ExploratoryAnalysis_Mus2} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Normalization with DATAnormalization()} \label{sec:DATAnormalizationMus2} %% \label{sec:DATAnormalization} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The following lines of code realize the normalization step from the results of the function \textbf{DATAprepSE()} (subsection \ref{sec:DATAprepSEmus2}). <>= SEresNormMus2 <- DATAnormalization(SEres=SEresPrepMus2, Normalization="vst", Blind.rlog.vst=FALSE, Plot.Boxplot=FALSE, Colored.By.Factors=TRUE, Color.Group=NULL, path.result=NULL) @ If \texttt{Plot.Boxplot=TRUE} a boxplot showing the distribution of the normalized expression\newline (\texttt{Normalization="vst"} means that the vst method is used) of genes for each sample is returned. If \texttt{Colored.By.Factors=TRUE}, the color of the boxplots would be different for different biological conditions. By default (if \texttt{Color.Group=NULL}), a color will be automatically applied for each biological condition. You can change the colors by creating the following data.frame <>= colMus2 <- data.frame(Name=c("BmKo", "BmWt" ,"CrKo", "CrWt"), Col=c("red", "blue", "orange", "darkgreen")) @ and setting \texttt{Color.Group=colMus2}. The x-labels give biological information, time information and individual information separated by dots. If the user wants to see the 6th first rows of the normalized data, he can write in his console \texttt{head(SEresNormMus2\$NormalizedData, n=6)} The user can save the graph in a folder thanks to the input path.result. If \texttt{path.result=NULL} the results will still be plotted but not saved in a folder. Write \texttt{?DATAnormalization} in your console for more information about the function. \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Factorial analysis: PCA with PCAanalysis() and clustering with HCPCanalysis()} \label{sec:FactorialMus2} %% \label{sec:Factorial} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsubsubsection{PCA (case 4)}\label{sec:PCAMus2} %%---------------------------------------------------------------------------%% When samples belong to different biological conditions and different time points, the previous lines of code return from the results of the function \textbf{DATAnormalization()}: %%%%%%%%%%%%%%%%%%% \begin{itemize} \item The results of the R function \Rfunction{PCA()} from the package FactoMineR. \item one 2D PCA graph, one 3D PCA graph and the same 3D PCA graph in a rgl window (only if \texttt{motion3D=FALSE}) where samples are colored with different colors for different biological conditions. Furthermore, lines are drawn between each pair of consecutive points for each individual (if \texttt{Mean.Accross.Time=FALSE}, otherwise lines will be drawn only between mean values of all individuals for each time point and biological conditions). \item one 2D PCA graph, one 3D PCA graph and the same 3D PCA graph in a rgl window (only if \texttt{motion3D=FALSE}) for each biological condition, where samples are colored with different colors for different time points. Furthermore, lines are drawn between each pair of consecutive points for each sample (if \texttt{Mean.Accross.Time=FALSE}, otherwise lines will be drawn only between mean values of all individuals for each time point and biological conditions). \item the same graphs described above but without lines. \end{itemize} %%%%%%%%%%%%%%%%%%% <>= SEresPCAmus2 <- PCAanalysis(SEresNorm=SEresNormMus2, gene.deletion=NULL, sample.deletion=NULL, Plot.PCA=FALSE, motion3D=FALSE, Mean.Accross.Time=FALSE, Color.Group=NULL, Cex.label=0.6, Cex.point=0.7, epsilon=0.2, Phi=25, Theta=140, path.result=NULL, Name.folder.pca=NULL) @ The graphs are %%%%%%%%%%%%%%%%%%% \begin{itemize} \item stored in an SE object \item displayed if \texttt{Plot.PCA=TRUE} \item similar to those described in subsection \nameref{sec:PCALeuk}. \item saved in a folder if the user selects a folder path in \texttt{path.result}. If \texttt{path.result=NULL} the results will not be saved in a folder. \end{itemize} %%%%%%%%%%%%%%%%%%% By default (if \texttt{Color.Group=NULL}), a color will be automatically applied for each biological condition. You can change the colors by creating the following data.frame <>= colMus2 <- data.frame(Name=c("BmKo", "BmWt", "CrKo", "CrWt"), Col=c("red", "blue", "orange", "darkgreen")) @ and setting \texttt{Color.Group=colMus2}. The user cannot change the color associated to each time point. If you want to delete, for instance, the genes 'ENSMUSG00000025921' and 'ENSMUSG00000026113' (respectively the second and sixth gene) and/or delete the samples 'BmKo\_t2\_r1' and 'BmKo\_t5\_r2', set %%%%%%%%%%%%%%%%%%% \begin{itemize} \item \texttt{gene.deletion=c("ENSMUSG00000025921", "ENSMUSG00000026113")} and/or\newline \texttt{sample.deletion=c("BmKo\_t2\_r1", "BmKo\_t5\_r2")} \item \texttt{gene.deletion=c(2,6)} and/or \texttt{sample.deletion=c(3,13)}.\newline The integers in \texttt{gene.deletion} and \texttt{sample.deletion} represent respectively the row numbers and the column numbers of \texttt{RawCounts} where the selected genes and samples are located. \end{itemize} %%%%%%%%%%%%%%%%%%% Write \texttt{?PCAanalysis} in your console for more information about the function. %%---------------------------------------------------------------------------%% \subsubsubsection{HCPC (case 4)} \label{sec:HCPCMus2} %%---------------------------------------------------------------------------%% The user can realize the clustering with HCPC using the function \textbf{HCPCanalysis()} as below. The following lines of code return from the results of the function \textbf{DATAnormalization()} %%%%% \begin{itemize} \item The results of the R function \Rfunction{HCPC()} from the package FactoMineR. \item A dendrogram \item A graph indicating for each sample, its cluster and the biological condition associated to the sample, using a color code \item One 2D PCA graph, one 3D PCA graph and the same 3D PCA graph in a rgl window (only if \texttt{motion3D=FALSE}). These PCA graphs are identical to the outputs of \textbf{PCAanalysis()} but samples are colored with different colors for different clusters. \end{itemize} <>= SEresHCPCmus2 <- HCPCanalysis(SEresNorm=SEresNormMus2, gene.deletion=NULL, sample.deletion=NULL, Plot.HCPC=FALSE, Phi=25,Theta=140, Cex.point=0.6, epsilon=0.2, Cex.label=0.6, motion3D=FALSE, path.result=NULL, Name.folder.hcpc=NULL) @ The graphs are %%%%%%%%%%%%%%%%%%% \begin{itemize} \item stored in an SE object \item displayed if \texttt{Plot.HCPC=TRUE} \item similar to those described in subsection \nameref{sec:HCPCLeuk}. \item saved in a folder if the user selects a folder path in \texttt{path.result}. If \texttt{path.result=NULL} the results will not be saved in a folder. \end{itemize} %%%%%%%%%%%%%%%%%%% %% The optimal number of clusters is automatically computed by the R function %% \Rfunction{HCPC()}. Write \texttt{?HCPCanalysis} in your console for more information about the function. \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Temporal clustering analysis with MFUZZanalysis()} \label{sec:MFUZZmus2} %%\label{sec:MFUZZanalysis} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The following function realizes the temporal clustering analysis. It takes as input, a number of clusters (\texttt{DataNumberCluster}) that can be chosen automatically if \texttt{DataNumberCluster=NULL} and the results of the function \textbf{DATAnormalization()} (see Section~\ref{sec:DATAnormalizationMus2}). The lines of code below return for each biological condition %%%%%%%%%%% \begin{itemize} \item the summary of the results of the R function \Rfunction{mfuzz()} from the package Mfuzz. \item the scaled height plot, computed with the \Rfunction{HCPC()} function, and shows the number of clusters chosen automatically (if \texttt{DataNumberCluster=NULL}). If \texttt{Method="hcpc"}, the function plots the scaled within-cluster inertia, but if \texttt{Method="kmeans"}, the function plots the scaled within-cluster inertia. As the number of genes can be very high, we recommend to select \texttt{Method="hcpc"} which is by default. \item the output graphs from the R package \texttt{Mfuzz} showing the most common temporal behavior among all genes for each biological condition. The plots below correspond to the biological condition 'P'. \end{itemize} %%%%%%%%%%% <>= SEresMfuzzLeuk500 <- MFUZZanalysis(SEresNorm=SEresNormMus2, DataNumberCluster=NULL, Method="hcpc", Membership=0.5, Min.std=0.1, Plot.Mfuzz=FALSE, path.result=NULL, Name.folder.mfuzz=NULL) @ The graphs are %%%%%%%%%%%%%%%%%%% \begin{itemize} \item stored in an SE object \item displayed if \texttt{Plot.Mfuzz=TRUE} \item similar to those described in subsection \nameref{sec:MFUZZleuk}. \item saved in a folder if the user selects a folder path in \texttt{path.result}. If \texttt{path.result=NULL} the results will not be saved in a folder. \end{itemize} %%%%%%%%%%%%%%%%%%% Other temporal information are shown in the alluvial graph of the subsection \nameref{sec:DEanalysis} that can be compared with the previous graphs. Write \texttt{?MFUZZanalysis} in your console for more information about the function. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Plot expression of data with DATAplotExpressionGenes()} \label{sec:PlotExpressionMus2} %% \label{sec:PlotExpression} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% In this section we use the mouse subdataset \textbf{RawCounts\_Weger2021\_MOUSEsub500} (see Subsection \nameref{sec:dataWeger2021}) in order to explain \textbf{DATAplotExpressionGenes()} in \textbf{case 3} when there are more than two biological conditions. The previous lines of code allow to plot, from the results of the function \textbf{DATAnormalization()}, for each biological condition: the evolution of the 17th gene expression of the three replicates across time and the evolution of the mean and the standard deviation of the 17th gene expression across time. The color of the different lines are different for different biological conditions. <>= SEresEVOmus2 <- DATAplotExpressionGenes(SEresNorm=SEresNormMus2, Vector.row.gene=c(17), Color.Group=NULL, Plot.Expression=FALSE, path.result=NULL) @ The graphs are %%%%%%%%%%%%%%%%%%% \begin{itemize} \item stored in an SE object \item displayed if \texttt{Plot.Expression=TRUE} \item saved in a folder if the user selects a folder path in \texttt{path.result}. If \texttt{path.result=NULL} the results will not be saved in a folder. \end{itemize} %%%%%%%%%%%%%%%%%%% By default (if \texttt{Color.Group=NULL}), a color will be automatically assigned to each biological condition. The user can change the colors by creating the following data.frame <>= colMus2 <- data.frame(Name=c("BmKo", "BmWt", "CrKo", "CrWt"), Col=c("red", "blue", "orange", "darkgreen")) @ and setting \texttt{Color.Group=colMus2}. If the user wants to select several genes, for instance the 97th, the 192th, the 194th and the 494th, he needs to set \texttt{Vector.row.gene=c(97,192,194,494)}. Write \texttt{?DATAplotExpressionGenes} in your console for more information about the function. \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsection{Statistical analysis of the transcriptional response for the four dataset (supervised analysis)} \label{sec:SupervisedAnalysis_Mus2} %%---------------------------------------------------------------------------%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{DE analysis with DEanalysisGlobal()} \label{sec:DEanalysisMus2} %% \label{sec:DEanalysis} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% To keep the execution time of the algorithm fast, we will take only three biological conditions and three times. <>= Sub3bc3T <- RawCounts_Weger2021_MOUSEsub500[, seq_len(73)] SelectTime <- grep("_t0_", colnames(Sub3bc3T)) SelectTime <- c(SelectTime, grep("_t1_", colnames(Sub3bc3T))) SelectTime <- c(SelectTime, grep("_t2_", colnames(Sub3bc3T))) Sub3bc3T <- Sub3bc3T[, c(1, SelectTime)] SEresPrepMus23b3t <- DATAprepSE(RawCounts=Sub3bc3T, Column.gene=1, Group.position=1, Time.position=2, Individual.position=3) @ The lines of code above %%%%%%%%%%%%%%%%%%% \begin{itemize} \item return a data.frame. See subsection \nameref{sec:DEsummaryLeuk}. \item plot the following graphs (similar to those described in section \nameref{sec:DEanalysisLeuk}) %%%%%%%% \begin{itemize} \item \emph{Results from the temporal statistical analysis} (\textbf{case 2} for each biological condition). See subsection \nameref{sec:DEleukGRAPHStemporel} \item \emph{Results from the statistical analysis by biological condition} (\textbf{case 1} for each fixed time). See subsection \nameref{sec:DEleukGRAPHSbiocond}. \item \emph{Results from the combination of temporal and biological statistical analysis}. See subsection \nameref{sec:DEleukGRAPHSboth} \end{itemize} %%%%%%%% \end{itemize} %%%%%%%%%%%%%%%%%%% <>= SEresDE3tMus2 <- DEanalysisGlobal(SEres=SEresPrepMus23b3t, pval.min=0.05, pval.vect.t=NULL, log.FC.min=1, LRT.supp.info=FALSE, Plot.DE.graph=FALSE, path.result=NULL, Name.folder.DE=NULL) @ The output data.frame can be extracted with the following line of code, <>= DEsummaryMus2 <- SummarizedExperiment::rowData(SEresDE3tMus2) @ As we use abbreviated column names, we propose a glossary in order to help the user to understand meaning of each column. The glossary of the column names can be extracted with the following lines of code, <>= resDEmus2 <- S4Vectors::metadata(SEresDE3tMus2)$Results[[2]][[2]] resGlossaryMus2 <- resDEmus2$Glossary @ and then write \texttt{DEsummaryMus2} and \texttt{resGlossaryMus2} in the R console. The following subsection will show graphs not shown in \nameref{sec:DEanalysisLeuk}. %%---------------------------------------------------------------------------%% \subsubsubsection{Graphs from the results of the temporal statistical analysis} \label{sec:DEmus2GRAPHStemporel} %%---------------------------------------------------------------------------%% As we are in the similar case described in section \nameref{sec:DEleukGRAPHStemporel}, the results will not be described here. %%---------------------------------------------------------------------------%% \subsubsubsection{Graphs from the results of the biological condition analysis} \label{sec:DEmus2GRAPHSbiocond} %%---------------------------------------------------------------------------%% As we are in case 4, the results are more complex. \noindent \emph{One barplot} showing the number of specific genes per biological condition and no specific genes (category 'Other'), for each time. <>= resDEmus2$DEplots_GroupPerTime$NumberDEgenes_SpecificAndNoSpecific_perBiologicalCondition @ \noindent \emph{One barplot} showing the number of specific genes per biological condition, for each time. This is the same barplot than in section \nameref{sec:DEleukGRAPHSbiocond} <>= resDEmus2$DEplots_GroupPerTime$NumberSpecificGenes_UpDownRegulated_perBiologicalCondition @ See Section~\nameref{sec:DEleukGRAPHSbiocond} for more information about specific genes. \noindent \emph{\(\binom{N_{bc}}{2}=\binom{3}{2}=3\) upset graph} showing the number of genes corresponding to each possible intersection in a Venn barplot at a given time, only if there are more than two biological conditions (which the case here). We recall that a set of pairs of biological conditions forms an intersection at a given time \(t_i\) when there is at least one gene which is DE for each of these pairs of biological conditions at time \(t_i\), but not for the others at time \(t_i\). The following line of code plots the Venn barplot for the time \(t_1\). <>= resDEmus2$DEplots_GroupPerTime$VennBarplot_BiologicalConditions_atTime.t1 @ \noindent \emph{alluvial graph} showing the number of DE genes which are specific at least at one time for each group, only if there are more than two biological conditions (which is the case here). <>= resDEmus2$DEplots_GroupPerTime$AlluvialGraph_SpecificGenes1tmin_perBiologicalCondition @ %%---------------------------------------------------------------------------%% \subsubsubsection{Graphs from the results of the combination of temporal and biological statistical analysis} \label{sec:DEmus2GRAPHSboth} %%---------------------------------------------------------------------------%% From the combination of temporal and biological statistical analysis, the function plots the following graphs. The only difference with the section~\nameref{sec:DEleukGRAPHSboth} is the following graph. \noindent \emph{One alluvial graph} for DE genes which are signature at least at one time for each biological condition, only if there are more than two biological conditions (which is not the case here). <>= print(resDEmus2$DEplots_TimeAndGroup$Alluvial_SignatureGenes_1TimeMinimum_perGroup) @ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Volcano plots, ratio intensity (MA) plots and Heatmaps with DEplotVolcanoMA() and DEplotHeatmaps()} \label{sec:DEvolcanoMAheatmapMus2} %% \label{sec:DEvolcanoMAheatmap} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsubsubsection{Volcano and MA plots (case 4)} \label{sec:DEvolcanoMAMus2} %%---------------------------------------------------------------------------%% The following lines of code allow to plot %%%%%%%%%%%%%%%%%%% \begin{itemize} \item \(\dbinom{N_{bc}}{2} \times T + (T-1)\times N_{bc} = \frac{N_{bc}(N_{bc}-1)}{2}\times T + (T-1)\times N_{bc}=56\) volcano plots (with \(N_{bc}=4\) the number of biological conditions and \(T=6\) the number of time points). \item \(\dbinom{N_{bc}}{2} \times T + (T-1)\times N_{bc}=56\) MA plots. \end{itemize} %%%%%%%%%%%%%%%%%%% allowing to separate non DE genes, DE genes below a threshold of log2 fold change and DE genes above a threshold of log2 fold change (see Section~\ref{sec:DEvolcanoMALeuk} for more details). <>= SEresVolcanoMAmus2 <- DEplotVolcanoMA(SEresDE=SEresDE3tMus2, NbGene.plotted=2, SizeLabel=3, Display.plots=FALSE, Save.plots=FALSE) @ For more details, see Section~\ref{sec:DEvolcanoMAheatmap_leuk}. If the user wants to save the graphs, the input \texttt{Save.plots} must be %%%%%%%%%%%%%%%%%%% \begin{itemize} \item either \texttt{Save.plots=TRUE}, and the graph will be saved in the same location than the input \texttt{path.result} of the function \Rfunction{DEanalysisGlobal()}. \item or a strings of characters giving the path to a folder where the graphs will be saved. The user then chooses the path of the folder where results can be saved. \end{itemize} %%%%%%%%%%%%%%%%%%% If the user wants to display the graph, he must set \texttt{Display.plots=TRUE}. Write \texttt{?DEplotVolcanoMA} in your console for more information about the function. %%---------------------------------------------------------------------------%% \subsubsubsection{Heatmaps (case 4)} \label{sec:DEheatmapMus2} %%---------------------------------------------------------------------------%% The following lines of code allow to plot a correlation heatmap between samples (correlation heatmap) and a heatmap across samples and genes called Zscore heatmap, for a subset of genes that can be selected by the user. The second heatmap is build from the normalized count data after being both centered and scaled (Zscore). <>= SEresHeatmapMus2 <- DEplotHeatmaps(SEresDE=SEresDE3tMus2, ColumnsCriteria=2:5, Set.Operation="union", NbGene.analysis=20, SizeLabelRows=5, SizeLabelCols=5, Display.plots=FALSE, Save.plots=FALSE) @ Both graphs are similar to those described in subsection \nameref{sec:DEheatmapFission} and \nameref{sec:DEheatmapMus1}. For the Zscore heatmap, The subset of genes is selected as followss %%%%%%%%%%%%%%%%%%% \begin{enumerate} \item the user selects one or more binary column of the data.frame \texttt{DEsummaryMus2} (see Section~\ref{sec:DEanalysisMus2}) with the input \texttt{ColumnsCriteria} which contains the column numbers of \texttt{DEsummaryMus2} to be selected. \item Three cases are possible: %%%%%%% \begin{itemize} \item If \texttt{Set.Operation="union"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryMus2}) included in the SE object \texttt{SEresDE3tMus2} are those such that the sum of the selected columns of \texttt{DEsummaryMus2} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having at least one '1' in one of the selected columns. \item If \texttt{Set.Operation="intersect"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryMus2}) included in the SE object \texttt{SEresDE3tMus2} are those such that the product of the selected columns of \texttt{DEsummaryMus2} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in all of the selected columns. \item If \texttt{Set.Operation="setdiff"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryMus2}) included in the SE object \texttt{SEresDE3tMus2} are those such that only one element of the selected columns of \texttt{DEsummaryMus2} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in only one of the selected columns. \end{itemize} %%%%%%% \item Finally, the selected subset of genes will be the \texttt{NbGene.analysis} genes extracted in step 2 above, which have the highest sum of absolute log2 fold change. \end{enumerate} %%%%%%%%%%%%%%%%%%% If the user wants to save the graphs, the input \texttt{Save.plots} must be %%%%%%%%%%%%%%%%%%% \begin{itemize} \item either \texttt{Save.plots=TRUE}, and the graph will be saved in the same location than the input \texttt{path.result} of the function \Rfunction{DEanalysisGlobal()}. \item either a strings of characters giving the path to a folder where the graphs will be saved. The user then chooses the path of the folder where results can be saved. \end{itemize} %%%%%%%%%%%%%%%%%%% If the user wants to display the graph, he must set \texttt{Display.plots=TRUE}. Write \texttt{?DEplotHeatmaps} in your console for more information about the function. \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Gene Ontology (GO) analysis with GSEAQuickAnalysis() and GSEApreprocessing()} \label{sec:GOenrichmentMus2} %% \label{sec:GOenrichment} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%---------------------------------------------------------------------------%% \subsubsection{Gene ontology with the R package gprofiler2} \label{sec:GOenrichmentMus2Gprofiler2} %%---------------------------------------------------------------------------%% The lines of code below realize an enrichment analysis with the R package gprofiler2 for a selection of genes. Beware, an internet connection is needed. The function returns %%%%%%%%%%%%%%%%% \begin{itemize} \item a data.frame (output \texttt{metadata(SEresGprofiler2Mus2)\$Rgprofiler2\$GSEAresults}) giving information about all detected gene ontologies for the list of associated genes. \item a lollipop graph (see section \nameref{sec:stepsGO}). The y-axis indicates the \texttt{MaxNumberGO} most significant gene ontologies and pathways associated to the selected genes. The gene ontologies and patways are sorted into descending order. The x-axis indicates the \(-log10(pvalues)\). The higher is a lollipop the more significant is a gene ontology or pathway. A lollipop is yellow if the pvalues is smaller than 0.05 (significant) and blue otherwise. \item A Manhattan plot (see section \nameref{sec:stepsGO}) indicating all genes ontologies ordered according to the functional database (G0::BP, G0::CC, G0::MF and KEGG) \end{itemize} %%%%%%%%%%%%%%%%% <>= SEresGprofiler2Mus2 <- GSEAQuickAnalysis(Internet.Connection=FALSE, SEresDE=SEresDE3tMus2, ColumnsCriteria=2:5, ColumnsLog2ordered=NULL, Set.Operation="union", Organism="mmusculus", MaxNumberGO=20, Background=FALSE, Display.plots=FALSE, Save.plots=FALSE) ## ## head(SEresGprofiler2Mus2$GSEAresults) @ As \textbf{GSEAQuickAnalysis()} requires an internet connection, we needed to add the input \texttt{Internet.Connection} in order to be sure to pass the tests realized on our package by Bioconductor. The input \texttt{Internet.Connection} is set by default to \texttt{FALSE} and as long as \texttt{Internet.Connection=FALSE}, no enrichment analysis will be done. Once the user is sure to have an internet connection, the user may set \texttt{Internet.Connection=TRUE} in order to realize the enrichment analysis. \clearpage The subset of genes is selected as follows %%%%%%%%%%%%%%%%%%% \begin{enumerate} \item the user selects one or more binary column of the data.frame \texttt{DEsummaryMus2} (see Section~\ref{sec:DEanalysisMus2}) with the input \texttt{ColumnsCriteria} which contains the column numbers of \texttt{DEsummaryMus2} to be selected. \item Three cases are possible: %%%%%%% \begin{itemize} \item If \texttt{Set.Operation="union"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryMus2}) included in the SE object \texttt{SEresDE3tMus2} are those such that the sum of the selected columns of \texttt{DEsummaryMus2} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having at least one '1' in one of the selected columns. \item If \texttt{Set.Operation="intersect"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryMus2}) included in the SE object \texttt{SEresDE3tMus2} are those such that the product of the selected columns of \texttt{DEsummaryMus2} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in all of the selected columns. \item If \texttt{Set.Operation="setdiff"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryMus2}) included in the SE object \texttt{SEresDE3tMus2} are those such that only one element of the selected columns of \texttt{DEsummaryMus2} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in only one of the selected columns. \end{itemize} %%%%%%% \item Finally, the selected subset of genes will be the \texttt{NbGene.analysis} genes extracted in step 2 above, which have the highest sum of absolute log2 fold change. \end{enumerate} %%%%%%%%%%%%%%%%%%% If \texttt{ColumnsLog2ordered} is a vector of integers, the enrichment analysis will take into account the genes order as the first genes will be considered to have the highest biological importance and the last genes the lowest. It corresponds to the columns number of \texttt{DEsummaryMus2}, the output of \Rfunction{DEanalysisGlobal()}, which must contains \(log_2\) fold change values. The rows of \texttt{DEsummaryMus2} (corresponding to genes) will be decreasingly ordered according to the sum of absolute \(log_2\) fold change (the selected columns must contain \(log_2\) fold change values) before the enrichment analysis. If \texttt{ColumnsLog2ordered=NULL}, then the enrichment analysis will not take into account the genes order. If the user wants to save the graphs, the input \texttt{Save.plots} must be %%%%%%%%%%%%%%%%%%% \begin{itemize} \item either \texttt{Save.plots=TRUE}, and the graph will be saved in the same location than the input \texttt{path.result} of the function \Rfunction{DEanalysisGlobal()}. \item or a strings of characters giving the path to a folder where the graphs will be saved. The user then chooses the path of the folder where results can be saved. \end{itemize} %%%%%%%%%%%%%%%%%%% Write \texttt{?GSEAQuickAnalysis} in your console for more information about the function. \clearpage %%---------------------------------------------------------------------------%% \subsubsection{Preprocessing for GSEA, DAVID, WebGestalt, gProfiler, Panther, ShinyGO, Enrichr and GOrilla} \label{sec:GOenrichmentMus2preprocessing} %%---------------------------------------------------------------------------%% The following lines of code will generate all files, for a selection of genes, in order to use the following software and online tools : GSEA, DAVID, WebGestalt, gProfiler, Panther, ShinyGO, Enrichr and GOrilla. <>= SEresPreprocessingMus2 <- GSEApreprocessing(SEresDE=SEresDE3tMus2, ColumnsCriteria=2:5, Set.Operation="union", Rnk.files=FALSE, Save.files=TRUE) @ The subset of genes is selected as follows %%%%%%%%%%%%%%%%%%% \begin{enumerate} \item the user selects one or more binary column of the data.frame \texttt{DEsummaryMus2} (see Section~\ref{sec:DEanalysisMus2}) with the input \texttt{ColumnsCriteria} which contains the column numbers of \texttt{DEsummaryMus2} to be selected. \item Three cases are possible: %%%%%%% \begin{itemize} \item If \texttt{Set.Operation="union"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryMus2}) included in the SE object \texttt{SEresDE3tMus2} are those such that the sum of the selected columns of \texttt{DEsummaryMus2} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having at least one '1' in one of the selected columns. \item If \texttt{Set.Operation="intersect"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryMus2}) included in the SE object \texttt{SEresDE3tMus2} are those such that the product of the selected columns of \texttt{DEsummaryMus2} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in all of the selected columns. \item If \texttt{Set.Operation="setdiff"} then the rows extracted from the different datasets (raw counts and normalized data and \texttt{DEsummaryMus2}) included in the SE object \texttt{SEresDE3tMus2} are those such that only one element of the selected columns of \texttt{DEsummaryMus2} given in \texttt{ColumnsCriteria} is >0. This means that the selected genes are those having '1' in only one of the selected columns. \end{itemize} %%%%%%% \item Finally, the selected subset of genes will be the \texttt{NbGene.analysis} genes extracted in step 2 above, which have the highest sum of absolute log2 fold change. \end{enumerate} %%%%%%%%%%%%%%%%%%% If the user wants to save the files, the input \texttt{Save.plots} must be %%%%%%%%%%%%%%%%%%% \begin{itemize} \item either \texttt{Save.plots=TRUE}, and the graph will be saved in the same location than the input \texttt{path.result} of the function \Rfunction{DEanalysisGlobal()}. \item or a strings of characters giving the path to a folder where the graphs will be saved. The user then chooses the path of the folder where results can be saved. \end{itemize} %%%%%%%%%%%%%%%%%%% Write \texttt{?GSEApreprocessing} in your console for more information about the function. \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Session info}\label{sec:sessioninfo} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Here is the output of \Rfunction{sessionInfo()} on the system on which this document was compiled. %%%% <>= utils::toLatex(sessionInfo()) @ \newpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %\section*{Bibliography} \bibliographystyle{apalike}%plain \bibliography{MultiRNAflowBiblio} %% \bibliography{vignettes/MultiRNAflowBiblio} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \end{document}