---
title: "Guided tutorial"
author: 
- name: "Silvia Giulia Galfrè"
  affiliation: "Department of Biology, University of Rome Tor Vergata"
- name: "Francesco Morandin"
  affiliation: "Department of Mathematical, Physical and Computer Sciences, University of Parma"
- name: "Marco  Pietrosanto"
  affiliation: "Department of Biology, University of Rome Tor Vergata"
- name: "Federico Cremisi"
  affiliation: "Scuola Normale Superiore di Pisa"
- name: "Manuela Helmer-Citterich"
  affiliation: "Department of Biology, University of Rome Tor Vergata"
package: COTAN
output: 
    BiocStyle::html_document
vignette: >
  %\VignetteIndexEntry{Guided tutorial}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8} 
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 5,
  fig.height = 5
)

```

```{r setup}
library(COTAN)
library(data.table)
library(Matrix)
library(ggrepel)
library(factoextra)
library(Rtsne)
library(utils)
library(plotly)
library(tidyverse)
library(htmlwidgets)
library(MASS)
library(dendextend)
```

```{r}
mycolours <- c("A" = "#8491B4B2","B"="#E64B35FF")
my_theme = theme(axis.text.x = element_text(size = 14, angle = 0, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF" ),
                 axis.text.y = element_text( size = 14, angle = 0, hjust = 0, vjust = .5, face = "plain", colour ="#3C5488FF"),
                 axis.title.x = element_text( size = 14, angle = 0, hjust = .5, vjust = 0, face = "plain", colour ="#3C5488FF"),
                 axis.title.y = element_text( size = 14, angle = 90, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF"))
data_dir = tempdir()
```


```{r eval=FALSE, include=FALSE}
Download the dataset for mouse cortex E17.5.
if (!file.exists(paste("inst/extdata/","E175_only_cortical_cells.txt.gz", sep = "/"))) {
  download.file("ftp://ftp.ncbi.nlm.nih.gov/geo/
                samples/GSM2861nnn/GSM2861514/suppl/GSM2861514_E175_Only_Cortical_Cells_DGE.txt.gz",
              paste(data_dir,"E175_only_cortical_cells.txt.gz", sep = "/"),method = "wget", quiet = FALSE)
  
}
```

This is just an example on a toy dataset. If the user want to try on a real dataset it can be used the previous command to download it and the next commented line to use it as input data.
There are also available online many other examples at [link](https://seriph78.github.io/Cotan_paper/index.html).

```{r}
data("ERCCraw", package = "COTAN")

ERCCraw = as.data.frame(ERCCraw)
rownames(ERCCraw) = ERCCraw$V1
ERCCraw = ERCCraw[,2:ncol(ERCCraw)]
ERCCraw[1:5,1:5]
```

Define a directory where the output will be stored.
```{r}
out_dir <- paste0(tempdir(),"/")
```

# Analytical pipeline

Inizialise the COTAN object with the row count table and the metadata for the experiment.

```{r}
obj = new("scCOTAN",raw = ERCCraw)
#obj = initRaw(obj,GEO="GSM2861514" ,sc.method="Drop_seq",cond = "mouse cortex E17.5")
obj = initRaw(obj,GEO="ERCC" ,sc.method="10X",cond = "negative ERCC dataset")
```

Now we can start the cleaning. Analysis requires and starts from a matrix of raw UMI counts after removing possible cell doublets or multiplets and low quality or dying cells (with too high mtRNA percentage, easily done with Seurat or other tools).

If we do not want to consider the mitochondrial genes we can remove them before starting the analysis.
```{r}
genes_to_rem = get.genes(obj)[grep('^mt', get.genes(obj))] 
cells_to_rem = names(get.cell.size(obj)[which(get.cell.size(obj) == 0)])
obj = drop.genes.cells(obj,genes_to_rem,cells_to_rem )
```
We want also to define a prefix to identify the sample.

```{r}
#t = "E17.5_cortex"
t = "ERCC"

print(paste("Condition ",t,sep = ""))
#--------------------------------------
n_cells = length(get.cell.size(object = obj))
print(paste("n cells", n_cells, sep = " "))

n_it = 1
```

## Data cleaning
First we create the directory to store all information regarding the data cleaning.
```{r}
if(!file.exists(out_dir)){
  dir.create(file.path(out_dir))
}

if(!file.exists(paste(out_dir,"cleaning", sep = ""))){   
  dir.create(file.path(out_dir, "cleaning"))
}
```

```{r}
ttm = clean(obj)

obj = ttm$object

ttm$pca.cell.2
```

Run this when B cells need to be removed.
```{r eval=FALSE, include=TRUE}
pdf(file.path(out_dir,"cleaning",paste(t,"_",n_it,"_plots_before_cells_exlusion.pdf", sep = "")))
ttm$pca.cell.2
ggplot(ttm$D, aes(x=n,y=means)) + geom_point() +
  geom_text_repel(data=subset(ttm$D, n > (max(ttm$D$n)- 15) ), aes(n,means,label=rownames(ttm$D[ttm$D$n > (max(ttm$D$n)- 15),])),
                  nudge_y      = 0.05,
                  nudge_x      = 0.05,
                  direction    = "x",
                  angle        = 90,
                  vjust        = 0,
                  segment.size = 0.2)+
  ggtitle("B cell group genes mean expression")+my_theme +
  theme(plot.title = element_text(color = "#3C5488FF", size = 20, face = "italic",vjust = - 5,hjust = 0.02 ))
dev.off()

if (length(ttm$cl1) < length(ttm$cl2)) {
  to_rem = ttm$cl1
}else{
  to_rem = ttm$cl2
}
n_it = n_it+1
obj = drop.genes.cells(object = obj,genes = c(),cells = to_rem)

gc()

ttm = clean(obj)
#ttm = clean.sqrt(obj, cells)
obj = ttm$object

ttm$pca.cell.2

```


Run this only in the last iteration, instead the previous code, when B cells group has not to be removed
```{r}
pdf(file.path(out_dir,"cleaning",paste(t,"_",n_it,"_plots_before_cells_exlusion.pdf", sep = "")))
ttm$pca.cell.2
ggplot(ttm$D, aes(x=n,y=means)) + geom_point() +
  geom_text_repel(data=subset(ttm$D, n > (max(ttm$D$n)- 15) ), aes(n,means,label=rownames(ttm$D[ttm$D$n > (max(ttm$D$n)- 15),])),
                  nudge_y      = 0.05,
                  nudge_x      = 0.05,
                  direction    = "x",
                  angle        = 90,
                  vjust        = 0,
                  segment.size = 0.2)+
  ggtitle(label = "B cell group genes mean expression", subtitle = " - B group NOT removed -")+my_theme +
  theme(plot.title = element_text(color = "#3C5488FF", size = 20, face = "italic",vjust = - 10,hjust = 0.02 ),
        plot.subtitle = element_text(color = "darkred",vjust = - 15,hjust = 0.01 ))

dev.off()
```

To color the pca based on nu_j (so the cells' efficiency)

```{r}
nu_est = round(get.nu(object = obj), digits = 7)

plot.nu <-ggplot(ttm$pca_cells,aes(x=PC1,y=PC2, colour = log(nu_est)))

plot.nu = plot.nu + geom_point(size = 1,alpha= 0.8)+
  scale_color_gradient2(low = "#E64B35B2",mid =  "#4DBBD5B2", high =  "#3C5488B2" ,
                        midpoint = log(mean(nu_est)),name = "ln(nu)")+
  ggtitle("Cells PCA coloured by cells efficiency") +
  my_theme +  theme(plot.title = element_text(color = "#3C5488FF", size = 20),
                    legend.title=element_text(color = "#3C5488FF", size = 14,face = "italic"),
                    legend.text = element_text(color = "#3C5488FF", size = 11),
                    legend.key.width = unit(2, "mm"),
                    legend.position="right")

pdf(file.path(out_dir,"cleaning",paste(t,"_plots_PCA_efficiency_colored.pdf", sep = "")))
plot.nu
dev.off()

plot.nu
```

The next part is use to remove the cells with efficiency too low.
```{r}
nu_df = data.frame("nu"= sort(get.nu(obj)), "n"=c(1:length(get.nu(obj))))

ggplot(nu_df, aes(x = n, y=nu)) + 
    geom_point(colour = "#8491B4B2", size=1)+
    my_theme #+ ylim(0,1) + xlim(0,70)

```

We can zoom on the smallest values and, if we detect a clear elbow, we can decide to remove the cells.
```{r}
yset = 0.4#threshold to remove low UDE cells
plot.ude <- ggplot(nu_df, aes(x = n, y=nu)) + 
    geom_point(colour = "#8491B4B2", size=1) + 
    my_theme + ylim(0,1) + xlim(0,400) +
    geom_hline(yintercept=yset, linetype="dashed", color = "darkred") +
    annotate(geom="text", x=200, y=0.25, 
             label=paste("to remove cells with nu < ",yset,sep = " "), 
             color="darkred", size=4.5)


pdf(file.path(out_dir,"cleaning",paste(t,"_plots_efficiency.pdf", sep = "")))
plot.ude
dev.off()

plot.ude
```

We also save the defined threshold in the metadata and re run the estimation
```{r}
obj = add.row.to.meta(obj,c("Threshold low UDE cells:",yset)) 

to_rem = rownames(nu_df[which(nu_df$nu < yset),])

obj = drop.genes.cells(object = obj, genes = c(),cells =  to_rem)
```

Repeat the estimation after the cells are removed

```{r}
ttm = clean(obj)
obj = ttm$object
ttm$pca.cell.2
```

In case we do not want to remove anything, we can run:

```{r}
pdf(file.path(out_dir,"cleaning",paste(t,"_plots_efficiency.pdf", sep = "")))
ggplot(nu_df, aes(x = n, y=nu)) + geom_point(colour = "#8491B4B2", size=1) +my_theme + #xlim(0,100)+
  annotate(geom="text", x=50, y=0.25, label="nothing to remove ", color="darkred")
dev.off()
```

Just to check again, we plot the final efficiency colored PCA

```{r}
nu_est = round(get.nu(object = obj), digits = 7)
plot.nu <-ggplot(ttm$pca_cells,aes(x=PC1,y=PC2, colour = log(nu_est)))
plot.nu = plot.nu + geom_point(size = 2,alpha= 0.8)+
  scale_color_gradient2(low = "#E64B35B2",mid =  "#4DBBD5B2", high =  "#3C5488B2" ,
                        midpoint = log(mean(nu_est)),name = "ln(nu)")+
  ggtitle("Cells PCA coloured by cells efficiency: last") +
  my_theme +  theme(plot.title = element_text(color = "#3C5488FF", size = 20),
                    legend.title=element_text(color = "#3C5488FF", size = 14,face = "italic"),
                    legend.text = element_text(color = "#3C5488FF", size = 11),
                    legend.key.width = unit(2, "mm"),
                    legend.position="right")

pdf(file.path(out_dir,"cleaning",paste(t,"_plots_PCA_efficiency_colored_FINAL.pdf", sep = "")))
plot.nu
dev.off()

plot.nu
```

## COTAN analysis

In this part all the contingency tables are computed and used to get the statistics (S)
To storage efficiency of all the observed tables only the yes_yes is stored. 
Note that if will be necessary re-computing the yes-yes table, the yes-yes table should be cancelled before running cotan_analysis.

```{r}
obj = cotan_analysis(obj,cores = 2)
```

COEX evaluation and storing 
```{r}
obj = get.coex(obj)

# saving the structure 
saveRDS(obj,file = paste(out_dir,t,".cotan.RDS", sep = ""))
```

# Analysis of the elaborated data

## GDI

The next function can directly plot the GDI for the dataset with the 1.5 threshold (in red) and the two higher quantiles (in blue). 

```{r}
plot_GDI(obj, cond = "ERCC")
```

If a more complex plot is needed, or if we want to analyze more in detail the GDI data, we can get directly the GDI dataframe. 

```{r}
quant.p = get.GDI(obj)

head(quant.p)
```

In the third column of this dataframe is reported the percentage of cells expressing the gene.

Next we can define some gene sets (in this case three) that we want to specifically label in the GDI plot.

```{r echo=TRUE}
AA=c("ERCC-00012","ERCC-00013","ERCC-00014") 
BB=c("ERCC-00016","ERCC-00017","ERCC-00019")
CC=c("ERCC-00022","ERCC-00024","ERCC-00028")

text.size = 12

quant.p$highlight = with(quant.p, ifelse(rownames(quant.p) %in% AA, "AA",
                                                 ifelse(rownames(quant.p) %in% CC,"Constitutive" ,
                                                               ifelse(rownames(quant.p) %in% BB,"BB" , "normal"))))

textdf <- quant.p[rownames(quant.p) %in% c(AA,CC,BB), ]
mycolours <- c("Con" = "#00A087FF","AA"="#E64B35FF","BB"="#F39B7FFF","normal" = "#8491B4B2")
f1 = ggplot(subset(quant.p,highlight == "normal" ), aes(x=sum.raw.norm, y=GDI)) +  geom_point(alpha = 0.1, color = "#8491B4B2", size=2.5)
GDI_plot = f1 +  geom_point(data = subset(quant.p,highlight != "normal"  ), aes(x=sum.raw.norm, y=GDI, colour=highlight),size=2.5,alpha = 0.8) +
  geom_hline(yintercept=quantile(quant.p$GDI)[4], linetype="dashed", color = "darkblue") +
  geom_hline(yintercept=quantile(quant.p$GDI)[3], linetype="dashed", color = "darkblue") +
  geom_hline(yintercept=1.5, linetype="dotted", color = "red", size= 0.5) +
  scale_color_manual("Status", values = mycolours)  +
  scale_fill_manual("Status", values = mycolours)  +
  xlab("log normalized counts")+ylab("GDI")+
  geom_label_repel(data = textdf , aes(x=sum.raw.norm, y=GDI, label = rownames(textdf),fill=highlight),
                   label.size = NA, 
                   alpha = 0.5, 
                   direction ="both",
                   na.rm=TRUE,
                   seed = 1234) +
  geom_label_repel(data =textdf , aes(x=sum.raw.norm, y=GDI, label = rownames(textdf)),
                   label.size = NA, 
                   segment.color = 'black',
                   segment.size = 0.5,
                   direction = "both",
                   alpha = 0.8, 
                   na.rm=TRUE,
                   fill = NA,
                   seed = 1234) +
  theme(axis.text.x = element_text(size = text.size, angle = 0, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF" ),
        axis.text.y = element_text( size = text.size, angle = 0, hjust = 0, vjust = .5, face = "plain", colour ="#3C5488FF"),  
        axis.title.x = element_text( size = text.size, angle = 0, hjust = .5, vjust = 0, face = "plain", colour ="#3C5488FF"),
        axis.title.y = element_text( size = text.size, angle = 90, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF"),
        legend.title = element_blank(),
        legend.text = element_text(color = "#3C5488FF",face ="italic" ),
        legend.position = "bottom") 
legend <- cowplot::get_legend(GDI_plot)
GDI_plot =GDI_plot + theme(
        legend.position = "none") 
GDI_plot

```


## Heatmaps

For the Gene Pair Analysis, we can plot an heatmap with the coex values between two genes sets.
To do so we need to define, in a list, the different gene sets (list.genes). Then in the function parameter sets we can decide which sets need to be considered (in the example from 1 to 3). In the condition parameter we should insert an array with each file name prefix to be considered (in the example, the file names is "E17.5_cortex").


```{r}
list.genes = list("Ref.col"= BB, "AA"=AA, "Const."=CC )

plot_heatmap(df_genes = list.genes,sets = c(1:3),conditions = "ERCC",dir = out_dir)
```

We can also plot a general heatmap of coex values based on some markers like the following one.

```{r}
plot_general.heatmap(prim.markers = c("ERCC-00014","ERCC-00019"), condition = "ERCC",dir = out_dir, p_value = 0.05)
```
```{r}
plot_general.heatmap(prim.markers = c("ERCC-00014","ERCC-00019"),markers.list =c("ERCC-00084" ,"ERCC-00085" ,"ERCC-00086") ,symmetric = FALSE, condition = "ERCC",dir = out_dir, p_value = 0.05)
```

## Get data tables

Sometimes we can also be interested in the numbers present directly in the contingency tables for two specific genes. To get them we can use two functions:

1. get.observed.ct to produce the observed data
```{r}
 get.observed.ct(object = obj, g1 = "ERCC-00014",g2 = "ERCC-00019")
```

2. get.expected.ct to produce the expected values
```{r}
get.expected.ct(object = obj, g1 = "ERCC-00014",g2 = "ERCC-00019")
```

Another useful function is extract.coex. This can be used to extract the whole or a partial coex matrix from a COTAN object.  
```{r}
# For the whole matrix
coex <- extract.coex(object = obj)
coex[1:5,1:5]
```

```{r}
# For a partial matrix
coex <- extract.coex(object = obj,genes = c("ERCC-00017", "ERCC-00019", "ERCC-00024", "ERCC-00025", "ERCC-00028", "ERCC-00031"))
head(coex)
```


The next few lines are just to clean after compilation of the documentation.
```{r}
if (file.exists(paste(out_dir,t,".cotan.RDS", sep = ""))) {
  #Delete file if it exists
  file.remove(paste(out_dir,t,".cotan.RDS", sep = ""))
}
unlink(paste(out_dir,"cleaning", sep = ""),recursive = TRUE)

print(sessionInfo())
```