## ----setup, cache = F, echo = FALSE-------------------------------------------
knitr::opts_chunk$set(error = TRUE,warning = FALSE)

## ---- eval=FALSE--------------------------------------------------------------
#  if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#  
#  BiocManager::install("KnowSeq")
#  
#  library(KnowSeq)

## ---- eval=FALSE--------------------------------------------------------------
#  # Downloading one series from NCBI/GEO and one series from ArrayExpress
#  downloadPublicSeries(c("GSE74251"))
#  
#  # Using read.csv for NCBI/GEO files (read.csv2 for ArrayExpress files)
#  GSE74251csv <- read.csv("ReferenceFiles/GSE74251.csv")
#  
#  # Performing the alignment of the samples by using hisat2 aligner
#  rawAlignment(GSE74251csv,downloadRef=TRUE,downloadSamples=TRUE,BAMfiles = TRUE,
#  SAMfiles = TRUE,countFiles = TRUE,referenceGenome = 38, fromGDC = FALSE, customFA = "",
#  customGTF = "", tokenPath = "", manifest = "")

## ---- eval=FALSE--------------------------------------------------------------
#  # GDC portal controlled data processing with automatic raw data download
#  
#  rawAlignment(x, downloadRef=TRUE, downloadSamples=TRUE, fromGDC = TRUE,
#  tokenPath = "~/pathToToken", manifestPath = "~/pathToManifest")

## ----Preparing_data-----------------------------------------------------------
suppressMessages(library(KnowSeq))

dir <- system.file("extdata", package="KnowSeq")

# Using read.csv for NCBI/GEO files and read.csv2 for ArrayExpress files

GSE74251 <- read.csv(paste(dir,"GSE74251.csv",sep = "/"))
GSE81593 <- read.csv(paste(dir,"GSE81593.csv",sep = "/"))

# Creating the csv file with the information about the counts files location and the labels
Run <- GSE74251$Run
Path <- paste(dir,"/countFiles/",GSE74251$Run,sep = "")
Class <- rep("Tumor", length(GSE74251$Run))
GSE74251CountsInfo <-  data.frame(Run = Run, Path = Path, Class = Class)

Run <- GSE81593$Run
Path <- paste(dir,"/countFiles/",GSE81593$Run,sep = "")
Class <- rep("Control", length(GSE81593$Run))
GSE81593CountsInfo <-  data.frame(Run = Run, Path = Path, Class = Class)

mergedCountsInfo <- rbind(GSE74251CountsInfo, GSE81593CountsInfo)

write.csv(mergedCountsInfo, file = "mergedCountsInfo.csv")



## ---- eval=FALSE--------------------------------------------------------------
#  dir <- system.file("script", package="KnowSeq")
#  
#  # Code to execute the example script
#  source(paste(dir,"/KnowSeqExample.R",sep=""))
#  
#  
#  # Code to edit the example script
#  file.edit(paste(dir,"/KnowSeqExample.R",sep=""))
#  

## ----Merging_files------------------------------------------------------------
# Merging in one matrix all the count files indicated inside the CSV file

countsInformation <- countsToMatrix("mergedCountsInfo.csv", extension = "count")

# Exporting to independent variables the counts matrix and the labels

countsMatrix <- countsInformation$countsMatrix

labels <- countsInformation$labels

## ----Downloading_annotation---------------------------------------------------
# Downloading human annotation
myAnnotation <- getGenesAnnotation(rownames(countsMatrix))

# Downloading mus musculus annotation
myAnnotationMusMusculus <- getGenesAnnotation(rownames(countsMatrix), 
notHSapiens = TRUE,notHumandataset = "mm129s1svimj_gene_ensembl")


## ----Calculating_gene_expression----------------------------------------------
# Calculating gene expression values matrix using the counts matrix

expressionMatrix <- calculateGeneExpressionValues(countsMatrix,myAnnotation,
genesNames = TRUE)

## ----Plotting_boxplot---------------------------------------------------------
# Plotting the boxplot of the expression of each samples for all the genes

dataPlot(expressionMatrix,labels,mode = "boxplot", toPNG = TRUE,
toPDF = TRUE)

## ----Removing_batch_effect_combat---------------------------------------------
# Removing batch effect by using combat and known batch groups 

batchGroups <- c(1,1,1,1,2,2,1,2,1,2)

expressionMatrixCorrected <- batchEffectRemoval(expressionMatrix, labels, 
batchGroups = batchGroups, method = "combat")


## ----Removing_batch_effect_sva------------------------------------------------
# Calculating the surrogate variable analysis to remove batch effect

expressionMatrixCorrected <- batchEffectRemoval(expressionMatrix, labels, method = "sva")

## ---- DEGs without batch------------------------------------------------------
# Extracting DEGs that pass the imposed restrictions

DEGsInformation <- DEGsExtraction(expressionMatrixCorrected, labels,
lfc = 1.0, pvalue = 0.01, number = 100, cov = 1)

topTable <- DEGsInformation$DEG_Results$DEGs_Table

DEGsMatrix <- DEGsInformation$DEG_Results$DEGs_Matrix

## ----Plotting_orderded_boxplot------------------------------------------------
# Plotting the expression of the first 12 DEGs for each of the samples in an ordered way

dataPlot(DEGsMatrix[1:12,],labels,mode = "orderedBoxplot",toPNG = FALSE,toPDF = FALSE)

## ----Plotting_genes_boxplot---------------------------------------------------
# Plotting the expression of the first 12 DEGs separatelly for all the samples

dataPlot(DEGsMatrix[1:12,],labels,mode = "genesBoxplot",toPNG = FALSE,toPDF = FALSE)

## ----Plotting_heatmap---------------------------------------------------------
# Plotting the heatmap of the first 12 DEGs separatelly for all the samples

dataPlot(DEGsMatrix[1:12,],labels,mode = "heatmap",toPNG = FALSE,toPDF = FALSE)

## ----Assessing_machine_learning_step------------------------------------------
DEGsMatrixML <- t(DEGsMatrix)

# Feature selection process with mRMR and RF 
mrmrRanking <- featureSelection(DEGsMatrixML,labels,colnames(DEGsMatrixML), mode = "mrmr")
rfRanking <- featureSelection(DEGsMatrixML,labels,colnames(DEGsMatrixML), mode = "rf")
daRanking <- featureSelection(DEGsMatrixML,labels,colnames(DEGsMatrixML), mode = "da", disease = "breast")

# CV functions with k-NN, SVM and RF
results_cv_knn <- knn_trn(DEGsMatrixML,labels,names(mrmrRanking)[1:10],5)

results_cv_svm <- svm_trn(DEGsMatrixML,labels,rfRanking[1:10],5)

results_cv_rf <- rf_trn(DEGsMatrixML,labels,names(daRanking)[1:10],5)


## ----Plotting_ML_acc----------------------------------------------------------
# Plotting the accuracy of all the folds evaluated in the CV process

dataPlot(rbind(results_cv_knn$accuracy$meanAccuracy,results_cv_knn$accuracy$standardDeviation),mode = "classResults", legend = c("Mean Acc","Standard Deviation"),
main = "Mean Accuracy with k-NN", xlab = "Genes", ylab = "Accuracy")

## ----Plotting_ML_sens---------------------------------------------------------
# Plotting the sensitivity of all the folds evaluated in the CV process

dataPlot(rbind(results_cv_knn$sensitivity$meanSensitivity,results_cv_knn$sensitivity$standardDeviation),mode = "classResults", legend = c("Mean Sens","Standard Deviation"),
main = "Mean Sensitivity with k-NN", xlab = "Genes", ylab = "Sensitivity")

## ----Plotting_ML_spec---------------------------------------------------------
# Plotting the specificity of all the folds evaluated in the CV process

dataPlot(rbind(results_cv_knn$specificity$meanSpecificity,results_cv_knn$specificity$standardDeviation),mode = "classResults", legend = c("Mean Spec","Standard Deviation"),
main = "Mean Specificity with k-NN", xlab = "Genes", ylab = "Specificity")

## ----Plotting_ML_Heatmap------------------------------------------------------
# Plotting all the metrics depending on the number of used DEGs in the CV process

dataPlot(results_cv_knn,labels,mode = "heatmapResults")

## ----Plotting_ML_confs--------------------------------------------------------
# Plotting the confusion matrix with the sum of the confusion matrices 
# of each folds evaluated in the CV process

allCfMats <- results_cv_knn$cfMats[[1]]$table + results_cv_knn$cfMats[[2]]$table + 
results_cv_knn$cfMats[[3]]$table + results_cv_knn$cfMats[[4]]$table + 
results_cv_knn$cfMats[[5]]$table

dataPlot(allCfMats,labels,mode = "confusionMatrix")


## ----Assessing_ML_test--------------------------------------------------------
# Test functions with k-NN, SVM and RF
trainingMatrix <- DEGsMatrixML[c(1:4,6:9),]
trainingLabels <- labels[c(1:4,6:9)]
testMatrix <- DEGsMatrixML[c(5,10),]
testLabels <- labels[c(5,10)]

results_test_knn <- knn_test(trainingMatrix, trainingLabels, testMatrix,
testLabels, names(mrmrRanking)[1:10], bestK = results_cv_knn$bestK)

results_test_svm <- svm_test(trainingMatrix, trainingLabels, testMatrix,
testLabels, rfRanking[1:10], bestParameters = results_cv_svm$bestParameters)

results_test_rf <- rf_test(trainingMatrix, trainingLabels, testMatrix,
testLabels, colnames(DEGsMatrixML)[1:10], results_cv_rf$bestParameters)

## ----Plotting_ML_acc_test-----------------------------------------------------
# Plotting the accuracy achieved in the test process

dataPlot(results_test_knn$accVector,mode = "classResults",
main = "Accuracy with k-NN", xlab = "Genes", ylab = "Accuracy")

dataPlot(results_test_svm$accVector,mode = "classResults",
main = "Accuracy with SVM", xlab = "Genes", ylab = "Accuracy")

dataPlot(results_test_rf$accVector,mode = "classResults",
main = "Accuracy with RF", xlab = "Genes", ylab = "Accuracy")

## ----Retrieving_GOs, message=FALSE--------------------------------------------
# Retrieving the GO information from the three different ontologies

GOsList <- geneOntologyEnrichment(names(mrmrRanking)[1:10], geneType='GENE_SYMBOL', pvalCutOff=0.1)

## ----Downloading_pathways eval------------------------------------------------
# Retrieving the pathways related to the DEGs
paths <- DEGsToPathways(names(mrmrRanking)[1:10])
print(paths)


## ----Downloading_diseases-----------------------------------------------------
# Downloading the information about the DEGs related diseases

diseasesTargetValidation <- DEGsToDiseases(rownames(DEGsMatrix)[1:5], getEvidences = FALSE)

diseasesTargetValidation


## ---- eval=FALSE--------------------------------------------------------------
#  # Performing the automatic and intelligent KnowSeq report
#  knowseqReport(expressionMatrix,labels,'knowSeq-report',clasifAlgs=c('knn'),disease='breast-cancer', lfc = 2, pvalue = 0.001, geneOntology = T, getPathways = T)

## ----Session_info-------------------------------------------------------------
sessionInfo()