## ----setup, include = FALSE---------------------------------------------------
# Global chunk options for the vignette: collapse source and output into a
# single block and prefix printed output with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ----echo=TRUE, eval=FALSE----------------------------------------------------
#  
#  if(!requireNamespace("BiocManager", quietly = TRUE))
#      install.packages("BiocManager")
#  BiocManager::install("survClust")

## -----------------------------------------------------------------------------
library(survClust)
library(survival)
library(BiocParallel)

# Top-left 5 x 5 corner of the mutation data (first element of uvm_dat)
uvm_dat[[1]][seq_len(5), seq_len(5)]

# Top-left 5 x 5 corner of the copy number data (second element of uvm_dat)
uvm_dat[[2]][seq_len(5), seq_len(5)]

# First rows of the TCGA UVM clinical data
head(uvm_survdat)


## ----echo=TRUE, eval=TRUE-----------------------------------------------------

# Number of repeated cross-validation rounds run for every k
cv_rounds <- 10

# Run repeated cross-validation of survClust on the UVM data for a single k.
#
# Arguments:
#   kk       value of k forwarded to cv_survclust()
#   n_rounds number of cross-validation rounds; defaults to the global
#            cv_rounds (looked up when the function is called)
#   n_folds  fold count forwarded to cv_survclust(); defaults to 3
#
# Returns a list of length n_rounds; element i is the cv_survclust() result
# of round i. Reads uvm_dat and uvm_survdat from the calling environment
# chain and prints a progress line after every round.
uvm_all_cvrounds <- function(kk, n_rounds = cv_rounds, n_folds = 3) {
    # lapply() builds the whole result list at once instead of growing it
    # inside a loop, and keeps one slot per round even if a round returns
    # NULL (`fit[[i]] <- NULL` would silently drop that round).
    lapply(seq_len(n_rounds), function(i) {
        round_fit <- cv_survclust(uvm_dat, uvm_survdat, kk, n_folds)
        print(paste0("finished ", i, " rounds for k= ", kk))
        round_fit
    })
}


## ----echo=TRUE, eval=FALSE----------------------------------------------------
#  ptm <- Sys.time()
#  cv.fit<-bplapply(2:7, uvm_all_cvrounds)
#  ptm2 <- Sys.time()
#  
#  #> ptm
#  #[1] "2022-09-05 20:54:21 EDT"
#  #> ptm2
#  #[1] "2022-09-05 21:01:12 EDT"
#  
#  

## -----------------------------------------------------------------------------
# Dimensions of every data matrix held in uvm_dat
lapply(uvm_dat, dim)

## -----------------------------------------------------------------------------

# Component names of a single cross-validation result:
# first element of the fit list (k=2), 1st round of cross validation
names(uvm_survClust_cv.fit[[1]][[1]])


## ----fig.width=8, fig.height=8, fig.cap= "survClust analysis of TCGA UVM mutation and Copy Number data"----

# Summarise the cross-validation fits with getStats() and plot the result
# with labels 2:7.
# NOTE(review): uvm_survClust_cv.fit is not created in this script --
# presumably a precomputed object shipped with survClust; confirm that
# kk=7 / cvr=10 match how it was generated.
ss_stats <- getStats(uvm_survClust_cv.fit, kk=7, cvr=10)
plotStats(ss_stats, 2:7)

## ----message=FALSE, fig.cap= "survClust KM analysis for integrated TCGA UVM Mutation and Copy Number for k=4"----

# Derive class labels for k = 4 from the cross-validation fits; getDist()
# is evaluated on the full integrated data and handed to cv_voting().
k4 <- cv_voting(uvm_survClust_cv.fit, getDist(uvm_dat, uvm_survdat), pick_k=4)
# Number of samples per cluster
table(k4)

# Kaplan-Meier curves stratified by cluster; columns 1 and 2 of uvm_survdat
# are used as the time and status arguments of Surv(), one colour per cluster
plot(survfit(Surv(uvm_survdat[,1], uvm_survdat[,2])~k4), mark.time=TRUE, col=1:4)

## -----------------------------------------------------------------------------

# Fisher's exact test of every column of the mutation matrix against the
# k = 4 class labels; one p-value per column
mut_k4_test <- apply(
    uvm_dat[[1]],
    MARGIN = 2,
    FUN = function(mut_status) fisher.test(mut_status, k4)$p.value
)
# Smallest p-values after multiple-testing adjustment (Holm is the
# p.adjust() default)
head(sort(p.adjust(mut_k4_test, method = "holm")))

## ----echo = FALSE, fig.cap="TCGA UVM mutation features for k=4"---------------
# Embed the pre-rendered figure uvm_mut_example.png as an <img> tag; the file
# is passed through knitr::image_uri() so the image source is a URI rather
# than a relative path. The style string sets automatic left/right margins.
htmltools::img(src = knitr::image_uri("uvm_mut_example.png"), 
               style = 'margin-left: auto;margin-right: auto')

## ----fig.width=5, fig.height=6, fig.cap= "TCGA UVM Copy Number k=4"-----------

# Heatmap of the copy number data with samples ordered by their k4 label.

# Work on a copy of the copy number matrix and clamp values to [-1.5, 1.5]
# (presumably so that extreme values do not dominate the colour scale)
cn_imagedata <- uvm_dat[[2]]
cn_imagedata[cn_imagedata < -1.5] <- -1.5
cn_imagedata[cn_imagedata > 1.5] <- 1.5

# Sort rows by cluster label, then reverse the column order. image() draws
# columns from bottom to top, so the originally first column ends up on top.
oo <- order(k4)
cn_imagedata <- cn_imagedata[oo,]
cn_imagedata <- cn_imagedata[,ncol(cn_imagedata):1]
#image(cn_imagedata,col=gplots::bluered(50),axes=F)

#image y labels - chr names
# The chromosome name is taken as the part of each column name before the
# first "."; tt counts the columns per chromosome.
cnames <- colnames(cn_imagedata)
cnames <- unlist(lapply(strsplit(cnames, "\\."), function(x) x[1]))
tt <- table(cnames)
nn <- paste0("chr",1:22)

# One label slot per column; only the first column of each chromosome gets a
# label, all other slots stay NA
chr.labels <- rep(NA, length(cnames))

index <- 1
chr.labels[1] <- "1"

# Advance index by the column count of chromosome i to reach the first
# column of chromosome i+1 and store that chromosome's number there.
# NOTE(review): assumes the un-reversed columns are grouped by chromosome in
# chr1..chr22 order and that every chromosome occurs in tt -- confirm.
for(i in seq_len(length(nn)-1)) {
    index <- index + tt[nn[i]]
    chr.labels[index] <- gsub("chr","",nn[i+1])
}

# Column positions that carry a chromosome label
idx <- which(!(is.na(chr.labels)))

image(cn_imagedata,col=gplots::bluered(50),axes=FALSE)

# Positions were counted in the original column order; 1 - fraction maps them
# onto the reversed y axis of the image
axis(2, at = 1 - (idx/length(cnames)), labels = chr.labels[idx], las=1, cex.axis=0.8)
# Vertical lines at the cumulative cluster proportions separate the k4 groups
abline(v = c(cumsum(prop.table(table(k4)))))
abline(h=c(0,1))

## -----------------------------------------------------------------------------

# Run repeated cross-validation of survClust on the simulated data for a
# single k.
#
# Arguments:
#   kk       value of k forwarded to cv_survclust()
#   n_rounds number of cross-validation rounds; defaults to the global
#            cv_rounds (looked up when the function is called)
#   n_folds  fold count forwarded to cv_survclust(); defaults to 3
#
# Returns a list of length n_rounds; element i is the cv_survclust() result
# of round i. Reads simdat and simsurvdat from the calling environment chain
# and prints a progress line after every round.
sim_cvrounds <- function(kk, n_rounds = cv_rounds, n_folds = 3) {
    # lapply() builds the whole result list at once instead of growing it
    # inside a loop, and keeps one slot per round even if a round returns
    # NULL (`fit[[i]] <- NULL` would silently drop that round).
    lapply(seq_len(n_rounds), function(i) {
        round_fit <- cv_survclust(simdat, simsurvdat, kk, n_folds)
        print(paste0("finished ", i, " rounds for k= ", kk))
        round_fit
    })
}


# Run the cross-validation for k = 2..7: bplapply() (BiocParallel) calls
# sim_cvrounds() once per k and collects the results in a list. The wall
# clock time is recorded before and after the run.
ptm <- Sys.time()
sim_cv.fit<-bplapply(2:7, sim_cvrounds)
ptm2 <- Sys.time()

# Print the start and end timestamps
ptm
ptm2

## ----fig.width=8, fig.height=8, fig.cap= "survClust analysis of simulated dataset"----

# Summarise the simulated-data cross-validation fits and plot the result with
# labels 2:7. cvr is tied to cv_rounds because sim_cv.fit was generated with
# exactly cv_rounds rounds per k; a hard-coded 10 would go stale as soon as
# cv_rounds is changed.
ss_stats <- getStats(sim_cv.fit, kk = 7, cvr = cv_rounds)
plotStats(ss_stats, 2:7)

## ----message=FALSE, fig.cap= "survClust k=3 class labels KM analysis for simulated dataset "----

# Derive class labels for k = 3 from the simulated-data cross-validation fits
k3 <- cv_voting(sim_cv.fit, getDist(simdat, simsurvdat), pick_k = 3)

# Reference labels: three consecutive groups of 50 samples each
sim_class_labels <- rep(c(1, 2, 3), each = 50)

# Cross-tabulate the survClust labels against the reference labels
table(k3, sim_class_labels)

# Kaplan-Meier curves stratified by the survClust labels
plot(
    survfit(Surv(simsurvdat[, 1], simsurvdat[, 2]) ~ k3),
    mark.time = TRUE,
    col = 1:3
)

## -----------------------------------------------------------------------------

# Run repeated cross-validation of survClust on the mutation-only data for a
# single k.
#
# Arguments:
#   kk       value of k forwarded to cv_survclust()
#   n_rounds number of cross-validation rounds; defaults to the global
#            cv_rounds (looked up when the function is called)
#   n_folds  fold count forwarded to cv_survclust(); defaults to 3
#
# Returns a list of length n_rounds; element i is the cv_survclust() result
# of round i. Reads uvm_mut_dat and uvm_survdat from the calling environment
# chain (so uvm_mut_dat must exist before the first call) and prints a
# progress line after every round.
cvrounds_mut <- function(kk, n_rounds = cv_rounds, n_folds = 3) {
    # lapply() builds the whole result list at once instead of growing it
    # inside a loop, and keeps one slot per round even if a round returns
    # NULL (`fit[[i]] <- NULL` would silently drop that round).
    lapply(seq_len(n_rounds), function(i) {
        round_fit <- cv_survclust(uvm_mut_dat, uvm_survdat, kk, n_folds,
                                  type = "mut")
        print(paste0("finished ", i, " rounds for k= ", kk))
        round_fit
    })
}

# Wrap the mutation matrix in a one-element list so that it can be analysed
# on its own
uvm_mut_dat <- list(uvm_dat[[1]])

# Cross-validate for k = 2..7 via bplapply(), recording start and end times
ptm <- Sys.time()
uvm_mut_cv.fit <- bplapply(2:7, cvrounds_mut)
ptm2 <- Sys.time()


## ----fig.width=8, fig.height=8, fig.cap= "survClust analysis of TCGA UVM mutation data alone"----

# Summarise the mutation-only cross-validation fits and plot the result with
# labels 2:7. cvr is tied to cv_rounds because uvm_mut_cv.fit was generated
# with exactly cv_rounds rounds per k; a hard-coded 10 would go stale as soon
# as cv_rounds is changed.
ss_stats <- getStats(uvm_mut_cv.fit, kk = 7, cvr = cv_rounds)
plotStats(ss_stats, 2:7)

## ----fig.width=4, fig.height=4, message=FALSE, fig.cap= "survClust k=4 class labels KM analysis for TCGA UVM mutation data alone"----

# Derive class labels for k = 4 from the mutation-only cross-validation fits.
# Note: this overwrites the k4 labels computed earlier from the integrated
# (mutation + copy number) data.
k4 <- cv_voting(uvm_mut_cv.fit, getDist(uvm_mut_dat, uvm_survdat), pick_k=4)
# Kaplan-Meier curves stratified by cluster; col=2:5 supplies four colours
plot(survfit(Surv(uvm_survdat[,1], uvm_survdat[,2]) ~ k4), mark.time=TRUE, col=2:5)

## -----------------------------------------------------------------------------

# Fisher's exact test of every column of the mutation matrix against the
# current k4 class labels; one p-value per column
mut_k4_test <- apply(
    uvm_mut_dat[[1]],
    MARGIN = 2,
    FUN = function(mut_status) fisher.test(mut_status, k4)$p.value
)
# Smallest p-values after multiple-testing adjustment (Holm is the
# p.adjust() default)
head(sort(p.adjust(mut_k4_test, method = "holm")))

## ----eval=FALSE, echo=TRUE----------------------------------------------------
#  
#  # DO NOT RUN. Use provided dataset
#  #Process mutation maf data
#  #Download data from - https://gdc.cancer.gov/about-data/publications/pancanatlas
#  
#  maf <- data.table::fread("mc3.v0.2.8.PUBLIC.maf.gz", header = TRUE)
#  maf_filter <- maf %>% filter(FILTER == "PASS",
#                              Variant_Classification != "Silent")
#  
#  # few lines of code in tidyR to convert maf to a binary file
#  maf_binary <- maf_filter %>%
#      select(Tumor_Sample_Barcode, Hugo_Symbol) %>%
#      distinct() %>%
#      pivot_wider(names_from = "Hugo_Symbol",
#                  values_from = 'Hugo_Symbol',
#                  values_fill = 0, values_fn = function(x) 1)
#  
#  maf_binary$tcga_short <- substr(maf_binary$Tumor_Sample_Barcode, 1, 12)
#  
#  # Process clinical file
#  tcga_clin <- readxl::read_excel("TCGA-CDR-SupplementalTableS1.xlsx", sheet=1, col_names = TRUE)
#  
#  uvm_clin <- tcga_clin %>% filter(type == "UVM")
#  uvm_maf_binary <- maf_binary %>%
#      filter(tcga_short %in% uvm_clin$bcr_patient_barcode) %>%
#      select(-Tumor_Sample_Barcode)
#  rnames <- uvm_maf_binary$tcga_short
#  
#  uvm_maf <- uvm_maf_binary %>% select(-tcga_short) %>%
#      apply(., 2, as.numeric)
#  
#  # Remove singletons
#  gene_sum <- apply(uvm_maf,2,sum)
#  idx <- which(gene_sum > 1)
#  
#  uvm_maf <- uvm_maf[,idx]
#  rownames(uvm_maf) <- rnames
#  
#  
#  uvm_survdat <- uvm_clin %>% select(OS.time, OS) %>%
#      apply(., 2, as.numeric)
#  
#  rownames(uvm_survdat) <- uvm_clin$bcr_patient_barcode
#  
#  # process CN
#  library(cluster) # pam function to derive medoids
#  library(GenomicRanges) #interval overlap to remove CNV
#  library(iClusterPlus)
#  
#  seg <- read.delim(file="broad.mit.edu_PANCAN_Genome_Wide_SNP_6_whitelisted.seg", header=TRUE,sep="\t", as.is=TRUE)
#  
#  pp <- substr(seg$Sample,13,16)
#  seg.idx <- c(grep("-01A",pp),grep("-01B",pp),grep("-03A",pp))
#  
#  #only take tumors
#  seg.idx <- c(grep("-01A",pp),grep("-01B",pp))
#  seg <- seg[seg.idx,]
#  
#  seg$Sample <- substr(seg[,1],1,12)
#  
#  uvm_seg <- seg[seg$Sample %in% uvm_clin$bcr_patient_barcode,]
#  
#  colnames(uvm_seg) <- c("Sample", "Chromosome", "Start", "End", "Num_Probes", "Segment_Mean")
#  
#  # pass epsilon as 0.001 default or user
#  reducedMseg <- CNregions(uvm_seg,epsilon=0.001,adaptive=FALSE,rmCNV=FALSE, cnv=NULL, frac.overlap=0.5, rmSmallseg=TRUE, nProbes=75)
#  
#  uvm_dat <- list(uvm_mut = uvm_maf, uvm_cn = reducedMseg)
#  

## ----echo=TRUE, eval=FALSE----------------------------------------------------
#  set.seed(112)
#  n1 <- 50 #class1
#  n2 <- 50 #class2
#  n3 <- 50 #class3
#  n <- n1+n2+n3
#  p <- 15 #survival related features (10%)
#  q <- 120 #noise
#  
#  #class1 ~ N(1.5,1), class2 ~ N(0,1), class3 ~ N(-1.5,1)
#  
#  sample_names <- paste0("S",1:n)
#  feature_names <- paste0("features", 1:n)
#  
#  #final matrix
#  x_big <- NULL
#  
#  ################
#  # sample 15 informative features
#  
#  #simulating class1
#  x1a <- matrix(rnorm(n1*p, 1.5, 1), ncol=p)
#  
#  #simulating class2
#  x2a <- matrix(rnorm(n2*p), ncol=p)
#  
#  
#  #simulating class3
#  x3a <- matrix(rnorm(n3*p, -1.5,1), ncol=p)
#  
#  #this concludes the part of the matrix shaded in red -
#  #corresponding to features related to survival and molecularly distinct
#  xa <- rbind(x1a,x2a,x3a)
#  
#  ################
#  # sample 15 other informative features, but scramble them.
#  
#  permute.idx<-sample(1:length(sample_names),length(sample_names))
#  
#  x1b <- matrix(rnorm(n1*p, 1.5, 1), ncol=p)
#  x2b <- matrix(rnorm(n2*p), ncol=p)
#  x3b <- matrix(rnorm(n3*p, -1.5,1), ncol=p)
#  
#  #this concludes the part of the matrix shaded in blue -
#  #containing the molecularly distinct features that are not related to survival
#  xb <- rbind(x1b,x2b,x3b)
#  
#  
#  #this concludes the area shaded in grey, which corresponds to noise
#  xc <- matrix(rnorm(n*q), ncol=q)
#  
#  x_big <- cbind(xa,xb[permute.idx,], xc)
#  
#  rownames(x_big) <- sample_names
#  colnames(x_big) <- feature_names
#  simdat <- list()
#  simdat[[1]] <- x_big
#  
#  #the three classes will have a median survival of 4.5, 3.25 and 2 yrs respectively
#  set.seed(112)
#  med_surv_class1 <- log(2)/4.5
#  med_surv_class2 <- log(2)/3.25
#  med_surv_class3 <- log(2)/2
#  
#  surv_dist_class1 <- rexp(n1,rate=med_surv_class1)
#  censor_events_class1 <- runif(n1,0,10)
#  
#  surv_dist_class2 <- rexp(n2,rate=med_surv_class2)
#  censor_events_class2 <- runif(n2,0,10)
#  
#  surv_dist_class3 <- rexp(n3,rate=med_surv_class3)
#  censor_events_class3 <- runif(n3,0,10)
#  
#  surv_time_class1 <- pmin(surv_dist_class1,censor_events_class1)
#  surv_time_class2 <- pmin(surv_dist_class2,censor_events_class2)
#  surv_time_class3 <- pmin(surv_dist_class3,censor_events_class3)
#  
#  event <- c((surv_time_class1==surv_dist_class1),
#            (surv_time_class2==surv_dist_class2),
#            (surv_time_class3==surv_dist_class3))
#  
#  time <- c(surv_time_class1, surv_time_class2, surv_time_class3)
#  
#  survdat <- cbind(time, event)
#  
#  simsurvdat <- cbind(time, event)
#  rownames(simsurvdat) <- sample_names

## -----------------------------------------------------------------------------
# Print the R version and the attached/loaded package versions used for this run
sessionInfo()