## ---- include = FALSE----------------------------------------------------
### Chunk options applied to the rest of the document: collapse = TRUE and
### comment = "#>" (the prefix placed in front of printed output lines).
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ----setup---------------------------------------------------------------
library(crso)

### Load the example dataset consisting of TCGA melanoma (SKCM) patients.
data("skcm")

### Copy every element of skcm.list into the global environment
### (this is the step that makes D, P and cnv.dictionary available).
list2env(skcm.list, envir = globalenv())

### Show the names of the objects that were just loaded.
names(skcm.list)

### Q is the penalty matrix derived from P.
Q <- log10(P)

## ------------------------------------------------------------------------
### NOTE(review): some values below (e.g. spr, trn) fall outside the
### recommended ranges quoted in their own comments -- presumably to keep this
### example fast; confirm before reusing them for a real analysis.

### Minimum percentage of samples covered by each rule in the rule library
### (default = .03).
rule.thresh <- 0.05

### Phase 1 random sets per rule (default = 40, recommend at least 40).
spr <- 4

### Phase 1 stop criteria (default = 16, recommend at most 24).
trn <- 40

### Maximum RS size (default is 12).
k.max <- 6

### Max RS stored per K in phases 2 and 3 (no default, recommend at least 10^4).
max.stored <- 1000

### Phase 2 max rs per K (default is 10^5, recommend at least 10^5).
max.nrs.ee <- 1000

### Max raw im size for phase 2
### (default is 10^6, recommend at least 10*max.nrs.ee).
max.compute <- 10 * max.nrs.ee

### Phase 3 max rs per K (default is 10^5, recommend at least 10^5).
max.nrs.borrow <- 1000

### Minimum assignment threshold per rule set (default is .03).
filter.thresh <- 0.03

### Generalized core parameters:

### Number GC iterations (default is 100).
num.gc.iter <- 10

### Rulesets evaluated per K per GC iter (default is 1000).
num.gc.eval <- 100

## ----phase 1-------------------------------------------------------------
### Fix the RNG seed before the calls below.
set.seed(100)

### Build the rule library.
rm.full <- buildRuleLibrary(D, rule.thresh)

### Run phase 1.
rm.ordered <- makePhaseOneOrderedRM(
  D, rm.full, spr, Q, trn,
  shouldPrint = TRUE
)

## ----phase 2-------------------------------------------------------------
### The pool size for each K is the number of rules considered for exhaustive
### evaluation in phase 2.
pool.sizes <- getPoolSizes(rm.ordered, k.max, max.nrs.ee, max.compute)

### Run phase 2.
### TIL stands for top index list. The output of phase two is a list of top
### index matrices, one for each K. Each index matrix contains the rule sets
### ordered by performance. For example, the best performing rule set of
### size 3 will be the first row of the K.3 index matrix. For K=1 the index
### matrix is actually a vector.
til.p2 <- makePhaseTwoImList(
  D, Q, rm.ordered, k.max, pool.sizes, max.stored,
  shouldPrint = TRUE
)

## ----phase 3-------------------------------------------------------------
### Make the TIL for phase 3 by updating the phase two TIL to consider
### neighbor rule sets.
til.p3 <- makePhaseThreeImList(
  D, Q, rm.ordered, til.p2, pool.sizes, max.stored, max.nrs.borrow,
  shouldPrint = TRUE
)

### Filter the phase 3 results to only include rule sets for which every rule
### is assigned to a minimum percentage of samples (default is 3%).
til.filtered <- makeFilteredImList(D, Q, rm.ordered, til.p3, filter.thresh)

### Get the top performance list (TPL), which contains the objective function
### score of all of the rule sets in til.filtered.
tpl.filtered <- evaluateListOfIMs(D, Q, rm.ordered, til.filtered)

## ----get core------------------------------------------------------------
### List of the best rule sets for all K.
best.rs.list <- getBestRsList(rm.ordered, tpl.filtered, til.filtered)

### Determine core K.
core.K <- getCoreK(D, rm.ordered, tpl.filtered, til.filtered)

### Extract the core rule set and display it.
core.ruleset <- getCoreRS(D, rm.ordered, tpl.filtered, til.filtered)
print(core.ruleset)

## ----gen core------------------------------------------------------------
### list.subset.cores is a list of core rule sets derived from subsampling
### iterations.
list.subset.cores <- makeSubCoreList(
  D, Q, rm.ordered, til.filtered, num.gc.iter, num.gc.eval
)

### Generalized core rules.
gcr.df <- getGCRs(list.subset.cores)
print(gcr.df)

### Generalized core duos.
gcd.df <- getGCDs(list.subset.cores)
print(gcd.df)

### Generalized core events.
gce.df <- getGCEs(list.subset.cores)
print(gce.df)