## ---- eval = FALSE------------------------------------------------------------
#  if (!requireNamespace("BiocManager", quietly=TRUE))
#      install.packages("BiocManager")
#  BiocManager::install(("TCGAbiolinksGUI.data")

## ---- eval = FALSE------------------------------------------------------------
#  BiocManager::install("BioinformaticsFMRP/TCGAbiolinksGUI.data")

## ---- eval = TRUE-------------------------------------------------------------
library(TCGAbiolinksGUI.data)

## ---- eval=FALSE, include=TRUE------------------------------------------------
#  # Defining parameters
#  getGDCdisease <- function(){
#    projects <- TCGAbiolinks:::getGDCprojects()
#    projects <- projects[projects$id != "FM-AD",]
#    disease <-  projects$project_id
#    idx <- grep("disease_type",colnames(projects))
#    names(disease) <-  paste0(projects[[idx]], " (",disease,")")
#    disease <- disease[sort(names(disease))]
#    return(disease)
#  }

## -----------------------------------------------------------------------------
data(GDCdisease)
DT::datatable(as.data.frame(GDCdisease))

## ---- eval=FALSE, include=TRUE------------------------------------------------
#  getMafTumors <- function(){
#    root <- "https://gdc-api.nci.nih.gov/data/"
#    maf <- fread("https://gdc-docs.nci.nih.gov/Data/Release_Notes/Manifests/GDC_open_MAFs_manifest.txt",
#                 data.table = FALSE, verbose = FALSE, showProgress = FALSE)
#    tumor <- unlist(lapply(maf$filename, function(x){unlist(str_split(x,"\\."))[2]}))
#    proj <- TCGAbiolinks:::getGDCprojects()
#  
#    disease <-  gsub("TCGA-","",proj$project_id)
#    idx <- grep("disease_type",colnames(proj))
#    names(disease) <-  paste0(proj[[idx]], " (",proj$project_id,")")
#    disease <- sort(disease)
#    ret <- disease[disease %in% tumor]
#    return(ret)
#  }

## -----------------------------------------------------------------------------
data(maf.tumor)
DT::datatable(as.data.frame(maf.tumor))

## ---- eval=FALSE, include=TRUE------------------------------------------------
#  library(readr)
#  library(readxl)
#  library(dplyr)
#  library(caret)
#  library(randomForest)
#  library(doMC)
#  library(e1071)
#  
#  # Control random number generation
#  set.seed(210) # set a seed to RNG
#  
#  # register number of cores to be used for parallel evaluation
#  registerDoMC(cores = parallel::detectCores())

## ---- eval=FALSE, include=TRUE------------------------------------------------
#  file <- "https://tcga-data.nci.nih.gov/docs/publications/lgggbm_2016/LGG.GBM.meth.txt"
#  if(!file.exists(basename(file))) downloader::download(file,basename(file))
#  LGG.GBM <- as.data.frame(readr::read_tsv(basename(file)))
#  rownames(LGG.GBM) <- LGG.GBM$Composite.Element.REF
#  idx <- grep("TCGA",colnames(LGG.GBM))
#  colnames(LGG.GBM)[idx] <- substr(colnames(LGG.GBM)[idx], 1, 12) # reduce complete barcode to sample identifier (first 12 characters)

## ---- eval=FALSE, include=TRUE------------------------------------------------
#  file <- "http://www.cell.com/cms/attachment/2045372863/2056783242/mmc2.xlsx"
#  if(!file.exists(basename(file))) downloader::download(file,basename(file))
#  metadata <-  readxl::read_excel(basename(file), sheet = "S1A. TCGA discovery dataset", skip = 1)
#  DT::datatable(metadata[,c("Case",
#                            "Pan-Glioma DNA Methylation Cluster",
#                            "Supervised DNA Methylation Cluster",
#                            "IDH-specific DNA Methylation Cluster")])

## ---- eval=FALSE, include=TRUE------------------------------------------------
#  file <- "http://zwdzwd.io/InfiniumAnnotation/current/EPIC/EPIC.manifest.hg38.rda"
#  if(!file.exists(basename(file))) downloader::download(file,basename(file))
#  load(basename(file))

## ---- eval=FALSE, include=TRUE------------------------------------------------
#  file <- "https://tcga-data.nci.nih.gov/docs/publications/lgggbm_2015/PanGlioma_MethylationSignatures.xlsx"
#  if(!file.exists(basename(file))) downloader::download(file,basename(file))

## ---- eval=FALSE, include=TRUE------------------------------------------------
#  sheet <- "1,300 pan-glioma tumor specific"
#  trainingset <- grep("mut|wt",unique(metadata$`Pan-Glioma DNA Methylation Cluster`),value = T)
#  trainingcol <- c("Pan-Glioma DNA Methylation Cluster")

## ---- eval=FALSE, include=TRUE------------------------------------------------
#  plat <- "EPIC"
#  signature.probes <-  read_excel("PanGlioma_MethylationSignatures.xlsx",  sheet = sheet)  %>% pull(1)
#  samples <- dplyr::filter(metadata, `IDH-specific DNA Methylation Cluster` %in% trainingset)
#  RFtrain <- LGG.GBM[signature.probes, colnames(LGG.GBM) %in% as.character(samples$Case)] %>% na.omit

## ---- eval=FALSE, include=TRUE------------------------------------------------
#  RFtrain <- RFtrain[!EPIC.manifest.hg38[rownames(RFtrain)]$MASK.general,]

## ---- eval=FALSE, include=TRUE------------------------------------------------
#  trainingdata <- t(RFtrain)
#  trainingdata <- merge(trainingdata, metadata[,c("Case", trainingcol[model])], by.x=0,by.y="Case", all.x=T)
#  rownames(trainingdata) <- as.character(trainingdata$Row.names)
#  trainingdata$Row.names <- NULL

## ---- eval=FALSE, include=TRUE------------------------------------------------
#  nfeat <- ncol(trainingdata)
#  trainingdata[,trainingcol] <-  factor(trainingdata[,trainingcol])
#  mtryVals <- floor(sqrt(nfeat))
#  for(i in floor(seq(sqrt(nfeat), nfeat/2, by = 2 * sqrt(nfeat)))) {
#    print(i)
#    x <- as.data.frame(tuneRF(trainingdata[,-grep(trainingcol[model],colnames(trainingdata))],
#                              trainingdata[,trainingcol[model]],
#                              stepFactor=2,
#                              plot= FALSE,
#                              mtryStart = i))
#    mtryVals <- unique(c(mtryVals, x$mtry[which (x$OOBError  == min(x$OOBError))]))
#  }
#  mtryGrid <- data.frame(.mtry=mtryVals)

## ---- eval=FALSE, include=TRUE------------------------------------------------
#  fitControl <- trainControl(method = "repeatedcv",
#                             number = 10,
#                             verboseIter = TRUE,
#                             repeats = 10)

## ---- eval=FALSE, include=TRUE------------------------------------------------
#  glioma.idh.model <- train(y = trainingdata[,trainingcol], # variable to be trained on
#                            x = trainingdata[,-grep(trainingcol,colnames(trainingdata))], # Daat labels
#                            data = trainingdata, # Data we are using
#                            method = "rf", # Method we are using
#                            trControl = fitControl, # How we validate
#                            ntree = 5000, # number of trees
#                            importance = TRUE,
#                            tuneGrid = mtryGrid, # set mtrys, the value that procuded a better model is used
#  )

## -----------------------------------------------------------------------------
data(glioma.idh.model)
glioma.idh.model

## ---- eval=FALSE, include=TRUE------------------------------------------------
#  sheet <- "1,308 IDHmutant tumor specific "
#  trainingset <- grep("mut",unique(metadata$`IDH-specific DNA Methylation Cluster`),value = T)
#  trainingcol <- "IDH-specific DNA Methylation Cluster"

## -----------------------------------------------------------------------------
data(glioma.idhmut.model)
glioma.idhmut.model

## ---- eval=FALSE, include=TRUE------------------------------------------------
#  sheet <- "163  probes that define each TC"
#  trainingset <- c("IDHmut-K1","IDHmut-K2")
#  trainingcol <- "Supervised DNA Methylation Cluster"

## -----------------------------------------------------------------------------
data("glioma.gcimp.model")
glioma.gcimp.model

## ---- eval=FALSE, include=TRUE------------------------------------------------
#  sheet <- "914 IDHwildtype tumor specific "
#  trainingset <- grep("wt",unique(metadata$`IDH-specific DNA Methylation Cluster`),value = T))
#  trainingcol <- "IDH-specific DNA Methylation Cluster"

## -----------------------------------------------------------------------------
data("glioma.idhwt.model")
glioma.idhwt.model

## -----------------------------------------------------------------------------
data("probes2rm")
head(probes2rm)

## ---- eval = FALSE------------------------------------------------------------
#  scraplinks <- function(url){
#      # Create an html document from the url
#      webpage <- xml2::read_html(url)
#      # Extract the URLs
#      url_ <- webpage %>%
#          rvest::html_nodes("a") %>%
#          rvest::html_attr("href")
#      # Extract the link text
#      link_ <- webpage %>%
#          rvest::html_nodes("a") %>%
#          rvest::html_text()
#      tb <- tibble::tibble(link = link_, url = url_)
#      tb <- tb %>% dplyr::filter(tb$link == "Download")
#      return(tb)
#  }
#  
#  library(htmltab)
#  library(dplyr)
#  library(tidyr)
#  root <- "http://linkedomics.org"
#  root.download <- file.path(root,"data_download")
#  linkedOmics <- htmltab(paste0(root,"/login.php#dataSource"))
#  linkedOmics <- linkedOmics %>% unite(col = "download_page","Cohort Source","Cancer ID", remove = FALSE,sep = "-")
#  linkedOmics.data <- plyr::adply(linkedOmics$download_page,1,function(project){
#      url <- file.path(root.download,project)
#      tryCatch({
#          tb <- cbind(tibble::tibble(project),htmltab(url),scraplinks(url))
#          tb$Link <- tb$link <- NULL
#          tb$dataset <- gsub(" \\(.*","",tb$`OMICS Dataset`)
#          tb
#      }, error = function(e) {
#          message(e)
#          return(NULL)
#      }
#      )
#  },.progress = "time",.id = NULL)
#  usethis::use_data(linkedOmics.data,internal = FALSE,compress = "xz")

## ----sessionInfo--------------------------------------------------------------
sessionInfo()