## ----style, echo=FALSE, results='hide', warning=FALSE, message=FALSE---------- BiocStyle::markdown() suppressPackageStartupMessages({ library(knitr) library(RAIDS) library(gdsfmt) }) set.seed(121444) ## ----graphMainSteps, echo=FALSE, fig.align="center", fig.cap="An overview of the genetic ancestry inference process.", out.width='130%', results='asis', warning=FALSE, message=FALSE---- knitr::include_graphics("MainSteps_v04.png") ## ----graphWrapper, echo=FALSE, fig.align="center", fig.cap="Final step - The wrapper function encapsulates multiple steps of the workflow.", out.width='120%', results='asis', warning=FALSE, message=FALSE---- knitr::include_graphics("MainSteps_Wrapper_v04.png") ## ----runExomeAncestry, echo=TRUE, eval=TRUE, collapse=FALSE, warning=FALSE, message=FALSE---- ############################################################################# ## Load required packages ############################################################################# library(RAIDS) library(gdsfmt) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata", package="RAIDS") ############################################################################# ## Load the information about the profile ############################################################################# data(demoPedigreeEx1) head(demoPedigreeEx1) ############################################################################# ## The population reference GDS file and SNV Annotation GDS file ## need to be located in the same directory. ## Note that the population reference GDS file used for this example is a ## simplified version and CANNOT be used for any real analysis ############################################################################# pathReference <- file.path(dataDir, "tests") fileGDS <- file.path(pathReference, "ex1_good_small_1KG.gds") fileAnnotGDS <- file.path(pathReference, "ex1_good_small_1KG_Annot.gds") ############################################################################# ## A data frame containing general information about the study ## is also required. The data frame must have ## those 3 columns: "study.id", "study.desc", "study.platform" ############################################################################# studyDF <- data.frame(study.id="MYDATA", study.desc="Description", study.platform="PLATFORM", stringsAsFactors=FALSE) ############################################################################# ## The Sample SNP VCF files (one per sample) need ## to be all located in the same directory. ############################################################################# pathGeno <- file.path(dataDir, "example", "snpPileup") ############################################################################# ## Fix RNG seed to ensure reproducible results ############################################################################# set.seed(3043) ############################################################################# ## Select the profiles from the population reference GDS file for ## the synthetic data. ## Here we select 2 profiles from the simplified 1KG GDS for each ## subcontinental-level. ## Normally, we use 30 profile for each ## subcontinental-level but it is too big for the example. ## The 1KG files in this example only have 6 profiles for each ## subcontinental-level (for demo purpose only). ############################################################################# gds1KG <- snpgdsOpen(fileGDS) dataRef <- select1KGPop(gds1KG, nbProfiles=2L) closefn.gds(gds1KG) ## GenomeInfoDb and BSgenome are required libraries to run this example if (requireNamespace("GenomeInfoDb", quietly=TRUE) && requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { ## Chromosome length information ## chr23 is chrX, chr24 is chrY and chrM is 25 chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] ########################################################################### ## The path where the Sample GDS files (one per sample) ## will be created needs to be specified. ########################################################################### pathProfileGDS <- file.path(tempdir(), "exampleDNA", "out.tmp") ########################################################################### ## The path where the result files will be created needs to ## be specified ########################################################################### pathOut <- file.path(tempdir(), "exampleDNA", "res.out") ## Example can only be run if the current directory is in writing mode if (!dir.exists(file.path(tempdir(), "exampleDNA"))) { dir.create(file.path(tempdir(), "exampleDNA")) dir.create(pathProfileGDS) dir.create(pathOut) ######################################################################### ## The wrapper function generates the synthetic dataset and uses it ## to selected the optimal parameters before calling the genetic ## ancestry on the current profiles. ## All important information, for each step, are saved in ## multiple output files. ## The 'genoSource' parameter has 2 options depending on how the ## SNP files have been generated: ## SNP VCF files have been generated: ## "VCF" or "generic" (other software) ## ######################################################################### runExomeAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF, pathProfileGDS=pathProfileGDS, pathGeno=pathGeno, pathOut=pathOut, fileReferenceGDS=fileGDS, fileReferenceAnnotGDS=fileAnnotGDS, chrInfo=chrInfo, syntheticRefDF=dataRef, genoSource="VCF") list.files(pathOut) list.files(file.path(pathOut, demoPedigreeEx1$Name.ID[1])) ####################################################################### ## The file containing the ancestry inference (SuperPop column) and ## optimal number of PCA component (D column) ## optimal number of neighbours (K column) ####################################################################### resAncestry <- read.csv(file.path(pathOut, paste0(demoPedigreeEx1$Name.ID[1], ".Ancestry.csv"))) print(resAncestry) ## Remove temporary files created for this demo unlink(pathProfileGDS, recursive=TRUE, force=TRUE) unlink(pathOut, recursive=TRUE, force=TRUE) unlink(file.path(tempdir(), "exampleDNA"), recursive=TRUE, force=TRUE) } } ## ----runRNAAncestry, echo=TRUE, eval=TRUE, collapse=FALSE, warning=FALSE, message=FALSE---- ############################################################################# ## Load required packages ############################################################################# library(RAIDS) library(gdsfmt) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata", package="RAIDS") ############################################################################# ## Load the information about the profile ############################################################################# data(demoPedigreeEx1) head(demoPedigreeEx1) ############################################################################# ## The population reference GDS file and SNV Annotation GDS file ## need to be located in the same directory. ## Note that the population reference GDS file used for this example is a ## simplified version and CANNOT be used for any real analysis ############################################################################# pathReference <- file.path(dataDir, "tests") fileGDS <- file.path(pathReference, "ex1_good_small_1KG.gds") fileAnnotGDS <- file.path(pathReference, "ex1_good_small_1KG_Annot.gds") ############################################################################# ## A data frame containing general information about the study ## is also required. The data frame must have ## those 3 columns: "study.id", "study.desc", "study.platform" ############################################################################# studyDF <- data.frame(study.id="MYDATA", study.desc="Description", study.platform="PLATFORM", stringsAsFactors=FALSE) ############################################################################# ## The Sample SNP VCF files (one per sample) need ## to be all located in the same directory. ############################################################################# pathGeno <- file.path(dataDir, "example", "snpPileupRNA") ############################################################################# ## Fix RNG seed to ensure reproducible results ############################################################################# set.seed(3043) ############################################################################# ## Select the profiles from the population reference GDS file for ## the synthetic data. ## Here we select 2 profiles from the simplified 1KG GDS for each ## subcontinental-level. ## Normally, we use 30 profile for each ## subcontinental-level but it is too big for the example. ## The 1KG files in this example only have 6 profiles for each ## subcontinental-level (for demo purpose only). ############################################################################# gds1KG <- snpgdsOpen(fileGDS) dataRef <- select1KGPop(gds1KG, nbProfiles=2L) closefn.gds(gds1KG) ## GenomeInfoDb and BSgenome are required libraries to run this example if (requireNamespace("GenomeInfoDb", quietly=TRUE) && requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { ## Chromosome length information ## chr23 is chrX, chr24 is chrY and chrM is 25 chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] ############################################################################# ## The path where the Sample GDS files (one per sample) ## will be created needs to be specified. ############################################################################# pathProfileGDS <- file.path(tempdir(), "exampleRNA", "outRNA.tmp") ############################################################################# ## The path where the result files will be created needs to ## be specified ############################################################################# pathOut <- file.path(tempdir(), "exampleRNA", "resRNA.out") ## Example can only be run if the current directory is in writing mode if (!dir.exists(file.path(tempdir(), "exampleRNA"))) { dir.create(file.path(tempdir(), "exampleRNA")) dir.create(pathProfileGDS) dir.create(pathOut) ######################################################################### ## The wrapper function generates the synthetic dataset and uses it ## to selected the optimal parameters before calling the genetic ## ancestry on the current profiles. ## All important information, for each step, are saved in ## multiple output files. ## The 'genoSource' parameter has 2 options depending on how the ## SNP files have been generated: ## SNP VCF files have been generated: ## "VCF" or "generic" (other software) ######################################################################### runRNAAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF, pathProfileGDS=pathProfileGDS, pathGeno=pathGeno, pathOut=pathOut, fileReferenceGDS=fileGDS, fileReferenceAnnotGDS=fileAnnotGDS, chrInfo=chrInfo, syntheticRefDF=dataRef, blockTypeID="GeneS.Ensembl.Hsapiens.v86", genoSource="VCF") list.files(pathOut) list.files(file.path(pathOut, demoPedigreeEx1$Name.ID[1])) ######################################################################### ## The file containing the ancestry inference (SuperPop column) and ## optimal number of PCA component (D column) ## optimal number of neighbours (K column) ######################################################################### resAncestry <- read.csv(file.path(pathOut, paste0(demoPedigreeEx1$Name.ID[1], ".Ancestry.csv"))) print(resAncestry) ## Remove temporary files created for this demo unlink(pathProfileGDS, recursive=TRUE, force=TRUE) unlink(pathOut, recursive=TRUE, force=TRUE) unlink(file.path(tempdir(), "example"), recursive=TRUE, force=TRUE) } } ## ----graphStep1, echo=FALSE, fig.align="center", fig.cap="Step 1 - Formatting the information from the population reference dataset (optional)", out.width='120%', results='asis', warning=FALSE, message=FALSE---- knitr::include_graphics("MainSteps_Step1_v04.png") ## ----sessionInfo, echo=FALSE-------------------------------------------------- sessionInfo()