Table 1: The key functions in the SeqArray package.
The default setting for the analysis functions in the SeqArray package is serial implementation, but users can setup a cluster computing environment manually via seqParallelSetup() and distribute the calculations to multiple cores even 100 cluster nodes.
library(SeqArray)## Loading required package: gdsfmt# 1000 Genomes, Phase 1, chromosome 22
(gds.fn <- seqExampleFileName("KG_Phase1"))## [1] "/tmp/RtmpkGyHWJ/Rinst6c674a987c26/SeqArray/extdata/1KG_phase1_release_v3_chr22.gds"# open a GDS file
genofile <- seqOpen(gds.fn)
seqSummary(genofile)## File: /tmp/RtmpkGyHWJ/Rinst6c674a987c26/SeqArray/extdata/1KG_phase1_release_v3_chr22.gds
## Reference: GRCh37
## Ploidy: 2
## Number of samples: 1092
## Number of variants: 19773
## Chromosomes:
##    22  <NA> 
## 19773     0 
## Number of alleles per site:
##     2 
## 19773 
## Annotation, INFO variable(s):
## Annotation, FORMAT variable(s):
## Annotation, sample variable(s):
##  Sample
##  Family.ID
##  Population
##  Population.Description
##  Gender
##  Relationship
##  Unexpected.Parent.Child
##  Non.Paternity
##  Siblings
##  Grandparents
##  Avuncular
##  Half.Siblings
##  Unknown.Second.Order
##  Third.Order
##  Other.Comments# use 2 cores for the demonstration
seqParallelSetup(2)## Enable the computing cluster with 2 forked R processes.# numbers of alleles per site
table(seqNumAllele(genofile))## 
##     2 
## 19773# reference allele frequencies
summary(seqAlleleFreq(genofile, ref.allele=0))##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.9725  0.9963  0.9286  0.9991  1.0000# close the cluster environment
seqParallelSetup(FALSE)## Stop the computing cluster.The SNPRelate package is developed to accelerate two key computations in genome-wide association studies: principal component analysis (PCA) and relatedness analysis using identity-by-descent (IBD) measures. The kernels of SNPRelate are written in C/C++ and have been highly optimized for multi-core symmetric multiprocessing computer architectures. The genotypes in SeqArray format are converted to categorical dosages of reference alleles (0,1,2,NA), which are the data format used in the SNPRelate pacakge.
library(SNPRelate)## SNPRelate -- supported by Streaming SIMD Extensions 2 (SSE2)It is suggested to use a pruned set of SNPs which are in approximate linkage equilibrium with each other to avoid the strong influence of SNP clusters in principal component analysis and relatedness analysis.
set.seed(1000)
# may try different LD thresholds for sensitivity analysis
snpset <- snpgdsLDpruning(genofile, ld.threshold=0.2)## SNP pruning based on LD:
## Excluding 0 SNP on non-autosomes
## Excluding 122 SNPs (monomorphic: TRUE, < MAF: NaN, or > missing rate: NaN)
## Working space: 1092 samples, 19651 SNPs
##  Using 1 (CPU) core
##  Sliding window: 500000 basepairs, Inf SNPs
##  |LD| threshold: 0.2
## Chromosome 22: 37.66%, 7447/19773
## 7447 SNPs are selected in total.names(snpset)## [1] "chr22"head(snpset$chr22)  # snp.id## [1] 1 3 4 6 7 8# get all selected snp id
snpset.id <- unlist(snpset)# Run PCA
pca <- snpgdsPCA(genofile, snp.id=snpset.id, num.thread=2)## Principal Component Analysis (PCA) on SNP genotypes:
## Excluding 12326 SNPs on non-autosomes
## Excluding 0 SNP (monomorphic: TRUE, < MAF: NaN, or > missing rate: NaN)
## Working space: 1092 samples, 7447 SNPs
##  Using 2 (CPU) cores
## PCA: the sum of all working genotypes (0, 1 and 2) = 15598846
## PCA: Sun Nov 22 19:42:21 2015    0%
## PCA: Sun Nov 22 19:42:23 2015    100%
## PCA: Sun Nov 22 19:42:23 2015    Begin (eigenvalues and eigenvectors)
## PCA: Sun Nov 22 19:42:23 2015    End (eigenvalues and eigenvectors)# variance proportion (%)
pc.percent <- pca$varprop*100
head(round(pc.percent, 2))## [1] 1.22 0.84 0.29 0.28 0.27 0.26Population information are available:
pop.code <- factor(seqGetData(genofile, "sample.annotation/Population"))
head(pop.code)## [1] GBR GBR GBR GBR GBR GBR
## Levels: ASW CEU CHB CHS CLM FIN GBR IBS JPT LWK MXL PUR TSI YRIpopgroup <- list(
    EastAsia = c("CHB", "JPT", "CHS", "CDX", "KHV", "CHD"),
    European = c("CEU", "TSI", "GBR", "FIN", "IBS"),
    African  = c("ASW", "ACB", "YRI", "LWK", "GWD", "MSL", "ESN"),
    SouthAmerica = c("MXL", "PUR", "CLM", "PEL"),
    India = c("GIH", "PJL", "BEB", "STU", "ITU"))
colors <- sapply(levels(pop.code), function(x) {
    for (i in 1:length(popgroup)) {
        if (x %in% popgroup[[i]])
            return(names(popgroup)[i])
    }
    NA
    })
colors <- as.factor(colors)
legend.text <- sapply(levels(colors), function(x) paste(levels(pop.code)[colors==x], collapse=","))
legend.text##               African              EastAsia              European 
##         "ASW,LWK,YRI"         "CHB,CHS,JPT" "CEU,FIN,GBR,IBS,TSI" 
##          SouthAmerica 
##         "CLM,MXL,PUR"# make a data.frame
tab <- data.frame(sample.id = pca$sample.id,
    EV1 = pca$eigenvect[,1],    # the first eigenvector
    EV2 = pca$eigenvect[,2],    # the second eigenvector
    Population = pop.code,
    stringsAsFactors = FALSE)
head(tab)##   sample.id         EV1        EV2 Population
## 1   HG00096 -0.01514785 0.03691694        GBR
## 2   HG00097 -0.01236666 0.02596987        GBR
## 3   HG00099 -0.01042858 0.03828143        GBR
## 4   HG00100 -0.01514046 0.02695951        GBR
## 5   HG00101 -0.01188685 0.02858924        GBR
## 6   HG00102 -0.01509719 0.03023233        GBR# draw
plot(tab$EV2, tab$EV1, pch=20, cex=0.75, main="1KG Phase 1, chromosome 22",
    xlab="eigenvector 2", ylab="eigenvector 1", col=colors[tab$Population])
legend("topleft", legend=legend.text, col=1:length(legend.text), pch=19, cex=0.75)For \(n\) study individuals, snpgdsIBS() can be used to create a \(n \times n\) matrix of genome-wide average IBS pairwise identities. To perform cluster analysis on the \(n \times n\) matrix of genome-wide IBS pairwise distances, and determine the groups by a permutation score:
set.seed(1000)
ibs.hc <- snpgdsHCluster(snpgdsIBS(genofile, num.thread=2))## Identity-By-State (IBS) analysis on SNP genotypes:
## Excluding 0 SNP on non-autosomes
## Excluding 122 SNPs (monomorphic: TRUE, < MAF: NaN, or > missing rate: NaN)
## Working space: 1092 samples, 19651 SNPs
##  Using 2 (CPU) cores
## IBS: the sum of all working genotypes (0, 1 and 2) = 39869601
## IBS: Sun Nov 22 19:42:25 2015    0%
## IBS: Sun Nov 22 19:42:28 2015    100%Here is the population information we have known:
# Determine groups of individuals by population information
rv <- snpgdsCutTree(ibs.hc, samp.group=as.factor(colors[pop.code]))## Create 4 groups.plot(rv$dendrogram, leaflab="none", main="1KG Phase 1, chromosome 22",
    edgePar=list(col=rgb(0.5,0.5,0.5,0.75), t.col="black"))
legend("bottomleft", legend=legend.text, col=1:length(legend.text), pch=19, cex=0.75, ncol=4)# close the GDS file
seqClose(genofile)An R/Bioconductor package SeqVarTools is available on Bioconductor, which defines S4 classes and methods for other common operations and analyses on SeqArray datasets.
sessionInfo()## R version 3.2.2 (2015-08-14)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 14.04.3 LTS
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] SNPRelate_1.4.0 SeqArray_1.10.6 gdsfmt_1.6.2   
## 
## loaded via a namespace (and not attached):
##  [1] AnnotationDbi_1.32.1       knitr_1.11                
##  [3] XVector_0.10.0             magrittr_1.5              
##  [5] GenomicAlignments_1.6.1    GenomicRanges_1.22.1      
##  [7] BiocGenerics_0.16.1        zlibbioc_1.16.0           
##  [9] IRanges_2.4.4              BiocParallel_1.4.0        
## [11] BSgenome_1.38.0            stringr_1.0.0             
## [13] GenomeInfoDb_1.6.1         tools_3.2.2               
## [15] SummarizedExperiment_1.0.1 parallel_3.2.2            
## [17] Biobase_2.30.0             DBI_0.3.1                 
## [19] lambda.r_1.1.7             futile.logger_1.4.1       
## [21] htmltools_0.2.6            yaml_2.1.13               
## [23] digest_0.6.8               rtracklayer_1.30.1        
## [25] formatR_1.2.1              futile.options_1.0.0      
## [27] S4Vectors_0.8.3            bitops_1.0-6              
## [29] biomaRt_2.26.1             RCurl_1.95-4.7            
## [31] RSQLite_1.0.0              evaluate_0.8              
## [33] rmarkdown_0.8.1            stringi_1.0-1             
## [35] GenomicFeatures_1.22.5     Biostrings_2.38.2         
## [37] Rsamtools_1.22.0           XML_3.98-1.3              
## [39] stats4_3.2.2               VariantAnnotation_1.16.3