### R code from vignette source 'vignettes/GWASTools/inst/doc/DataCleaning.Rnw' ### Encoding: UTF-8 ################################################### ### code chunk number 1: DataCleaning.Rnw:122-151 ################################################### library(GWASTools) library(GWASdata) # Load the SNP annotation (simple data frame) data(illumina_snp_annot) # Create a SnpAnnotationDataFrame snpAnnot <- SnpAnnotationDataFrame(illumina_snp_annot) # names of columns varLabels(snpAnnot) # data head(pData(snpAnnot)) # Add metadata to describe the columns meta <- varMetadata(snpAnnot) meta[c("snpID", "chromosome", "position", "rsID", "alleleA", "alleleB", "BeadSetID", "IntensityOnly", "tAA", "tAB", "tBB", "rAA", "rAB", "rBB"), "labelDescription"] <- c("unique integer ID for SNPs", paste("integer code for chromosome: 1:22=autosomes,", "23=X, 24=pseudoautosomal, 25=Y, 26=Mitochondrial, 27=Unknown"), "base pair position on chromosome (build 36)", "RS identifier", "alelleA", "alleleB", "BeadSet ID from Illumina", "1=no genotypes were attempted for this assay", "mean theta for AA cluster", "mean theta for AB cluster", "mean theta for BB cluster", "mean R for AA cluster", "mean R for AB cluster", "mean R for BB cluster") varMetadata(snpAnnot) <- meta ################################################### ### code chunk number 2: DataCleaning.Rnw:157-166 ################################################### snpID <- snpAnnot$snpID snpID <- getSnpID(snpAnnot) chrom <- snpAnnot[["chromosome"]] chrom <- getChromosome(snpAnnot) table(chrom) chrom <- getChromosome(snpAnnot, char=TRUE) table(chrom) position <- getPosition(snpAnnot) rsID <- getVariable(snpAnnot, "rsID") ################################################### ### code chunk number 3: DataCleaning.Rnw:189-199 ################################################### tmp <- snpAnnot[,c("snpID", "chromosome", "position")] snp <- getAnnotation(tmp) snp$flag <- sample(c(TRUE, FALSE), nrow(snp), replace=TRUE) pData(tmp) <- snp meta <- getMetadata(tmp) meta["flag", "labelDescription"] <- "flag" varMetadata(tmp) <- meta getVariableNames(tmp) varLabels(tmp)[4] <- "FLAG" rm(tmp) ################################################### ### code chunk number 4: DataCleaning.Rnw:227-254 ################################################### # Load the scan annotation (simple data frame) data(illumina_scan_annot) # Create a ScanAnnotationDataFrame scanAnnot <- ScanAnnotationDataFrame(illumina_scan_annot) # names of columns varLabels(scanAnnot) # data head(pData(scanAnnot)) # Add metadata to describe the columns meta <- varMetadata(scanAnnot) meta[c("scanID", "subjectID", "family", "father", "mother", "CoriellID", "race", "sex", "status", "genoRunID", "plate", "batch", "file"), "labelDescription"] <- c("unique integer ID for scans", "subject identifier (may have multiple scans)", "family identifier", "father identifier as subjectID", "mother identifier as subjectID", "Coriell subject identifier", "HapMap population group", "sex coded as M=male and F=female", "simulated case/control status" , "genotyping instance identifier", "plate containing samples processed together for genotyping chemistry", "simulated genotyping batch", "raw data file") varMetadata(scanAnnot) <- meta ################################################### ### code chunk number 5: DataCleaning.Rnw:261-266 ################################################### scanID <- scanAnnot$scanID scanID <- getScanID(scanAnnot) sex <- scanAnnot[["sex"]] sex <- getSex(scanAnnot) subjectID <- getVariable(scanAnnot, "subjectID") ################################################### ### code chunk number 6: DataCleaning.Rnw:329-339 ################################################### geno.nc.file <- "tmp.geno.nc" # get snp annotation data frame for function snp <- illumina_snp_annot[,c("snpID", "chromosome", "position")] ncdfCreate(ncdf.filename = geno.nc.file, snp.annotation = snp, variables = c("genotype"), array.name = "Human1Mv_C", genome.build = "36", n.samples = 3, precision = "single") ################################################### ### code chunk number 7: DataCleaning.Rnw:348-354 ################################################### # first 3 samples only scan.annotation <- illumina_scan_annot[1:3, c("scanID", "genoRunID", "file")] names(scan.annotation) <- c("scanID", "scanName", "file") # indicate which column of SNP annotation is referenced in data files snp.annotation <- illumina_snp_annot[,c("snpID", "rsID")] names(snp.annotation) <- c("snpID", "snpName") ################################################### ### code chunk number 8: DataCleaning.Rnw:361-366 ################################################### col.nums <- as.integer(c(1,2,12,13)) names(col.nums) <- c("snp", "sample", "a1", "a2") # Define a path to the raw data files which are read by # ncdfAddData to access the raw genotypic data path <- system.file("extdata", "illumina_raw_data", package="GWASdata") ################################################### ### code chunk number 9: DataCleaning.Rnw:375-406 ################################################### diag.geno.file <- "diag.geno.RData" diag.geno <- ncdfAddData(path = path, ncdf.filename = geno.nc.file, snp.annotation = snp.annotation, scan.annotation = scan.annotation, sep.type = ",", skip.num = 11, col.total = 21, col.nums = col.nums, scan.name.in.file = 1, scan.start.index = 1, diagnostics.filename = diag.geno.file, verbose = FALSE) # Look at the values included in the "diag.geno" object which holds # all output from the function call names(diag.geno) # `read.file' is a vector indicating whether (1) or not (0) each file # specified in the `files' argument was read successfully table(diag.geno$read.file) # `row.num' is a vector of the number of rows read from each file table(diag.geno$row.num) # `sample.match' is a vector indicating whether (1) or not (0) # the sample name inside the raw text file matches that in the # sample annotation data.frame table(diag.geno$sample.match) # `snp.chk' is a vector indicating whether (1) or not (0) # the raw text file has the expected set of SNP names table(diag.geno$snp.chk) # `chk' is a vector indicating whether (1) or not (0) all previous # checks were successful and the data were written to the NetCDF file table(diag.geno$chk) ################################################### ### code chunk number 10: DataCleaning.Rnw:412-435 ################################################### check.geno.file <- "check.geno.RData" check.geno <- ncdfCheckGenotype(path = path, ncdf.filename = geno.nc.file, snp.annotation = snp.annotation, scan.annotation = scan.annotation, sep.type = ",", skip.num = 11, col.total = 21, col.nums = col.nums, scan.name.in.file = 1, check.scan.index = 1:3, n.scans.loaded = 3, diagnostics.filename = check.geno.file, verbose = FALSE) # Look at the values included in the "check.geno" object which holds # all output from the function call names(check.geno) # 'snp.order' is a vector indicating whether (1) or not (0) the snp ids # are in the same order in each file. table(check.geno$snp.order) # 'geno.chk' is a vector indicating whether (1) or not (0) the genotypes # in the netCDF match the text file table(check.geno$geno.chk) ################################################### ### code chunk number 11: DataCleaning.Rnw:445-455 ################################################### (nc <- NcdfGenotypeReader(geno.nc.file)) nscan(nc) nsnp(nc) head(getScanID(nc)) head(getSnpID(nc)) head(getChromosome(nc)) head(getPosition(nc)) # genotypes for the first 3 samples and the first 5 SNPs getGenotype(nc, snp=c(1,5), scan=c(1,3)) close(nc) ################################################### ### code chunk number 12: DataCleaning.Rnw:484-494 ################################################### qxy.nc.file <- "tmp.qxy.nc" # get snp annotation data frame for function snp <- illumina_snp_annot[,c("snpID", "chromosome", "position")] ncdfCreate(ncdf.filename = qxy.nc.file, snp.annotation = snp, variables = c("quality","X","Y"), array.name = "Human1Mv_C", genome.build = "36", n.samples = 3, precision = "single") ################################################### ### code chunk number 13: DataCleaning.Rnw:501-509 ################################################### # first 3 samples only scan.annotation <- illumina_scan_annot[1:3, c("scanID", "genoRunID", "file")] names(scan.annotation) <- c("scanID", "scanName", "file") # indicate which column of SNP annotation is referenced in data files snp.annotation <- illumina_snp_annot[,c("snpID", "rsID")] names(snp.annotation) <- c("snpID", "snpName") col.nums <- as.integer(c(1,2,5,16,17)) names(col.nums) <- c("snp", "sample", "qs", "x", "y") ################################################### ### code chunk number 14: DataCleaning.Rnw:517-530 ################################################### diag.qxy.file <- "diag.qxy.RData" diag.qxy <- ncdfAddData(path = path, ncdf.filename = qxy.nc.file, snp.annotation = snp.annotation, scan.annotation = scan.annotation, sep.type = ",", skip.num = 11, col.total = 21, col.nums = col.nums, scan.name.in.file = 1, scan.start.index = 1, diagnostics.filename = diag.qxy.file, verbose = FALSE) ################################################### ### code chunk number 15: DataCleaning.Rnw:536-554 ################################################### scan.annotation <- illumina_scan_annot[1:5, c("scanID", "genoRunID", "file")] names(scan.annotation) <- c("scanID", "scanName", "file") check.qxy.file <- "check.qxy.RData" check.qxy <- ncdfCheckIntensity(path = path, intenpath = path, ncdf.filename = qxy.nc.file, snp.annotation = snp.annotation, scan.annotation = scan.annotation, sep.type = ",", skip.num = 11, col.total = 21, col.nums = col.nums, scan.name.in.file = 1, check.scan.index = 1:3, n.scans.loaded = 3, diagnostics.filename = check.qxy.file, verbose = FALSE) ################################################### ### code chunk number 16: DataCleaning.Rnw:563-577 ################################################### (nc <- NcdfIntensityReader(qxy.nc.file)) nscan(nc) nsnp(nc) head(getScanID(nc)) head(getSnpID(nc)) head(getChromosome(nc)) head(getPosition(nc)) # quality score for the first 3 samples and the first 5 SNPs getQuality(nc, snp=c(1,5), scan=c(1,3)) # X intensity for the first 3 samples and the first 5 SNPs getX(nc, snp=c(1,5), scan=c(1,3)) # Y intensity for the first 3 samples and the first 5 SNPs getY(nc, snp=c(1,5), scan=c(1,3)) close(nc) ################################################### ### code chunk number 17: DataCleaning.Rnw:657-667 ################################################### bl.nc.file <- "tmp.bl.nc" # get snp annotation data frame for function snp <- illumina_snp_annot[,c("snpID", "chromosome", "position")] ncdfCreate(ncdf.filename = bl.nc.file, snp.annotation = snp, variables = c("BAlleleFreq","LogRRatio"), array.name = "Human1Mv_C", genome.build = "36", n.samples = 3, precision = "single") ################################################### ### code chunk number 18: DataCleaning.Rnw:673-695 ################################################### # first 3 samples only scan.annotation <- illumina_scan_annot[1:3, c("scanID", "genoRunID", "file")] names(scan.annotation) <- c("scanID", "scanName", "file") # indicate which column of SNP annotation is referenced in data files snp.annotation <- illumina_snp_annot[,c("snpID", "rsID")] names(snp.annotation) <- c("snpID", "snpName") col.nums <- as.integer(c(1,2,20,21)) names(col.nums) <- c("snp", "sample", "ballelefreq", "logrratio") diag.bl.file <- "diag.bl.RData" diag.bl <- ncdfAddData(path = path, ncdf.filename = bl.nc.file, snp.annotation = snp.annotation, scan.annotation = scan.annotation, sep.type = ",", skip.num = 11, col.total = 21, col.nums = col.nums, scan.name.in.file = 1, scan.start.index = 1, diagnostics.filename = diag.bl.file, verbose = FALSE) ################################################### ### code chunk number 19: DataCleaning.Rnw:702-706 ################################################### (nc <- NcdfIntensityReader(bl.nc.file)) getBAlleleFreq(nc, snp=c(1,5), scan=c(1,3)) getLogRRatio(nc, snp=c(1,5), scan=c(1,3)) close(nc) ################################################### ### code chunk number 20: DataCleaning.Rnw:710-713 ################################################### file.remove(geno.nc.file, qxy.nc.file, bl.nc.file) file.remove(diag.geno.file, diag.qxy.file, diag.bl.file) file.remove(check.geno.file, check.qxy.file) ################################################### ### code chunk number 21: DataCleaning.Rnw:729-738 ################################################### genofile <- system.file("extdata", "illumina_geno.nc", package="GWASdata") nc <- NcdfGenotypeReader(genofile) # only NetCDF file genoData <- GenotypeData(nc) # with scan annotation genoData <- GenotypeData(nc, scanAnnot=scanAnnot) # with scan and SNP annotation genoData <- GenotypeData(nc, snpAnnot=snpAnnot, scanAnnot=scanAnnot) genoData ################################################### ### code chunk number 22: DataCleaning.Rnw:745-764 ################################################### nsnp(genoData) nscan(genoData) # scan annotation range(getScanID(genoData)) hasSex(genoData) table(getSex(genoData)) hasScanVariable(genoData, "subjectID") head(getScanVariable(genoData, "subjectID")) getScanVariableNames(genoData) # snp annotation range(getSnpID(genoData)) table(getChromosome(genoData, char=TRUE)) head(getPosition(genoData)) hasSnpVariable(genoData, "rsID") head(getSnpVariable(genoData, "rsID")) getSnpVariableNames(genoData) # genotypes getGenotype(genoData, snp=c(1,5), scan=c(1,5)) close(genoData) ################################################### ### code chunk number 23: DataCleaning.Rnw:771-790 ################################################### # quality score, X and X intensity qxyfile <- system.file("extdata", "illumina_qxy.nc", package="GWASdata") nc <- NcdfIntensityReader(qxyfile) qxyData <- IntensityData(NcdfIntensityReader(qxyfile), snpAnnot=snpAnnot, scanAnnot=scanAnnot) qxyData getQuality(qxyData, snp=c(1,5), scan=c(1,5)) getX(qxyData, snp=c(1,5), scan=c(1,5)) getY(qxyData, snp=c(1,5), scan=c(1,5)) close(qxyData) # BAF/LRR blfile <- system.file("extdata", "illumina_bl.nc", package="GWASdata") nc <- NcdfIntensityReader(blfile) blData <- IntensityData(NcdfIntensityReader(blfile), snpAnnot=snpAnnot, scanAnnot=scanAnnot) blData getBAlleleFreq(blData, snp=c(1,5), scan=c(1,5)) getLogRRatio(blData, snp=c(1,5), scan=c(1,5)) close(blData) ################################################### ### code chunk number 24: DataCleaning.Rnw:837-850 ################################################### # open the netCDF file and create a GenotypeData object ncfile <- system.file("extdata", "illumina_geno.nc", package="GWASdata") nc <- NcdfGenotypeReader(ncfile) # sex is required for this function, so we need the scan annotation genoData <- GenotypeData(nc, scanAnnot=scanAnnot) # Calculate the number of missing calls for each snp over all samples # for each sex separately miss <- missingGenotypeBySnpSex(genoData) # Examine the results names(miss) head(miss$missing.counts) miss$scans.per.sex head(miss$missing.fraction) ################################################### ### code chunk number 25: DataCleaning.Rnw:859-866 ################################################### # Make sure ordering matches snp annotation allequal(snpAnnot$snpID, as.numeric(names(miss$missing.fraction))) snpAnnot$missing.n1 <- miss$missing.fraction varMetadata(snpAnnot)["missing.n1", "labelDescription"] <- paste( "fraction of genotype calls missing over all samples", "except that females are excluded for Y chr SNPs") summary(snpAnnot$missing.n1) ################################################### ### code chunk number 26: DataCleaning.Rnw:873-876 ################################################### hist(snpAnnot$missing.n1, ylim=c(0,100), xlab="SNP missing call rate", main="Missing Call Rate for All Probes") ################################################### ### code chunk number 27: DataCleaning.Rnw:879-890 ################################################### # Find the number of SNPs with every call missing length(snpAnnot$missing.n1[snpAnnot$missing.n1 == 1]) # Fraction of autosomal SNPs with missing call rate < 0.05 x <- snpAnnot$missing.n1[snpAnnot$chromosome < 23] length(x[x < 0.05]) / length(x) # Fraction of X chromosome SNPs with missing call rate < 0.05 x <- snpAnnot$missing.n1[snpAnnot$chromosome == 23] length(x[x < 0.05]) / length(x) # Fraction of Y chromosome SNPs with missing call rate < 0.05 x <- snpAnnot$missing.n1[snpAnnot$chromosome == 25] length(x[x < 0.05]) / length(x) ################################################### ### code chunk number 28: DataCleaning.Rnw:908-922 ################################################### # Want to exclude all SNP probes with 100% missing call rate # Check on how many SNPs to exclude sum(snpAnnot$missing.n1 == 1) # Create a variable that contains the IDs of these SNPs to exclude snpexcl <- snpAnnot$snpID[snpAnnot$missing.n1 == 1] length(snpexcl) # Calculate the missing call rate per sample miss <- missingGenotypeByScanChrom(genoData, snp.exclude=snpexcl) names(miss) head(miss$missing.counts) head(miss$snps.per.chr) # Check to make sure that the correct number of SNPs were excluded sum(miss$snps.per.chr) nrow(snpAnnot) - sum(miss$snps.per.chr) ################################################### ### code chunk number 29: DataCleaning.Rnw:929-938 ################################################### head(miss$missing.fraction) # Check the ordering matches the sample annotation file allequal(names(miss$missing.fraction), scanAnnot$scanID) # Add the missing call rates vector to the sample annotation file scanAnnot$missing.e1 <- miss$missing.fraction varMetadata(scanAnnot)["missing.e1", "labelDescription"] <- paste( "fraction of genotype calls missing over all snps with missing.n1<1", "except that Y chr SNPs are excluded for females") summary(scanAnnot$missing.e1) ################################################### ### code chunk number 30: DataCleaning.Rnw:950-953 ################################################### hist(scanAnnot$missing.e1, xlab="Fraction of missing calls over all probes", main="Histogram of Sample Missing Call Rate for all Samples") ################################################### ### code chunk number 31: DataCleaning.Rnw:956-962 ################################################### # Look at missing.e1 for males summary(scanAnnot$missing.e1[scanAnnot$sex == "M"]) # Look at missing.e1 for females summary(scanAnnot$missing.e1[scanAnnot$sex == "F"]) # Number of samples with missing call rate > 5% sum(scanAnnot$missing.e1 > 0.05) ################################################### ### code chunk number 32: DataCleaning.Rnw:972-994 ################################################### auto <- colnames(miss$missing.counts) %in% 1:22 missa <- rowSums(miss$missing.counts[,auto]) / sum(miss$snps.per.chr[auto]) summary(missa) missx <- miss$missing.counts[,"X"] / miss$snps.per.chr["X"] summary(missx) # check they match sample annotation file allequal(names(missa), scanAnnot$scanID) allequal(names(missx), scanAnnot$scanID) # Add these separate sample missing call rates to the sample # annotation scanAnnot$miss.e1.auto <- missa scanAnnot$miss.e1.xchr <- missx # Order scanAnnot by missing.e1 so duplicate subjectIDs # with a higher missing rate are marked as duplicates scanAnnot <- scanAnnot[order(scanAnnot$subjectID, scanAnnot$missing.e1),] scanAnnot$duplicated <- duplicated(scanAnnot$subjectID) table(scanAnnot$duplicated, useNA="ifany") # Put scanAnnot back in scanID order; this is very important!! scanAnnot <- scanAnnot[order(scanAnnot$scanID),] allequal(scanAnnot$scanID, sort(scanAnnot$scanID)) varMetadata(scanAnnot)["duplicated", "labelDescription"] <- "TRUE for duplicate scan with higher missing.e1" ################################################### ### code chunk number 33: DataCleaning.Rnw:1011-1021 ################################################### # Find the samples with missing.e1 > .05 and make a vector of # scanID to exclude from the calculation scan.exclude <- scanAnnot$scanID[scanAnnot$missing.e1 > 0.05] # Call missingGenotypeBySnpSex and save the output miss <- missingGenotypeBySnpSex(genoData, scan.exclude=scan.exclude) snpAnnot$missing.n2 <- miss$missing.fraction varMetadata(snpAnnot)["missing.n2", "labelDescription"] <- paste( "fraction of genotype calls missing over all samples with missing.e1<0.05", "except that females are excluded for Y chr SNPs") summary(snpAnnot$missing.n2) ################################################### ### code chunk number 34: DataCleaning.Rnw:1031-1041 ################################################### # Create a vector of the SNPs to exclude. snpexcl <- snpAnnot$snpID[snpAnnot$missing.n2 >= 0.05] length(snpexcl) miss <- missingGenotypeByScanChrom(genoData, snp.exclude=snpexcl) # Add the missing call rates vector to the sample annotation file scanAnnot$missing.e2 <- miss$missing.fraction varMetadata(scanAnnot)["missing.e2", "labelDescription"] <- paste( "fraction of genotype calls missing over all snps with missing.n2<0.05", "except that Y chr SNPs are excluded for females") summary(scanAnnot$missing.e2) ################################################### ### code chunk number 35: DataCleaning.Rnw:1047-1050 ################################################### hist(scanAnnot$missing.e2, xlab="Fraction of missing calls over all probes with missing call rate < 0.05", main="Histogram of Sample Missing Call Rate for all Samples") ################################################### ### code chunk number 36: DataCleaning.Rnw:1063-1067 ################################################### varLabels(scanAnnot) # Check how many batches exist and how many samples are in each batch length(unique(scanAnnot$batch)) table(scanAnnot$batch, useNA="ifany") ################################################### ### code chunk number 37: DataCleaning.Rnw:1070-1074 ################################################### # Plot the distribution of the number of samples per batch. barplot(table(scanAnnot$batch), ylab="Number of Samples", xlab="Batch", main="Distribution of Samples per Batch") ################################################### ### code chunk number 38: DataCleaning.Rnw:1077-1086 ################################################### # Examine the mean missing call rate per batch for all SNPs batches <- unique(scanAnnot$batch) bmiss <- rep(NA,length(batches)); names(bmiss) <- batches bn <- rep(NA,length(batches)); names(bn) <- batches for(i in 1:length(batches)) { x <- scanAnnot$missing.e1[is.element(scanAnnot$batch, batches[i])] bmiss[i] <- mean(x) bn[i] <- length(x) } ################################################### ### code chunk number 39: DataCleaning.Rnw:1097-1099 ################################################### y <- lm(bmiss ~ bn) anova(y) ################################################### ### code chunk number 40: DataCleaning.Rnw:1102-1106 ################################################### plot(bn, bmiss, xlab="Number of samples per batch", ylab="Mean missing call rate", main="Mean Missing Call Rate vs\nSamples per Batch") abline(y$coefficients) ################################################### ### code chunk number 41: DataCleaning.Rnw:1127-1135 ################################################### res <- batchChisqTest(genoData, batchVar="batch", return.by.snp=TRUE) close(genoData) # chi-square values for each SNP dim(res$chisq) # genomic inflation factor res$lambda # average chi-square test statistic for each of the batches res$mean.chisq ################################################### ### code chunk number 42: DataCleaning.Rnw:1140-1158 ################################################### x <- table(scanAnnot$race, useNA="ifany") x x[1] / sum(x) x[2] / sum(x) x <- table(scanAnnot$race, scanAnnot$batch) x # Run an approximate chi-square test to see if there are ethnic effects chisq <- chisq.test(x) chisq$p.value # Calculate the fraction of samples in each batch that are CEU batches <- unique(scanAnnot$batch) eth <- rep(NA,length(batches)); names(eth) <- sort(batches) for(i in 1:length(batches)){ x <- scanAnnot$race[is.element(scanAnnot$batch, batches[i])] xl <- length(x[x == "CEU"]) eth[i] <- xl / length(x) } allequal(names(eth), names(res$mean.chisq)) ################################################### ### code chunk number 43: DataCleaning.Rnw:1161-1168 ################################################### # Plot the average Chi-Square test statistic against the # fraction of samples that are CEU plot(eth, res$mean.chisq, xlab="Fraction of CEU Samples per Batch", ylab="Average Chi-square Test Statistic", main="Fraction of CEU Samples per Batch vs Average Chi-square Test Statistic") abline(v=mean(eth), lty=2, col="red") ################################################### ### code chunk number 44: DataCleaning.Rnw:1199-1207 ################################################### qxyfile <- system.file("extdata", "illumina_qxy.nc", package="GWASdata") qualNC <- NcdfIntensityReader(qxyfile) qualData <- IntensityData(qualNC, scanAnnot=scanAnnot) genofile <- system.file("extdata", "illumina_geno.nc", package="GWASdata") genoNC <- NcdfGenotypeReader(genofile) genoData <- GenotypeData(genoNC, scanAnnot=scanAnnot) qual.results <- qualityScoreByScan(qualData, genoData) close(qualData) ################################################### ### code chunk number 45: DataCleaning.Rnw:1214-1216 ################################################### hist(qual.results[,"median.quality"], main="Median Genotype Quality Scores of Samples", xlab="Median Quality") ################################################### ### code chunk number 46: DataCleaning.Rnw:1247-1254 ################################################### blfile <- system.file("extdata", "illumina_bl.nc", package="GWASdata") blNC <- NcdfIntensityReader(blfile) blData <- IntensityData(blNC, scanAnnot=scanAnnot) nbins <- rep(12, 3) slidingBAF12 <- sdByScanChromWindow(blData, genoData, nbins=nbins) names(slidingBAF12) dim(slidingBAF12[["21"]]) ################################################### ### code chunk number 47: DataCleaning.Rnw:1262-1265 ################################################### sds.chr <- meanSdByChromWindow(slidingBAF12, scanAnnot$sex) sds.chr[["21"]] sds.chr[["X"]] ################################################### ### code chunk number 48: DataCleaning.Rnw:1271-1275 ################################################### res12bin4sd <- findBAFvariance(sds.chr, slidingBAF12, scanAnnot$sex, sd.threshold=4) head(res12bin4sd) table(res12bin4sd[, "chromosome"]) ################################################### ### code chunk number 49: DataCleaning.Rnw:1282-1287 ################################################### scanID <- as.integer(res12bin4sd[, "scanID"]) chrom <- as.integer(res12bin4sd[, "chromosome"]) chrom[res12bin4sd[, "chromosome"] == "X"] <- 23 bincode <- paste("Bin", res12bin4sd[, "bin"], sep = " ") chromIntensityPlot(blData, scanID, chrom, info=bincode, ideogram=FALSE) ################################################### ### code chunk number 50: DataCleaning.Rnw:1290-1291 ################################################### close(blData) ################################################### ### code chunk number 51: DataCleaning.Rnw:1317-1321 ################################################### miss <- missingGenotypeByScanChrom(genoData) miss.rate <- t(apply(miss$missing.counts, 1, function(x) { x / miss$snps.per.chr})) miss.rate <- as.data.frame(miss.rate) ################################################### ### code chunk number 52: DataCleaning.Rnw:1325-1328 ################################################### cols <- names(miss.rate) %in% c(1:22, "X", "XY") boxplot(miss.rate[,cols], main="Missingness by Chromosome", ylab="Proportion Missing", xlab="Chromosome") ################################################### ### code chunk number 53: DataCleaning.Rnw:1331-1334 ################################################### boxplot(miss.rate$X ~ scanAnnot$sex, main="X Chromosome Missingness by Sex", ylab="Proportion Missing") ################################################### ### code chunk number 54: DataCleaning.Rnw:1341-1353 ################################################### # Calculate heterozygosity by scan by chromosome het.results <- hetByScanChrom(genoData) close(genoData) # Ensure heterozygosity results are ordered correctly allequal(scanAnnot$scanID, rownames(het.results)) # Write autosomal and X chr heterozygosity to sample annot scanAnnot$het.A <- het.results[,"A"] scanAnnot$het.X <- het.results[,"X"] varMetadata(scanAnnot)["het.A", "labelDescription"] <- "fraction of heterozygotes for autosomal SNPs" varMetadata(scanAnnot)["het.X", "labelDescription"] <- "fraction of heterozygotes for X chromosome SNPs" ################################################### ### code chunk number 55: DataCleaning.Rnw:1361-1363 ################################################### boxplot(scanAnnot$het.A ~ scanAnnot$race, main="Autosomal Heterozygosity") ################################################### ### code chunk number 56: DataCleaning.Rnw:1366-1369 ################################################### female <- scanAnnot$sex == "F" boxplot(scanAnnot$het.X[female] ~ scanAnnot$race[female], main="X Chromosome Heterozygosity in Females") ################################################### ### code chunk number 57: DataCleaning.Rnw:1406-1411 ################################################### qxyfile <- system.file("extdata", "illumina_qxy.nc", package="GWASdata") intenNC <- NcdfIntensityReader(qxyfile) inten.by.chrom <- meanIntensityByScanChrom(intenNC) close(intenNC) names(inten.by.chrom) ################################################### ### code chunk number 58: DataCleaning.Rnw:1419-1429 ################################################### mninten <- inten.by.chrom[[1]] # mean intensities dim(mninten) # Check to be sure sample ordering is consistent allequal(scanAnnot$scanID, rownames(mninten)) # Assign each sex a color xcol <- rep(NA, nrow(scanAnnot)) xcol[scanAnnot$sex == "M"] <- "blue" xcol[scanAnnot$sex == "F"] <- "red" nx <- sum(snpAnnot$chromosome == 23) ny <- sum(snpAnnot$chromosome == 25) ################################################### ### code chunk number 59: DataCleaning.Rnw:1439-1471 ################################################### #All intensities x1 <-mninten[,"X"]; y1 <- mninten[,"Y"] main1 <- "Mean X vs \nMean Y Chromosome Intensity" #Het on X vs X intensity x2 <- mninten[,"X"]; y2 <- scanAnnot$het.X main2 <- "Mean X Chromosome Intensity vs Mean X Chromosome Heterozygosity" # Het on X vs Y intensity y3 <- mninten[,"Y"]; x3 <- scanAnnot$het.X main3 <- "Mean X Chromosome Heterozygosity vs Mean Y Chromosome Intensity" # X vs A het x4 <- scanAnnot$het.A[scanAnnot$sex == "F"] y4 <- scanAnnot$het.X[scanAnnot$sex == "F"] main4 <- "Mean Autosomal Heterozygosity vs Mean X Chromosome Heterozygosity" cols <- c("blue","red") mf <- c("male", "female") xintenlab <- paste("X intensity (n=", nx, ")", sep="") yintenlab <- paste("Y intensity (n=", ny, ")", sep="") pdf("DataCleaning-sex.pdf") par(mfrow=c(2,2)) plot(x1, y1, xlab=xintenlab, ylab=yintenlab, main=main1, col=xcol, cex.main=0.8) legend("topright",mf,col=cols,pch=c(1,1)) plot(x2, y2, col=xcol, xlab=xintenlab, ylab="X heterozygosity", main=main2, cex.main=0.8) plot(x3, y3, col=xcol, ylab=yintenlab, xlab="X heterozygosity", main=main3, cex.main=0.8) plot(x4,y4, col="red", xlab="Autosomal heterozygosity", ylab="X heterozygosity", main=main4, cex.main=0.8) dev.off() ################################################### ### code chunk number 60: DataCleaning.Rnw:1504-1507 ################################################### ncfile <- system.file("extdata", "illumina_geno.nc", package="GWASdata") gdsfile <- "tmp_geno.gds" convertNcdfGds(ncfile, gdsfile) ################################################### ### code chunk number 61: DataCleaning.Rnw:1515-1519 ################################################### snp.auto <- pData(snpAnnot)[snpAnnot$chromosome < 23, ] snp.auto.nonmiss <- snp.auto[snp.auto$missing.n1 < 0.05, ] snp.sel <- snp.auto.nonmiss$snpID length(snp.sel) ################################################### ### code chunk number 62: DataCleaning.Rnw:1528-1537 ################################################### sample.sel <- scanAnnot$scanID[scanAnnot$race == "CEU"] length(sample.sel) library(SNPRelate) gdsobj <- openfn.gds(gdsfile) ibdobj <- snpgdsIBDMLE(gdsobj, sample.id=sample.sel, snp.id=snp.sel, method="EM") closefn.gds(gdsobj) names(ibdobj) dim(ibdobj$k0) ibdobj$k0[1:5,1:5] ################################################### ### code chunk number 63: DataCleaning.Rnw:1548-1554 ################################################### ped <- pData(scanAnnot)[scanAnnot$race == "CEU", c("family", "subjectID", "father", "mother", "sex")] dim(ped) names(ped) <- c("family", "individ", "father", "mother", "sex") ped[1:5, ] (chk <- pedigreeCheck(ped)) ################################################### ### code chunk number 64: DataCleaning.Rnw:1560-1563 ################################################### dups <- chk$duplicates uni.ped <- pedigreeDeleteDuplicates(ped, dups) (chk <- pedigreeCheck(uni.ped)) ################################################### ### code chunk number 65: DataCleaning.Rnw:1568-1574 ################################################### ni <- chk$parent.no.individ.entry parent <- data.frame(family=ni$family, individ=ni$parentID, father=0, mother=0, sex="F", stringsAsFactors=FALSE) ped.complete <- rbind(uni.ped, parent) (chk <- pedigreeCheck(ped.complete)) ################################################### ### code chunk number 66: DataCleaning.Rnw:1580-1587 ################################################### subf <- chk$subfamilies.ident table(subf$family) subf.ids <- subf$individ[subf$subfamily == 2] newfam <- ped.complete$individ %in% subf.ids ped.complete$family[newfam] <- ped.complete$family[newfam] + 10000 table(ped.complete$family) pedigreeCheck(ped.complete) ################################################### ### code chunk number 67: DataCleaning.Rnw:1597-1602 ################################################### rels <- pedigreePairwiseRelatedness(ped.complete) length(rels$inbred.fam) relprs <- rels$relativeprs relprs[1:5,] table(relprs$relation) ################################################### ### code chunk number 68: DataCleaning.Rnw:1612-1634 ################################################### samp <- pData(scanAnnot)[,c("scanID", "subjectID")] samp <- samp[match(ibdobj$sample.id, samp$scanID),] names(samp) <- c("scanID", "Individ") ## snpgdsIBDSelection expects character sample.id ibdobj$sample.id <- as.character(ibdobj$sample.id) ibd <- snpgdsIBDSelection(ibdobj, kinship.cutoff=1/32) ibd <- merge(ibd, samp, by.x="ID1", by.y="scanID") ibd <- merge(ibd, samp, by.x="ID2", by.y="scanID", suffixes=c("1","2")) ibd$ii <- paste(pmin(ibd$Individ1, ibd$Individ2), pmax(ibd$Individ1, ibd$Individ2)) relprs$ii <- paste(pmin(relprs$Individ1, relprs$Individ2), pmax(relprs$Individ1, relprs$Individ2)) ibd <- merge(ibd, relprs[,c("ii","relation")], all.x=TRUE) names(ibd)[names(ibd) == "relation"] <- "exp.rel" ibd$exp.rel[ibd$Individ1 == ibd$Individ2] <- "Dup" ibd$exp.rel[is.na(ibd$exp.rel)] <- "U" table(ibd$exp.rel, useNA="ifany") # assign observed relationships ibd$obs.rel <- ibdAssignRelatedness(ibd$k0, ibd$k1) table(ibd$obs.rel, useNA="ifany") ################################################### ### code chunk number 69: DataCleaning.Rnw:1641-1643 ################################################### ibdPlot(ibd$k0, ibd$k1, relation=ibd$exp.rel, main="HapMap CEU IBD Estimates \nEM Algorithm") ################################################### ### code chunk number 70: DataCleaning.Rnw:1662-1691 ################################################### # Allele frequency calculation in YRI samples # open the NetCDF file and create a GenotypeData object nc <- NcdfGenotypeReader(ncfile) genoData <- GenotypeData(nc, snpAnnot=snpAnnot, scanAnnot=scanAnnot) # sample selection - unduplicated YRI samples sample.excl <- scanAnnot$scanID[scanAnnot$race != "YRI" | scanAnnot$duplicated] length(sample.excl) afreq <- alleleFrequency(genoData, scan.exclude=sample.excl) close(genoData) # Add allele frequency to SNP annotation afreq <- afreq[,"all"] # males and females combined snpAnnot$A.freq.yri <- afreq varMetadata(snpAnnot)["A.freq.yri", "labelDescription"] <- "frequency of allele A in YRI subjects" # Initial selection: autosomal SNPs with low missing call rate and MAF>0 init.sel <- snpAnnot$missing.n1 < 0.05 & snpAnnot$chromosome < 23 & afreq > 0 & afreq < 1 sum(init.sel) # Call the function to select SNPs no closer than 15000 bases set.seed(1001) n <- 15000 rsnp2 <- apartSnpSelection(snpAnnot$chromosome, snpAnnot$position, min.dist=n, init.sel=init.sel) sum(rsnp2) # Add the SNP selection to the SNP annotation file snpAnnot$r15kb <- rsnp2 varMetadata(snpAnnot)["r15kb", "labelDescription"] <- paste( "Logical indicator for autosomal SNPs with missing.n1<0.05", "and YRI MAF>0, at least 15 kb apart") ################################################### ### code chunk number 71: DataCleaning.Rnw:1697-1704 ################################################### sample.sel <- scanAnnot$scanID[scanAnnot$race == "YRI"] snp.sel <- snpAnnot$snpID[snpAnnot$r15kb] gdsobj <- openfn.gds(gdsfile) ibdobj <- snpgdsIBDMoM(gdsobj, sample.id=sample.sel, snp.id=snp.sel) closefn.gds(gdsobj) dim(ibdobj$k0) ibdobj$k0[1:5,1:5] ################################################### ### code chunk number 72: DataCleaning.Rnw:1714-1720 ################################################### ped <- pData(scanAnnot)[scanAnnot$race == "YRI", c("family", "subjectID", "father", "mother", "sex")] dim(ped) names(ped) <- c("family", "individ", "father", "mother", "sex") ped[1:5, ] (chk <- pedigreeCheck(ped)) ################################################### ### code chunk number 73: DataCleaning.Rnw:1727-1730 ################################################### dups <- chk$duplicates uni.ped <- pedigreeDeleteDuplicates(ped, dups) (chk <- pedigreeCheck(uni.ped)) ################################################### ### code chunk number 74: DataCleaning.Rnw:1736-1740 ################################################### subf <- chk$subfamilies.ident uni.ped <- uni.ped[!(uni.ped$family %in% subf$family),] table(uni.ped$family) pedigreeCheck(uni.ped) ################################################### ### code chunk number 75: DataCleaning.Rnw:1746-1774 ################################################### rels <- pedigreePairwiseRelatedness(uni.ped) length(rels$inbred.fam) relprs <- rels$relativeprs relprs[1:5,] table(relprs$relation) samp <- pData(scanAnnot)[,c("scanID", "subjectID")] samp <- samp[match(ibdobj$sample.id, samp$scanID),] names(samp) <- c("scanID", "Individ") ## snpgdsIBDSelection expects character sample.id ibdobj$sample.id <- as.character(ibdobj$sample.id) ibd <- snpgdsIBDSelection(ibdobj, kinship.cutoff=1/32) ibd <- merge(ibd, samp, by.x="ID1", by.y="scanID") ibd <- merge(ibd, samp, by.x="ID2", by.y="scanID", suffixes=c("1","2")) ibd$ii <- paste(pmin(ibd$Individ1, ibd$Individ2), pmax(ibd$Individ1, ibd$Individ2)) relprs$ii <- paste(pmin(relprs$Individ1, relprs$Individ2), pmax(relprs$Individ1, relprs$Individ2)) ibd <- merge(ibd, relprs[,c("ii","relation")], all.x=TRUE) names(ibd)[names(ibd) == "relation"] <- "exp.rel" ibd$exp.rel[ibd$Individ1 == ibd$Individ2] <- "Dup" ibd$exp.rel[is.na(ibd$exp.rel)] <- "U" table(ibd$exp.rel, useNA="ifany") # assign observed relationships ibd$obs.rel <- ibdAssignRelatedness(ibd$k0, ibd$k1) table(ibd$obs.rel, useNA="ifany") ################################################### ### code chunk number 76: DataCleaning.Rnw:1783-1785 ################################################### ibdPlot(ibd$k0, ibd$k1, relation=ibd$exp.rel, main="HapMap YRI IBD Estimates \nPLINK MOM Approach") ################################################### ### code chunk number 77: DataCleaning.Rnw:1813-1836 ################################################### filt <- get(data(pcaSnpFilters.hg18)) chrom <- getChromosome(snpAnnot) pos <- getPosition(snpAnnot) snpID <- getSnpID(snpAnnot) snp.filt <- rep(TRUE, length(snpID)) for (f in 1:nrow(filt)) { snp.filt[chrom == filt$chrom[f] & filt$start.base[f] < pos & pos < filt$end.base[f]] <- FALSE } snp.sel <- snpID[snp.filt] length(snp.sel) sample.sel <- scanAnnot$scanID[!scanAnnot$duplicated] length(sample.sel) gdsobj <- openfn.gds(gdsfile) snpset <- snpgdsLDpruning(gdsobj, sample.id=sample.sel, snp.id=snp.sel, autosome.only=TRUE, maf=0.05, missing.rate=0.05, method="corr", slide.max.bp=10e6, ld.threshold=sqrt(0.1)) snp.pruned <- unlist(snpset, use.names=FALSE) length(snp.pruned) ################################################### ### code chunk number 78: DataCleaning.Rnw:1842-1846 ################################################### pca <- snpgdsPCA(gdsobj, sample.id=sample.sel, snp.id=snp.pruned) names(pca) length(pca$eigenval) dim(pca$eigenvect) ################################################### ### code chunk number 79: DataCleaning.Rnw:1852-1860 ################################################### # Calculate the percentage of variance explained # by each principal component. pc.frac <- pca$eigenval/sum(pca$eigenval) lbls <- paste("EV", 1:4, "\n", format(pc.frac[1:4], digits=2), sep="") samp <- pData(scanAnnot)[match(pca$sample.id, scanAnnot$scanID),] cols <- rep(NA, nrow(samp)) cols[samp$race == "CEU"] <- "blue" cols[samp$race == "YRI"] <- "red" ################################################### ### code chunk number 80: DataCleaning.Rnw:1863-1865 ################################################### pairs(pca$eigenvect[,1:4], col=cols, labels=lbls, main = "CEU: blue, YRI: red") ################################################### ### code chunk number 81: DataCleaning.Rnw:1876-1889 ################################################### par.coord <- pca$eigenvect rangel <- apply(par.coord, 2, function(x) range(x)[1]) rangeh <- apply(par.coord, 2, function(x) range(x)[2]) std.coord <- par.coord for (i in 1:14) std.coord[,i] <- (par.coord[,i] - rangel[i])/(rangeh[i]-rangel[i]) plot(c(0,15), c(0,1), type = 'n', axes = FALSE, ylab = "", xlab = "", main = "Parallel Coordinates Plot CEU: blue, YRI: red") for (j in 1:13) for (i in sample(1:nrow(std.coord)) ) lines(c(j,j+1), std.coord[i,c(j,j+1)], col=cols[i], lwd=0.25) axis(1, at = 1:14, labels = paste("PC",1:14, sep = ".")) ################################################### ### code chunk number 82: DataCleaning.Rnw:1899-1910 ################################################### corr <- snpgdsPCACorr(pca, gdsobj, eig.which=1:4) closefn.gds(gdsobj) snp <- snpAnnot[match(corr$snp.id, snpID),] chrom <- getChromosome(snp, char=TRUE) pdf("DataCleaning-corr.pdf") par(mfrow=c(4,1)) for (i in 1:4) { snpCorrelationPlot(abs(corr$snpcorr[i,]), chrom, main=paste("Eigenvector",i), ylim=c(0,1)) } dev.off() ################################################### ### code chunk number 83: DataCleaning.Rnw:1914-1915 ################################################### file.remove(gdsfile) ################################################### ### code chunk number 84: DataCleaning.Rnw:1935-1940 ################################################### princomp <- as.data.frame(pca$eigenvect) samples.nodup <- pData(scanAnnot)[!scanAnnot$duplicated,] princomp$scanID <- as.factor(samples.nodup$scanID) princomp$case.ctrl.status <- as.factor(samples.nodup$status) princomp$race <- as.factor(samples.nodup$race) ################################################### ### code chunk number 85: DataCleaning.Rnw:1946-1953 ################################################### pc.percent <- 100 * pca$eigenval[1:32]/sum(pca$eigenval) pc.percent lbls <- paste("EV", 1:3, "\n", format(pc.percent[1:3], digits=2), "%", sep="") table(samples.nodup$status) cols <- rep(NA, nrow(samples.nodup)) cols[samples.nodup$status == 1] <- "green" cols[samples.nodup$status == 0] <- "magenta" ################################################### ### code chunk number 86: DataCleaning.Rnw:1963-1965 ################################################### pairs(pca$eigenvect[,1:3], col=cols, labels=lbls, main = "First Three EVs by Case-Control Status") ################################################### ### code chunk number 87: DataCleaning.Rnw:1968-1970 ################################################### boxplot(princomp[, 1] ~ princomp$case.ctrl.status, ylab = "PC1", main = "PC1 vs. Case-control Status") ################################################### ### code chunk number 88: DataCleaning.Rnw:1973-1975 ################################################### boxplot(princomp[, 2] ~ princomp$case.ctrl.status, ylab = "PC2", main = "PC2 vs. Case-control Status") ################################################### ### code chunk number 89: DataCleaning.Rnw:1978-1980 ################################################### boxplot(princomp[, 3] ~ princomp$case.ctrl.status, ylab = "PC3", main = "PC3 vs. Case-control Status") ################################################### ### code chunk number 90: DataCleaning.Rnw:1983-1992 ################################################### aov.p1 <- aov(princomp[,1] ~ princomp$race * princomp$case.ctrl.status, princomp) summary(aov.p1) aov.p2 <- aov(princomp[,2] ~ princomp$race * princomp$case.ctrl.status, princomp) summary(aov.p2) aov.p3 <- aov(princomp[,3] ~ princomp$race * princomp$case.ctrl.status, princomp) summary(aov.p3) ################################################### ### code chunk number 91: DataCleaning.Rnw:2012-2014 ################################################### lm.all <- lm(scanAnnot$missing.e1 ~ scanAnnot$status) summary(aov(lm.all)) ################################################### ### code chunk number 92: DataCleaning.Rnw:2017-2019 ################################################### boxplot(scanAnnot$missing.e1 ~ scanAnnot$status, ylab = "Mean missing call rate", main="Mean missing call rate by case status") ################################################### ### code chunk number 93: DataCleaning.Rnw:2035-2042 ################################################### blfile <- system.file("extdata", "illumina_bl.nc", package="GWASdata") blnc <- NcdfIntensityReader(blfile) blData <- IntensityData(blnc, snpAnnot=snpAnnot, scanAnnot=scanAnnot) genofile <- system.file("extdata", "illumina_geno.nc", package="GWASdata") genonc <- NcdfGenotypeReader(genofile) genoData <- GenotypeData(genonc, snpAnnot=snpAnnot, scanAnnot=scanAnnot) ################################################### ### code chunk number 94: DataCleaning.Rnw:2047-2050 ################################################### baf.sd <- sdByScanChromWindow(blData, genoData, var="BAlleleFreq") med.baf.sd <- medianSdOverAutosomes(baf.sd) low.qual.ids <- med.baf.sd$scanID[med.baf.sd$med.sd > 0.05] ################################################### ### code chunk number 95: DataCleaning.Rnw:2055-2072 ################################################### chrom <- snpAnnot$chromosome pos <- snpAnnot$position hla.df <- get(data(HLA.hg18)) hla <- chrom == "6" & pos >= hla.df$start.base & pos <= hla.df$end.base xtr.df <- get(data(pseudoautosomal.hg18)) xtr <- chrom == "X" & pos >= xtr.df["X.XTR", "start.base"] & pos <= xtr.df["X.XTR", "end.base"] centromeres <- get(data(centromeres.hg18)) gap <- rep(FALSE, nrow(snpAnnot)) for (i in 1:nrow(centromeres)) { ingap <- chrom == centromeres$chrom[i] & pos > centromeres$left.base[i] & pos < centromeres$right.base[i] gap <- gap | ingap } ignore <- snpAnnot$missing.n1 == 1 #ignore includes intensity-only and failed snps snp.exclude <- ignore | hla | xtr | gap snp.ok <- snpAnnot$snpID[!snp.exclude] ################################################### ### code chunk number 96: DataCleaning.Rnw:2077-2082 ################################################### scan.ids <- scanAnnot$scanID[1:10] chrom.ids <- 21:23 baf.seg <- anomSegmentBAF(blData, genoData, scan.ids=scan.ids, chrom.ids=chrom.ids, snp.ids=snp.ok, verbose=FALSE) head(baf.seg) ################################################### ### code chunk number 97: DataCleaning.Rnw:2087-2093 ################################################### baf.anom <- anomFilterBAF(blData, genoData, segments=baf.seg, snp.ids=snp.ok, centromere=centromeres, low.qual.ids=low.qual.ids, verbose=FALSE) names(baf.anom) baf.filt <- baf.anom$filtered head(baf.filt) ################################################### ### code chunk number 98: DataCleaning.Rnw:2105-2111 ################################################### loh.anom <- anomDetectLOH(blData, genoData, scan.ids=scan.ids, chrom.ids=chrom.ids, snp.ids=snp.ok, known.anoms=baf.filt, verbose=FALSE) names(loh.anom) loh.filt <- loh.anom$filtered head(loh.filt) ################################################### ### code chunk number 99: DataCleaning.Rnw:2120-2134 ################################################### # create required data frame baf.filt$method <- "BAF" if (!is.null(loh.filt)) { loh.filt$method <- "LOH" cols <- intersect(names(baf.filt), names(loh.filt)) anoms <- rbind(baf.filt[,cols], loh.filt[,cols]) } else { anoms <- baf.filt } anoms$anom.id <- 1:nrow(anoms) stats <- anomSegStats(blData, genoData, snp.ids=snp.ok, anom=anoms, centromere=centromeres) names(stats) ################################################### ### code chunk number 100: DataCleaning.Rnw:2140-2143 ################################################### snp.not.ok <- snpAnnot$snpID[snp.exclude] anomStatsPlot(blData, genoData, anom.stats=stats[1,], snp.ineligible=snp.not.ok, centromere=centromeres, cex.leg=1) ################################################### ### code chunk number 101: DataCleaning.Rnw:2153-2155 ################################################### lrr.sd <- sdByScanChromWindow(blData, var="LogRRatio", incl.hom=TRUE) med.lrr.sd <- medianSdOverAutosomes(lrr.sd) ################################################### ### code chunk number 102: DataCleaning.Rnw:2161-2163 ################################################### baf.seg.info <- baf.anom$seg.info loh.seg.info <- loh.anom$base.info[,c("scanID", "chromosome", "num.segs")] ################################################### ### code chunk number 103: DataCleaning.Rnw:2173-2178 ################################################### snpAnnot$eligible <- !snp.exclude baf.low.qual <- anomIdentifyLowQuality(snpAnnot, med.baf.sd, baf.seg.info, sd.thresh=0.1, sng.seg.thresh=0.0008, auto.seg.thresh=0.0001) loh.low.qual <- anomIdentifyLowQuality(snpAnnot, med.lrr.sd, loh.seg.info, sd.thresh=0.25, sng.seg.thresh=0.0048, auto.seg.thresh=0.0006) ################################################### ### code chunk number 104: DataCleaning.Rnw:2183-2185 ################################################### close(blData) close(genoData) ################################################### ### code chunk number 105: DataCleaning.Rnw:2200-2211 ################################################### # anomalies to filter anom.filt <- stats[,c("scanID", "chromosome", "left.base", "right.base")] # whole.chrom column is required and can be used for sex chromosome # anomalies such as XXX anom.filt$whole.chrom <- FALSE # select unique subjects subj <- scanAnnot$scanID[!scanAnnot$duplicated] subj.filt.ncdf <- "subj_filt.ncdf" ncdfSetMissingGenotypes(genofile, subj.filt.ncdf, anom.filt, sample.include=subj) (nc <- NcdfGenotypeReader(subj.filt.ncdf)) close(nc) ################################################### ### code chunk number 106: DataCleaning.Rnw:2236-2238 ################################################### scan.excl <- scanAnnot$scanID[scanAnnot$missing.e1 >= 0.05] length(scan.excl) ################################################### ### code chunk number 107: DataCleaning.Rnw:2249-2280 ################################################### snp.excl <- snpAnnot$snpID[snpAnnot$missing.n1 == 1] length(snp.excl) genofile <- system.file("extdata", "illumina_geno.nc", package="GWASdata") genoNC <- NcdfGenotypeReader(genofile) genoData <- GenotypeData(genoNC, snpAnnot=snpAnnot, scanAnnot=scanAnnot) dupdisc <- duplicateDiscordance(genoData, subjName.col="subjectID", scan.exclude=scan.excl, snp.exclude=snp.excl) names(dupdisc) head(dupdisc$discordance.by.snp) length(dupdisc$discordance.by.subject) dupdisc$discordance.by.subject[[2]] # each entry is a 2x2 matrix, but only one value of each # is important since these are all pairs npair <- length(dupdisc$discordance.by.subject) disc.subj <- rep(NA, npair) subjID <- rep(NA, npair) race <- rep(NA, npair) for (i in 1:npair) { disc.subj[i] <- dupdisc$discordance.by.subject[[i]][1,2] subjID[i] <- names(dupdisc$discordance.by.subject)[i] race[i] <- scanAnnot$race[scanAnnot$subjectID == subjID[i]][1] } dat <- data.frame(subjID=subjID, disc=disc.subj, pop=race, stringsAsFactors=FALSE) summary(dat$disc) # Assign colors for the duplicate samples based on population group. dat$col <- NA dat$col[dat$pop == "CEU"] <- "blue" dat$col[dat$pop == "YRI"] <- "red" dat <- dat[order(dat$disc),] dat$rank <- 1:npair ################################################### ### code chunk number 108: DataCleaning.Rnw:2283-2288 ################################################### # Plot the sample discordance rates color coded by race. plot(dat$disc, dat$rank, col=dat$col, ylab="rank", xlab="Discordance rate between duplicate samples", main="Duplicate Sample Discordance by Continental Ancestry") legend("bottomright", unique(dat$pop), pch=rep(1,2), col=unique(dat$col)) ################################################### ### code chunk number 109: DataCleaning.Rnw:2312-2313 ################################################### duplicateDiscordanceProbability(npair) ################################################### ### code chunk number 110: DataCleaning.Rnw:2334-2346 ################################################### men.list <- with(pData(scanAnnot), mendelList(family, subjectID, father, mother, sex, scanID)) res <- mendelListAsDataFrame(men.list) head(res) dim(res) # Only want to use SNPs with missing.n1 < 0.05 snp.excl <- snpAnnot$snpID[snpAnnot$missing.n1 >= 0.05] length(snp.excl) mend <- mendelErr(genoData, men.list, snp.exclude=snp.excl) names(mend) head(mend$trios) names(mend$snp) ################################################### ### code chunk number 111: DataCleaning.Rnw:2356-2359 ################################################### # Calculate the error rate err <- mend$snp$error.cnt / mend$snp$check.cnt table(err == 0, useNA="ifany") ################################################### ### code chunk number 112: DataCleaning.Rnw:2362-2364 ################################################### plot(err, rank(err), xlab="Error Rate (fraction)", ylab="rank", main="Mendelian Error Rate per SNP, ranked") ################################################### ### code chunk number 113: DataCleaning.Rnw:2372-2384 ################################################### fam <- mend$snp$error.cnt n <- mend$snp$check.cnt summary(fam) # SNPs with errors length(fam[n > 0 & fam > 0]) # SNPs for which more than one family has an error length(fam[n > 0 & fam > 1]) # Get the SNPs with valid trios for error detection val <- length(fam[n > 0]) noerr <- length(fam[n > 0 & fam == 0]) # Divide to get fraction with no errors noerr / val ################################################### ### code chunk number 114: DataCleaning.Rnw:2391-2404 ################################################### snp.sel <- match(names(mend$snp$error.cnt), snpAnnot$snpID) snpAnnot$mendel.err.count[snp.sel] <- mend$snp$error.cnt snpAnnot$mendel.err.sampsize[snp.sel] <- mend$snp$check.cnt allequal(snpAnnot$snpID, sort(snpAnnot$snpID)) # The high number of NA values is due to the filtering out of SNPs # before the Mendelian error rate calculation sum(is.na(snpAnnot$mendel.err.count)) sum(is.na(snpAnnot$mendel.err.sampsize)) varMetadata(snpAnnot)["mendel.err.count", "labelDescription"] <- paste("number of Mendelian errors detected in trios averaged over", "multiple combinations of replicate genotyping instances") varMetadata(snpAnnot)["mendel.err.sampsize", "labelDescription"] <- "number of opportunities to detect Mendelian error in trios" ################################################### ### code chunk number 115: DataCleaning.Rnw:2416-2433 ################################################### # Get a vector of SNPs to check snp <- pData(snpAnnot) snp$err.rate <- snp$mendel.err.count / snp$mendel.err.sampsize snp <- snp[order(snp$err.rate, decreasing=TRUE),] snp <- snp[1:9,] xyfile <- system.file("extdata", "illumina_qxy.nc", package="GWASdata") xyNC <- NcdfIntensityReader(xyfile) xyData <- IntensityData(xyNC, snpAnnot=snpAnnot, scanAnnot=scanAnnot) pdf(file="DataCleaning-mendel.pdf") par(mfrow = c(3,3)) mtxt <- paste("SNP", snp$rsID, "\nMendelian Error Rate", format(snp$err.rate, digits=5)) genoClusterPlot(xyData, genoData, snpID=snp$snpID, main.txt=mtxt, cex.main=0.9) dev.off() close(xyData) ################################################### ### code chunk number 116: DataCleaning.Rnw:2446-2463 ################################################### # Calculate the fraction of SNPs with an error for each trio trios <- mend$trios trios$Mend.err <- trios$Men.err.cnt/trios$Men.cnt summary(trios$Mend.err) # Start by pulling out the vectors needed from `trios' tmp <- trios[, c("fam.id", "Mend.err")]; dim(tmp) # Change fam.id to match the sample annotation column name names(tmp) <- c("family", "Mend.err.rate.fam") # Merge the variables into the sample annotation file scanAnnot$mend.err.rate.fam <- NA for (i in 1:nrow(tmp)) { ind <- which(is.element(scanAnnot$family, tmp$family[i])) scanAnnot$mend.err.rate.fam[ind] <- tmp$Mend.err.rate.fam[i] } head(scanAnnot$mend.err.rate.fam) varMetadata(scanAnnot)["mend.err.rate.fam", "labelDescription"] <- "Mendelian error rate per family" ################################################### ### code chunk number 117: DataCleaning.Rnw:2471-2482 ################################################### # Get the families that have non-NA values for the family # Mendelian error rate fams <- pData(scanAnnot)[!is.na(scanAnnot$mend.err.rate.fam) & !duplicated(scanAnnot$family), c("family", "mend.err.rate.fam", "race")] dim(fams) table(fams$race, useNA="ifany") # Assign colors for the different ethnicities in these families pcol <- rep(NA, nrow(fams)) pcol[fams$race == "CEU"] <- "blue" pcol[fams$race == "YRI"] <- "red" ################################################### ### code chunk number 118: DataCleaning.Rnw:2485-2490 ################################################### plot(fams$mend.err.rate.fam*100, rank(fams$mend.err.rate.fam), main="Mendelian Error rate per Family, ranked", xlab="Mendelian error rate per family (percent)", ylab="rank", col=pcol) legend("bottomright", c("CEU", "YRI"), pch=c(1,1), col=c("blue", "red")) ################################################### ### code chunk number 119: DataCleaning.Rnw:2508-2517 ################################################### head(pData(scanAnnot)[,c("father", "mother")]) nonfounders <- scanAnnot$father != 0 & scanAnnot$mother != 0 table(nonfounders) scan.excl <- scanAnnot$scanID[scanAnnot$race != "CEU" | nonfounders | scanAnnot$duplicated] length(scan.excl) hwe <- gwasExactHW(genoData, scan.exclude=scan.excl) close(genoData) ################################################### ### code chunk number 120: DataCleaning.Rnw:2525-2536 ################################################### names(hwe) dim(hwe) # Check on sample sizes for autosomes and X chromosome hwe$N <- hwe$nAA + hwe$nAB + hwe$nBB summary(hwe$N[is.element(hwe$chromosome,1:22)]) summary(hwe$N[is.element(hwe$chromosome,23)]) hwe$p.value[1:10] sum(is.na(hwe$p.value[hwe$chromosome == 24])) # XY sum(is.na(hwe$p.value[hwe$chromosome == 23])) # X hwe$MAF[1:10] hwe[1:3, c("nAA", "nAB", "nBB")] ################################################### ### code chunk number 121: DataCleaning.Rnw:2543-2544 ################################################### summary(hwe$f) ################################################### ### code chunk number 122: DataCleaning.Rnw:2547-2549 ################################################### hist(hwe$f, main="Histogram of the Inbreeding Coefficient For CEU Samples", xlab="Inbreeding Coefficient") ################################################### ### code chunk number 123: DataCleaning.Rnw:2552-2555 ################################################### # Check the MAF of those SNPs with f=1 chkf <- hwe[!is.na(hwe$f) & hwe$f==1,]; dim(chkf) summary(chkf$MAF) ################################################### ### code chunk number 124: DataCleaning.Rnw:2562-2573 ################################################### hwe.0 <- hwe[hwe$MAF > 0,]; dim(hwe.0) # Only keep the autosomal SNPs for first plot pval <- hwe.0$p.value[is.element(hwe.0$chromosome, 1:22)] length(pval) pval <- pval[!is.na(pval)] length(pval) # X chromosome SNPs for plot 2 pval.x <- hwe.0$p.value[is.element(hwe.0$chromosome, 23)] length(pval.x) pval.x <- pval.x[!is.na(pval.x)] length(pval.x) ################################################### ### code chunk number 125: DataCleaning.Rnw:2575-2582 ################################################### pdf(file = "DataCleaning-hwe.pdf") par(mfrow=c(2,2)) qqPlot(pval=pval, truncate = FALSE, main="Autosomes, all") qqPlot(pval=pval, truncate = TRUE, main="Autosomes, truncated") qqPlot(pval=pval.x, truncate = FALSE, main="X chromosome, all") qqPlot(pval=pval.x, truncate = TRUE, main="X chromosome, truncated") dev.off() ################################################### ### code chunk number 126: DataCleaning.Rnw:2588-2591 ################################################### plot(hwe.0$MAF, -log10(hwe.0$p.value), xlab="Minor Allele Frequency", ylab="-log(p-value)", main="Minor Allele Frequency vs\nP-value") ################################################### ### code chunk number 127: DataCleaning.Rnw:2627-2634 ################################################### genoNC <- NcdfGenotypeReader(subj.filt.ncdf) subjAnnot <- scanAnnot[scanAnnot$scanID %in% getScanID(genoNC),] genoData <- GenotypeData(genoNC, scanAnnot=subjAnnot) assoc <- assocTestRegression(genoData, outcome="status", covar.list=c(""), ivar.list=c(""), model.type="logistic", robust=TRUE, chromosome.set=c(24:26)) close(genoData) ################################################### ### code chunk number 128: DataCleaning.Rnw:2648-2650 ################################################### qqPlot(pval=assoc$model.1.additive.LR.pval.G, truncate=TRUE, main="QQ Plot of Likelihood Ratio Test p-values") ################################################### ### code chunk number 129: DataCleaning.Rnw:2658-2660 ################################################### chrom <- getChromosome(snpAnnot, char=TRUE) chrom.sel <- chrom %in% c("XY", "Y", "M") ################################################### ### code chunk number 130: DataCleaning.Rnw:2663-2665 ################################################### manhattanPlot(assoc$model.1.additive.LR.pval.G, chromosome=chrom[chrom.sel]) ################################################### ### code chunk number 131: DataCleaning.Rnw:2677-2696 ################################################### # Identify SNPs with lowest p-values snp <- pData(snpAnnot)[chrom.sel, c("snpID", "rsID")] allequal(snp$snpID, assoc$snpID) snp$pval <- assoc$model.1.additive.LR.pval.G snp <- snp[order(snp$pval),] snp <- snp[1:9,] xyfile <- system.file("extdata", "illumina_qxy.nc", package="GWASdata") xyNC <- NcdfIntensityReader(xyfile) xyData <- IntensityData(xyNC, snpAnnot=snpAnnot, scanAnnot=scanAnnot) genofile <- system.file("extdata", "illumina_geno.nc", package="GWASdata") genoNC <- NcdfGenotypeReader(genofile) genoData <- GenotypeData(genoNC, snpAnnot=snpAnnot, scanAnnot=scanAnnot) pdf(file="DataCleaning-cluster.pdf") par(mfrow = c(3,3)) mtxt <- paste("SNP", snp$rsID, "\np =", format(snp$pval, digits=4)) genoClusterPlot(xyData, genoData, snpID=snp$snpID, main.txt=mtxt) dev.off() close(xyData) close(genoData) ################################################### ### code chunk number 132: DataCleaning.Rnw:2702-2703 ################################################### unlink(subj.filt.ncdf)