### R code from vignette source 'vignettes/GWASTools/inst/doc/DataCleaning.Rnw' ### Encoding: UTF-8 ################################################### ### code chunk number 1: DataCleaning.Rnw:115-135 ################################################### library(GWASTools) library(GWASdata) # Load the SNP annotation (simple data frame) data(affy_snp_annot) # Create a SnpAnnotationDataFrame snpAnnot <- SnpAnnotationDataFrame(affy_snp_annot) # names of columns varLabels(snpAnnot) # data head(pData(snpAnnot)) # Add metadata to describe the columns meta <- varMetadata(snpAnnot) meta[c("snpID", "chromosome", "position", "rsID", "probeID"), "labelDescription"] <- c("unique integer ID for SNPs", paste("integer code for chromosome: 1:22=autosomes,", "23=X, 24=pseudoautosomal, 25=Y, 26=Mitochondrial, 27=Unknown"), "base pair position on chromosome (build 36)", "RS identifier", "unique ID from Affymetrix") varMetadata(snpAnnot) <- meta ################################################### ### code chunk number 2: DataCleaning.Rnw:157-184 ################################################### # Load the scan annotation (simple data frame) data(affy_scan_annot) # Create a ScanAnnotationDataFrame scanAnnot <- ScanAnnotationDataFrame(affy_scan_annot) # names of columns varLabels(scanAnnot) # data head(pData(scanAnnot)) # Add metadata to describe the columns meta <- varMetadata(scanAnnot) meta[c("scanID", "subjectID", "family", "father", "mother", "CoriellID", "race", "sex", "status", "genoRunID", "plate", "alleleFile", "chpFile"), "labelDescription"] <- c("unique integer ID for scans", "subject identifier (may have multiple scans)", "family identifier", "father identifier as subjectID", "mother identifier as subjectID", "Coriell subject identifier", "HapMap population group", "sex coded as M=male and F=female", "simulated case/control status" , "genotyping instance identifier", "plate containing samples processed together for genotyping chemistry", "data file with intensities", "data file with genotypes and quality scores") varMetadata(scanAnnot) <- meta ################################################### ### code chunk number 3: DataCleaning.Rnw:246-256 ################################################### geno.nc.file <- "tmp.geno.nc" # get snp annotation data frame for function snp <- affy_snp_annot[,c("snpID", "chromosome", "position")] ncdfCreate(ncdf.filename = geno.nc.file, snp.annotation = snp, variables = c("genotype"), array.name = "AffyGenomeWideSNP_6", genome.build = "36", n.samples = 3, precision = "single") ################################################### ### code chunk number 4: DataCleaning.Rnw:265-271 ################################################### # first 3 samples only scan.annotation <- affy_scan_annot[1:3, c("scanID", "genoRunID", "chpFile")] names(scan.annotation) <- c("scanID", "scanName", "file") # indicate which column of SNP annotation is referenced in data files snp.annotation <- affy_snp_annot[,c("snpID", "probeID")] names(snp.annotation) <- c("snpID", "snpName") ################################################### ### code chunk number 5: DataCleaning.Rnw:278-284 ################################################### col.nums <- as.integer(c(2,3,5,6)) names(col.nums) <- c("snp","geno","a1","a2") # Define a path to the raw data CHP text files which are read by # ncdfAddData to access the raw genotypic data path <- system.file("extdata", "affy_raw_data", package="GWASdata") ################################################### ### code chunk number 6: DataCleaning.Rnw:295-326 ################################################### diag.geno.file <- "diag.geno.RData" diag.geno <- ncdfAddData(path = path, ncdf.filename = geno.nc.file, snp.annotation = snp.annotation, scan.annotation = scan.annotation, sep.type = "\t", skip.num = 1, col.total = 6, col.nums = col.nums, scan.name.in.file = -1, scan.start.index = 1, diagnostics.filename = diag.geno.file, verbose = FALSE) # Look at the values included in the "diag.geno" object which holds # all output from the function call names(diag.geno) # `read.file' is a vector indicating whether (1) or not (0) each file # specified in the `files' argument was read successfully table(diag.geno$read.file) # `row.num' is a vector of the number of rows read from each file table(diag.geno$row.num) # `sample.match' is a vector indicating whether (1) or not (0) # the sample name inside the raw text file matches that in the # sample annotation data.frame table(diag.geno$sample.match) # `snp.chk' is a vector indicating whether (1) or not (0) # the raw text file has the expected set of SNP names table(diag.geno$snp.chk) # `chk' is a vector indicating whether (1) or not (0) all previous # checks were successful and the data were written to the NetCDF file table(diag.geno$chk) ################################################### ### code chunk number 7: DataCleaning.Rnw:335-341 ################################################### (genofile <- NcdfGenotypeReader(geno.nc.file)) # Take out genotype data for the first 3 samples and # the first 5 SNPs (genos <- getGenotype(genofile, snp=c(1,5), scan=c(1,3))) # Close the NetCDF file close(genofile) ################################################### ### code chunk number 8: DataCleaning.Rnw:347-370 ################################################### check.geno.file <- "check.geno.RData" check.geno <- ncdfCheckGenotype(path = path, ncdf.filename = geno.nc.file, snp.annotation = snp.annotation, scan.annotation = scan.annotation, sep.type = "\t", skip.num = 1, col.total = 6, col.nums = col.nums, scan.name.in.file = -1, check.scan.index = 1:3, n.scans.loaded = 3, diagnostics.filename = check.geno.file, verbose = FALSE) # Look at the values included in the "check.geno" object which holds # all output from the function call names(check.geno) # 'snp.order' is a vector indicating whether (1) or not (0) the snp ids # are in the same order in each file. table(check.geno$snp.order) # 'geno.chk' is a vector indicating whether (1) or not (0) the genotypes # in the netCDF match the text file table(check.geno$geno.chk) ################################################### ### code chunk number 9: DataCleaning.Rnw:414-424 ################################################### qxy.nc.file <- "tmp.qxy.nc" # get snp annotation data frame for function snp <- affy_snp_annot[,c("snpID", "chromosome", "position")] ncdfCreate(ncdf.filename = qxy.nc.file, snp.annotation = snp, variables = c("quality","X","Y"), array.name = "AffyGenomeWideSNP_6", genome.build = "36", n.samples = 3, precision = "single") ################################################### ### code chunk number 10: DataCleaning.Rnw:433-439 ################################################### # first 3 samples only scan.annotation <- affy_scan_annot[1:3, c("scanID", "genoRunID", "chpFile")] names(scan.annotation) <- c("scanID", "scanName", "file") # indicate which column of SNP annotation is referenced in data files snp.annotation <- affy_snp_annot[,c("snpID", "probeID")] names(snp.annotation) <- c("snpID", "snpName") ################################################### ### code chunk number 11: DataCleaning.Rnw:446-452 ################################################### col.nums <- as.integer(c(2,4)) names(col.nums) <- c("snp","qs") # Define a path to the raw data CHP text files which are read by # ncdfAddData to access the raw genotypic data path <- system.file("extdata", "affy_raw_data", package="GWASdata") ################################################### ### code chunk number 12: DataCleaning.Rnw:461-474 ################################################### diag.qual.file <- "diag.qual.RData" diag.qual <- ncdfAddData(path = path, ncdf.filename = qxy.nc.file, snp.annotation = snp.annotation, scan.annotation = scan.annotation, sep.type = "\t", skip.num = 1, col.total = 6, col.nums = col.nums, scan.name.in.file = -1, scan.start.index = 1, diagnostics.filename = diag.qual.file, verbose = FALSE) ################################################### ### code chunk number 13: DataCleaning.Rnw:481-492 ################################################### scan.annotation <- affy_scan_annot[1:3, c("scanID", "genoRunID", "alleleFile")] names(scan.annotation) <- c("scanID", "scanName", "file") diag.xy.file <- "diag.xy.RData" diag.xy <- ncdfAddIntensity(path = path, ncdf.filename = qxy.nc.file, snp.annotation = snp.annotation, scan.annotation = scan.annotation, scan.start.index = 1, n.consecutive.scans = 3, diagnostics.filename = diag.xy.file, verbose = FALSE) ################################################### ### code chunk number 14: DataCleaning.Rnw:499-506 ################################################### # Open the NetCDF file we just created (intenfile <- NcdfIntensityReader(qxy.nc.file)) # Take out the normalized X intensity values for the first # 5 SNPs for the first 3 samples (xinten <- getX(intenfile, snp=c(1,5), scan=c(1,3))) # Close the NetCDF file close(intenfile) ################################################### ### code chunk number 15: DataCleaning.Rnw:512-531 ################################################### scan.annotation <- affy_scan_annot[1:5, c("scanID", "genoRunID", "chpFile", "alleleFile")] names(scan.annotation) <- c("scanID", "scanName", "file", "inten.file") check.qxy.file <- "check.qxy.RData" check.qxy <- ncdfCheckIntensity(path = path, intenpath = path, ncdf.filename = qxy.nc.file, snp.annotation = snp.annotation, scan.annotation = scan.annotation, sep.type = "\t", skip.num = 1, col.total = 6, col.nums = col.nums, scan.name.in.file = -1, check.scan.index = 1:3, n.scans.loaded = 3, affy.inten = TRUE, diagnostics.filename = check.qxy.file, verbose = FALSE) ################################################### ### code chunk number 16: DataCleaning.Rnw:628-638 ################################################### bl.nc.file <- "tmp.bl.nc" # get snp annotation data frame for function snp <- affy_snp_annot[,c("snpID", "chromosome", "position")] ncdfCreate(ncdf.filename = bl.nc.file, snp.annotation = snp, variables = c("BAlleleFreq","LogRRatio"), array.name = "AffyGenomeWideSNP_6", genome.build = "36", n.samples = 3, precision = "single") ################################################### ### code chunk number 17: DataCleaning.Rnw:650-664 ################################################### xyNC <- NcdfIntensityReader(qxy.nc.file) genoNC <- NcdfGenotypeReader(geno.nc.file) BAFfromGenotypes(xyNC, genoNC, bl.ncdf.filename = bl.nc.file, min.n.genotypes = 0, call.method ="by.study") close(xyNC) close(genoNC) # Open the NetCDF file we just created (blfile <- NcdfIntensityReader(bl.nc.file)) # Look at the BAlleleFrequency values for the first 5 SNPs (baf <- getBAlleleFreq(blfile, snp=c(1,5), scan=c(1,3))) # Close the NetCDF file close(blfile) ################################################### ### code chunk number 18: DataCleaning.Rnw:669-672 ################################################### file.remove(geno.nc.file, qxy.nc.file, bl.nc.file) file.remove(diag.geno.file, diag.qual.file, diag.xy.file) file.remove(check.geno.file, check.qxy.file) ################################################### ### code chunk number 19: DataCleaning.Rnw:720-739 ################################################### library(GWASTools) library(GWASdata) # sex is required for this function, so we load the scan annotation data(affyScanADF) # ScanAnnotationDataFrame scanAnnot <- affyScanADF # open the netCDF file and create a GenotypeData object ncfile <- system.file("extdata", "affy_geno.nc", package="GWASdata") nc <- NcdfGenotypeReader(ncfile) genoData <- GenotypeData(nc, scanAnnot=scanAnnot) # Calculate the number of missing calls for each snp over all samples # for each gender separately miss <- missingGenotypeBySnpSex(genoData) # Examine the results length(miss) names(miss) head(miss$missing.counts) miss$scans.per.sex head(miss$missing.fraction) ################################################### ### code chunk number 20: DataCleaning.Rnw:748-756 ################################################### # get snp annotation data(affySnpADF) # SnpAnnotationDataFrame snpAnnot <- affySnpADF snpAnnot$missing.n1 <- miss$missing.fraction varMetadata(snpAnnot)["missing.n1", "labelDescription"] <- paste( "fraction of genotype calls missing over all samples", "except that females are excluded for Y chr SNPs") summary(snpAnnot$missing.n1) ################################################### ### code chunk number 21: DataCleaning.Rnw:766-769 ################################################### hist(snpAnnot$missing.n1[snpAnnot$chromosome == 25], xlab="Fraction of Y chr SNP missing calls", main="Y Chromosome SNP Missing Call Rate for Males") ################################################### ### code chunk number 22: DataCleaning.Rnw:780-783 ################################################### hist(snpAnnot$missing.n1, ylim=c(0,100), xlab="SNP missing call rate", main="Missing Call Rate for All Probes") ################################################### ### code chunk number 23: DataCleaning.Rnw:790-801 ################################################### # Find the number of SNPs with every call missing length(snpAnnot$missing.n1[snpAnnot$missing.n1 == 1]) # Fraction of autosomal SNPs with missing call rate < 0.05 x <- snpAnnot$missing.n1[snpAnnot$chromosome < 23] length(x[x < 0.05]) / length(x) # Fraction of X chromosome SNPs with missing call rate < 0.05 x <- snpAnnot$missing.n1[snpAnnot$chromosome == 23] length(x[x < 0.05]) / length(x) # Fraction of Y chromosome SNPs with missing call rate < 0.05 x <- snpAnnot$missing.n1[snpAnnot$chromosome == 25] length(x[x < 0.05]) / length(x) ################################################### ### code chunk number 24: DataCleaning.Rnw:822-830 ################################################### # Want to exclude all SNP probes with 100% missing call rate # Check on how many SNPs to exclude sum(abs(snpAnnot$missing.n1 - 1) < 1e-6) sum(snpAnnot$missing.n1 == 1) # Create a variable that contains the IDs of these SNPs to exclude snpexcl <- snpAnnot$snpID[abs(snpAnnot$missing.n1 - 1) < 1e-6 | snpAnnot$missing.n1 == 1] length(snpexcl) ################################################### ### code chunk number 25: DataCleaning.Rnw:832-841 ################################################### # Use the missingGenotypeByScanChrom function miss <- missingGenotypeByScanChrom(genoData, snp.exclude=snpexcl) length(miss) names(miss) head(miss$missing.counts) head(miss$snps.per.chr) # Check to make sure that the correct number of SNPs were excluded sum(miss$snps.per.chr) nrow(snpAnnot) - sum(miss$snps.per.chr) ################################################### ### code chunk number 26: DataCleaning.Rnw:848-857 ################################################### head(miss$missing.fraction) # Check the ordering matches the sample annotation file allequal(names(miss$missing.fraction), scanAnnot$scanID) # Add the missing call rates vector to the sample annotation file scanAnnot$missing.e1 <- miss$missing.fraction varMetadata(scanAnnot)["missing.e1", "labelDescription"] <- paste( "fraction of genotype calls missing over all snps with missing.n1<1", "except that Y chr SNPs are excluded for females") summary(scanAnnot$missing.e1) ################################################### ### code chunk number 27: DataCleaning.Rnw:872-875 ################################################### hist(scanAnnot$missing.e1, xlab="Fraction of missing calls over all probes", main="Histogram of Sample Missing Call Rate for all Samples") ################################################### ### code chunk number 28: DataCleaning.Rnw:881-887 ################################################### # Look at missing.e1 for males summary(scanAnnot$missing.e1[scanAnnot$sex == "M"]) # Look at missing.e1 for females summary(scanAnnot$missing.e1[scanAnnot$sex == "F"]) # Number of samples with missing call rate > 5% sum(scanAnnot$missing.e1 > 0.05) ################################################### ### code chunk number 29: DataCleaning.Rnw:898-921 ################################################### auto <- colnames(miss$missing.counts) %in% 1:22 missa <- rowSums(miss$missing.counts[,auto]) / sum(miss$snps.per.chr[auto]) summary(missa) missx <- miss$missing.counts[,"X"] / miss$snps.per.chr["X"] summary(missx) # check they match sample annotation file allequal(names(missa), scanAnnot$scanID) allequal(names(missx), scanAnnot$scanID) # Add these separate sample missing call rates to the sample # annotation scanAnnot$miss.e1.auto <- missa scanAnnot$miss.e1.xchr <- missx # Order scanAnnot by missing.e1 so duplicate subjectIDs # with a higher missing rate are marked as duplicates scanAnnot <- scanAnnot[order(scanAnnot$subjectID, scanAnnot$missing.e1),] scanAnnot$duplicated <- duplicated(scanAnnot$subjectID) table(scanAnnot$duplicated, exclude=NULL) # Put scanAnnot back in scanID order; this is very important!! # Save the updated sample annotation scanAnnot <- scanAnnot[order(scanAnnot$scanID),] allequal(scanAnnot$scanID, sort(scanAnnot$scanID)) varMetadata(scanAnnot)["duplicated", "labelDescription"] <- "TRUE for duplicate scan with higher missing.e1" ################################################### ### code chunk number 30: DataCleaning.Rnw:942-962 ################################################### sum(scanAnnot$missing.e1 > 0.05) # Even though there are no samples with missing.e1 > 0.05, # we will go through the full process for the tutorial # Find the samples with missing.e1 > .05 and make a vector of # scanID to exclude from the calculation scan.exclude <- scanAnnot$scanID[scanAnnot$missing.e1 > 0.05] # Call missingGenotypeBySnpSex and save the output miss <- missingGenotypeBySnpSex(genoData, scan.exclude=scan.exclude) length(miss) names(miss) # Check that all SNPs are listed dim(miss$missing.counts) miss$scans.per.sex dim(miss$missing.fraction) # Make sure ordering matches snp annotation allequal(snpAnnot$snpID, as.numeric(names(miss$missing.fraction))) snpAnnot$missing.n2 <- miss$missing.fraction varMetadata(snpAnnot)["missing.n2", "labelDescription"] <- paste( "fraction of genotype calls missing over all samples with missing.e1<0.05", "except that females are excluded for Y chr SNPs") summary(snpAnnot$missing.n2) ################################################### ### code chunk number 31: DataCleaning.Rnw:972-975 ################################################### hist(snpAnnot$missing.n2[snpAnnot$chromosome == 25], xlab="Fraction of Y chr SNP missing calls", main="Y Chromosome SNP Missing Call Rate for Males") ################################################### ### code chunk number 32: DataCleaning.Rnw:986-988 ################################################### hist(snpAnnot$missing.n2, ylim=c(0,10000), xlab="SNP missing call rate", main="Histogram of SNP Missing Call Rate") ################################################### ### code chunk number 33: DataCleaning.Rnw:995-1006 ################################################### # Find the number of SNPs with every call missing length(snpAnnot$missing.n2[snpAnnot$missing.n2 == 1]) # Fraction of autosomal SNPs with missing call rate < .05 x <- snpAnnot$missing.n2[snpAnnot$chromosome < 23] length(x[x < 0.05]) / length(x) # Fraction of X chromosome SNPs with missing call rate <.05 x <- snpAnnot$missing.n2[snpAnnot$chromosome == 23] length(x[x < 0.05]) / length(x) # Fraction of Y chromosome SNPs with missing call rate <.05 x <- snpAnnot$missing.n2[snpAnnot$chromosome == 25] length(x[x < 0.05]) / length(x) ################################################### ### code chunk number 34: DataCleaning.Rnw:1025-1028 ################################################### # Create a vector of the SNPs to exclude. snpexcl <- snpAnnot$snpID[snpAnnot$missing.n2 >= 0.05] length(snpexcl) ################################################### ### code chunk number 35: DataCleaning.Rnw:1030-1050 ################################################### # Use the missingGenotypeByScanChrom function miss <- missingGenotypeByScanChrom(genoData, snp.exclude=snpexcl) length(miss) names(miss) head(miss$missing.counts) head(miss$snps.per.chr) # Check to make sure that the correct number of SNPs were excluded sum(miss$snps.per.chr) nrow(snpAnnot) - sum(miss$snps.per.chr) head(miss$missing.fraction) # Check the ordering matches the sample annotation file allequal(names(miss$missing.fraction), scanAnnot$scanID) # Add the missing call rates vector to the sample annotation file scanAnnot$missing.e2 <- miss$missing.fraction varMetadata(scanAnnot)["missing.e2", "labelDescription"] <- paste( "fraction of genotype calls missing over all snps with missing.n2<0.05", "except that Y chr SNPs are excluded for females") summary(scanAnnot$missing.e2) # Make sure the samples are still in order allequal(scanAnnot$scanID, sort(scanAnnot$scanID)) ################################################### ### code chunk number 36: DataCleaning.Rnw:1061-1064 ################################################### hist(scanAnnot$missing.e2, xlab="Fraction of missing calls over all probes with missing call rate < 0.05", main="Histogram of Sample Missing Call Rate for all Samples") ################################################### ### code chunk number 37: DataCleaning.Rnw:1071-1075 ################################################### # Look at missing.e2 for males summary(scanAnnot$missing.e2[scanAnnot$sex == "M"]) # Look at missing.e2 for females summary(scanAnnot$missing.e2[scanAnnot$sex == "F"]) ################################################### ### code chunk number 38: DataCleaning.Rnw:1080-1084 ################################################### plot(sort(scanAnnot$missing.e2), rank(sort(scanAnnot$missing.e2)), xlab="Fraction of missing calls over probes with MCR < 0.05", ylab="rank", cex.lab=0.75, main="Sorted Fraction of Missing Call Rate \nOver All Probes with MCR < 0.05") ################################################### ### code chunk number 39: DataCleaning.Rnw:1105-1110 ################################################### varLabels(scanAnnot) # Check how many batches exist and how many samples are in each batch sum(is.na(scanAnnot$plate)) length(unique(scanAnnot$plate)) table(scanAnnot$plate, exclude=NULL) ################################################### ### code chunk number 40: DataCleaning.Rnw:1115-1120 ################################################### # Plot the distribution of the number of samples per batch. barplot(table(scanAnnot$plate), ylab="Number of Samples", xlab="Batch", main="Distribution of Samples per Batch", cex.axis=0.85, cex.names=0.75, cex.lab=0.9) ################################################### ### code chunk number 41: DataCleaning.Rnw:1126-1135 ################################################### # Examine the mean missing call rate per batch for all SNPs batches <- unique(scanAnnot$plate) bmiss <- rep(NA,length(batches)); names(bmiss) <- batches bn <- rep(NA,length(batches)); names(bn) <- batches for(i in 1:length(batches)) { x <- scanAnnot$missing.e1[is.element(scanAnnot$plate, batches[i])] bmiss[i] <- mean(x) bn[i] <- length(x) } ################################################### ### code chunk number 42: DataCleaning.Rnw:1146-1148 ################################################### y <- lm(bmiss ~ bn) anova(y) ################################################### ### code chunk number 43: DataCleaning.Rnw:1153-1157 ################################################### plot(bn, bmiss, xlab="Number of samples per batch", ylab="Mean missing call rate", main="Mean Missing Call Rate vs\nSamples per Batch", cex.main=0.8) abline(y$coefficients) ################################################### ### code chunk number 44: DataCleaning.Rnw:1184-1193 ################################################### # Perform the Chi-square test using the batchChisqTest function res <- batchChisqTest(genoData, batchVar="plate", return.by.snp=TRUE) close(genoData) # chi-square values for each SNP dim(res$chisq) # genomic inflation factor res$lambda # Examine the average chi-square test statistics for each of the batches res$mean.chisq ################################################### ### code chunk number 45: DataCleaning.Rnw:1198-1218 ################################################### x <- table(scanAnnot$race, exclude=NULL) x x[1] / sum(x) x[2] / sum(x) x <- table(scanAnnot$race, scanAnnot$plate) x # Run an approximate chi-square test to see if there are ethnic effects chisq <- chisq.test(x) chisq$p.value # Check if ethnicity CEU has a batch effect # Calculate the fraction of samples in each batch that are CEU ethnicity batches <- unique(scanAnnot$plate) length(batches) eth <- rep(NA,length(batches)); names(eth) <- sort(batches) for(i in 1:length(batches)){ x <- scanAnnot$race[is.element(scanAnnot$plate, batches[i])] xl <- length(x[x == "CEU"]) eth[i] <- xl / length(x) } allequal(names(eth), names(res$mean.chisq)) ################################################### ### code chunk number 46: DataCleaning.Rnw:1223-1230 ################################################### # Plot the average Chi-Square test statistic against the # fraction of samples that are CEU plot(eth, res$mean.chisq, xlab="Fraction of CEU Samples per Batch", ylab="Average Chi-square Test Statistic", main="Fraction of CEU Samples per Batch vs Average Chi-square Test Statistic") abline(v=mean(eth), lty=2, col="red") ################################################### ### code chunk number 47: DataCleaning.Rnw:1265-1279 ################################################### library(GWASTools) library(GWASdata) data(illuminaScanADF) # ScanAnnotationDataFrame scanAnnot <- illuminaScanADF qxyfile <- system.file("extdata", "illumina_qxy.nc", package="GWASdata") qualNC <- NcdfIntensityReader(qxyfile) qualData <- IntensityData(qualNC, scanAnnot=scanAnnot) genofile <- system.file("extdata", "illumina_geno.nc", package="GWASdata") genoNC <- NcdfGenotypeReader(genofile) genoData <- GenotypeData(genoNC, scanAnnot=scanAnnot) qual.results <- qualityScoreByScan(qualData, genoData) close(qualData) ################################################### ### code chunk number 48: DataCleaning.Rnw:1284-1286 ################################################### hist(qual.results[,"median.quality"], main="Median Genotype Quality Scores of Samples", xlab="Median Quality") ################################################### ### code chunk number 49: DataCleaning.Rnw:1327-1337 ################################################### blfile <- system.file("extdata", "illumina_bl.nc", package="GWASdata") blNC <- NcdfIntensityReader(blfile) blData <- IntensityData(blNC) nbins <- rep(12, 3) slidingBAF12 <- sdByScanChromWindow(blData, genoData, nbins=nbins) length(slidingBAF12) length(slidingBAF12) dim(slidingBAF12[[1]]) dim(slidingBAF12[[2]]) ################################################### ### code chunk number 50: DataCleaning.Rnw:1345-1349 ################################################### sds.chr <- meanSdByChromWindow(slidingBAF12, scanAnnot$sex) sds.chr[["21"]] sds.chr[["X"]] ################################################### ### code chunk number 51: DataCleaning.Rnw:1356-1364 ################################################### res12bin4sd <- findBAFvariance(sds.chr, slidingBAF12, scanAnnot$sex, sd.threshold=4) nrow(res12bin4sd) head(res12bin4sd) sum(is.na(res12bin4sd)) table(res12bin4sd[, 2]) all(res12bin4sd[res12bin4sd[, "sex"] == "F", "scanID"] %in% scanAnnot$scanID[scanAnnot$sex == "F"]) ################################################### ### code chunk number 52: DataCleaning.Rnw:1373-1382 ################################################### scanID <- as.integer(res12bin4sd[, "scanID"]) chrom <- as.integer(res12bin4sd[, "chromosome"]) chrom[res12bin4sd[, "chromosome"] == "X"] <- 23 bincode <- paste("Bin", res12bin4sd[, "bin"], sep = " ") sexcode <- paste("Sex", res12bin4sd[, "sex"], sep = " ") code <- paste(bincode, sexcode, sep = ", ") plotind <- 1 chromIntensityPlot(blData, scanID[plotind], chrom[plotind], code=code[plotind], type="BAF") ################################################### ### code chunk number 53: DataCleaning.Rnw:1387-1388 ################################################### close(blData) ################################################### ### code chunk number 54: DataCleaning.Rnw:1418-1422 ################################################### miss <- missingGenotypeByScanChrom(genoData) miss.rate <- t(apply(miss$missing.counts, 1, function(x) { x / miss$snps.per.chr})) miss.rate <- as.data.frame(miss.rate) ################################################### ### code chunk number 55: DataCleaning.Rnw:1431-1434 ################################################### cols <- names(miss.rate) %in% c(1:22, "X", "XY") boxplot(miss.rate[,cols], main="Missingness by Chromosome", ylab="Proportion Missing", xlab="Chromosome") ################################################### ### code chunk number 56: DataCleaning.Rnw:1441-1444 ################################################### boxplot(miss.rate$X ~ scanAnnot$sex, main="X Chromosome Missingness by Sex", ylab="Proportion Missing") ################################################### ### code chunk number 57: DataCleaning.Rnw:1456-1468 ################################################### # Calculate heterozygosity by scan by chromosome het.results <- hetByScanChrom(genoData) close(genoData) # Ensure heterozygosity results are ordered correctly allequal(scanAnnot$scanID, rownames(het.results)) # Write autosomal and X chr heterozygosity to sample annot scanAnnot$het.A <- het.results[,"A"] scanAnnot$het.X <- het.results[,"X"] varMetadata(scanAnnot)["het.A", "labelDescription"] <- "fraction of heterozygotes for autosomal SNPs" varMetadata(scanAnnot)["het.X", "labelDescription"] <- "fraction of heterozygotes for X chromosome SNPs" ################################################### ### code chunk number 58: DataCleaning.Rnw:1478-1480 ################################################### boxplot(scanAnnot$het.A ~ scanAnnot$race, main="Autosomal Heterozygosity") ################################################### ### code chunk number 59: DataCleaning.Rnw:1488-1491 ################################################### male <- scanAnnot$sex == "M" boxplot(scanAnnot$het.A[male] ~ scanAnnot$race[male], main="X Chromosome Heterozygosity in Females") ################################################### ### code chunk number 60: DataCleaning.Rnw:1533-1542 ################################################### library(GWASTools) library(GWASdata) qxyfile <- system.file("extdata", "affy_qxy.nc", package="GWASdata") intenNC <- NcdfIntensityReader(qxyfile) inten.by.chrom <- meanIntensityByScanChrom(intenNC) close(intenNC) length(inten.by.chrom) names(inten.by.chrom) ################################################### ### code chunk number 61: DataCleaning.Rnw:1550-1564 ################################################### mninten <- inten.by.chrom[[1]] # mean intensities dim(mninten) data(affyScanADF) # ScanAnnotationDataFrame scanAnnot <- affyScanADF # Check to be sure sample ordering is consistent allequal(scanAnnot$scanID, rownames(mninten)) # Assign each gender a color xcol <- rep(NA, nrow(scanAnnot)) xcol[scanAnnot$sex == "M"] <- "blue" xcol[scanAnnot$sex == "F"] <- "red" data(affySnpADF) snpAnnot <- affySnpADF nx <- sum(snpAnnot$chromosome == 23) ny <- sum(snpAnnot$chromosome == 25) ################################################### ### code chunk number 62: DataCleaning.Rnw:1574-1609 ################################################### #All intensities x1 <-mninten[,"X"]; y1 <- mninten[,"Y"] main1 <- "Mean X vs \nMean Y Chromosome Intensity" #Het on X vs X intensity x2 <- mninten[,"X"]; y2 <- scanAnnot$het.X main2 <- "Mean X Chromosome vs Mean X Chromosome Heterozygosity" # Het on X vs Y intensity y3 <- mninten[,"Y"]; x3 <- scanAnnot$het.X main3 <- "Mean X Chromosome Heterozygosity vs Mean Y Chromosome Intensity" # X vs A het x4 <- scanAnnot$het.A[scanAnnot$sex == "F"] y4 <- scanAnnot$het.X[scanAnnot$sex == "F"] main4 <- "Mean Autosomal Heterozygosity vs Mean X Chromosome Heterozygosity" cols <- c("blue","red") mf <- c("male", "female") xintenlab <- paste("X intensity (n=", nx, ")", sep="") yintenlab <- paste("Y intensity (n=", ny, ")", sep="") pdf("DataCleaning-gender.pdf") par(mfrow=c(2,2)) plot(x1, y1, xlab=xintenlab, ylab=yintenlab, main=main1, col=xcol, cex.main=0.8) legend("topright",mf,col=cols,pch=c(1,1),cex=0.8) plot(x2, y2, col=xcol, xlab=xintenlab, ylab="X heterozygosity", main=main2, cex.main=0.8) legend("topleft", mf, col=cols, pch=c(1,1),cex=0.8) plot(x3, y3, col=xcol, ylab=yintenlab, xlab="X heterozygosity", main=main3, cex.main=0.8) legend("topright", mf, col=cols, pch=c(1,1),cex=0.8) plot(x4,y4, col="red", xlab="Autosomal heterozygosity", ylab="X heterozygosity", main=main4, cex.main=0.8) legend("topleft", mf, col=cols, pch=c(1,1),cex=0.8) dev.off() ################################################### ### code chunk number 63: DataCleaning.Rnw:1652-1658 ################################################### library(gdsfmt) library(SNPRelate) ncfile <- system.file("extdata", "affy_geno.nc", package="GWASdata") gdsfile <- "broad.hapmap.geno.gds" convertNcdfGds(ncfile, gdsfile) ################################################### ### code chunk number 64: DataCleaning.Rnw:1666-1670 ################################################### snp.auto <- pData(snpAnnot)[snpAnnot$chromosome < 23, ] snp.auto.nonmiss <- snp.auto[snp.auto$missing.n1 < 0.05, ] snp.select <- snp.auto.nonmiss$snpID length(snp.select) ################################################### ### code chunk number 65: DataCleaning.Rnw:1679-1687 ################################################### sample.sel <- scanAnnot$scanID[scanAnnot$race == "CEU"] length(sample.sel) gdsobj <- openfn.gds(gdsfile) ibd <- snpgdsIBDMLE(gdsobj, sample.id=sample.sel, snp.id=snp.select, method="EM") closefn.gds(gdsobj) dim(ibd$k0) ibd$k0[1:5,1:5] ################################################### ### code chunk number 66: DataCleaning.Rnw:1697-1703 ################################################### samp <- pData(scanAnnot)[scanAnnot$race == "CEU", c("family", "subjectID", "father", "mother", "sex")] dim(samp) names(samp) <- c("family", "individ", "father", "mother", "sex") samp[1:5, ] pedigreeClean(samp) ################################################### ### code chunk number 67: DataCleaning.Rnw:1711-1715 ################################################### dups <- pedigreeFindDuplicates(samp) dups uni.samp <- pedigreeDeleteDuplicates(samp, dups$dups.match) fc <- pedigreeCheck(uni.samp) ################################################### ### code chunk number 68: DataCleaning.Rnw:1727-1731 ################################################### length(fc$one.person) length(fc$mismatch.sex) length(fc$impossible.related) length(fc$subfamilies.ident) ################################################### ### code chunk number 69: DataCleaning.Rnw:1737-1745 ################################################### subf <- fc$subfamilies.ident head(subf) table(subf$family) subf.ids <- subf$individ[subf$subfamily == 2] newfam <- uni.samp$individ %in% subf.ids uni.samp$family[newfam] <- uni.samp$family[newfam] + 10000 table(uni.samp$family) pedigreeCheck(uni.samp) ################################################### ### code chunk number 70: DataCleaning.Rnw:1755-1760 ################################################### rels <- pedigreePairwiseRelatedness(uni.samp) length(rels$inbred.fam) relprs <- rels$relativeprs relprs[1:5,] table(relprs$relation) ################################################### ### code chunk number 71: DataCleaning.Rnw:1767-1783 ################################################### relprs$s12 <- paste(relprs$Individ1, relprs$Individ2) ibd$subjectID <- scanAnnot$subjectID[is.element( scanAnnot$scanID, ibd$sample.id)] sample1.temp <- rep(ibd$subjectID, length(ibd$subjectID)) sample1.temp <- matrix(sample1.temp, nrow = dim(ibd$k0)[1]) sample2.temp <- t(sample1.temp) ibd$sample1.subjectID <- as.array(sample1.temp) ibd$sample2.subjectID <- as.array(sample2.temp) ibd$s12 <- paste(ibd$sample1.subjectID, ibd$sample2.subjectID) ibd$s21 <- paste(ibd$sample2.subjectID, ibd$sample1.subjectID) ibd$rel <- array("U", length(ibd$s12)) ibd$rel[is.element(ibd$s12, relprs$s12[is.element(relprs$relation, "PO")]) | is.element(ibd$s21, relprs$s12[is.element( relprs$relation, "PO")])] <- "PO" ibd$rel[ibd$sample1.subjectID == ibd$sample2.subjectID] <- "Dup" table(ibd$rel, exclude=NULL) ################################################### ### code chunk number 72: DataCleaning.Rnw:1792-1794 ################################################### ibdPlot(ibd$k0, ibd$k1, relation=ibd$rel, main="HapMap CEU IBD Estimates \nEM Algorithm") ################################################### ### code chunk number 73: DataCleaning.Rnw:1816-1846 ################################################### # Allele frequency calculation in YRI samples # open the NetCDF file and create a GenotypeData object nc <- NcdfGenotypeReader(ncfile) genoData <- GenotypeData(nc, scanAnnot=scanAnnot, snpAnnot=snpAnnot) # sample selection - unduplicated YRI samples sample.excl <- scanAnnot$scanID[scanAnnot$race != "YRI" | scanAnnot$duplicated] length(sample.excl) afreq <- alleleFrequency(genoData, scan.exclude=sample.excl) close(genoData) # Add allele frequency to SNP annotation afreq <- afreq[,"all"] # males and females combined snpAnnot$A.freq.yri <- afreq varMetadata(snpAnnot)["A.freq.yri", "labelDescription"] <- "frequency of allele A in YRI subjects" # Initial selection: autosomal SNPs with low missing call rate and MAF>0 init.sel <- snpAnnot$missing.n1 < 0.05 & snpAnnot$chromosome < 23 & afreq > 0 & afreq < 1 sum(init.sel) # Call the function to select SNPs no closer than 15000 bases set.seed(1001) n <- 15000 rsnp2 <- apartSnpSelection(snpAnnot$chromosome, snpAnnot$position, min.dist=n, init.sel=init.sel) sum(rsnp2) # Add the SNP selection to the SNP annotation file snpAnnot$r15kb <- rsnp2 varMetadata(snpAnnot)["r15kb", "labelDescription"] <- paste( "Logical indicator for autosomal SNPs with missing.n1<0.05", "and YRI MAF>0, at least 15 kb apart") ################################################### ### code chunk number 74: DataCleaning.Rnw:1852-1860 ################################################### sample.sel <- scanAnnot$scanID[scanAnnot$race == "YRI"] snp.select <- snpAnnot$snpID[snpAnnot$r15kb] gdsobj <- openfn.gds(gdsfile) ibd <- snpgdsIBDMoM(gdsobj, sample.id=sample.sel, snp.id=snp.select) closefn.gds(gdsobj) dim(ibd$k0) ibd$k0[1:5,1:5] ################################################### ### code chunk number 75: DataCleaning.Rnw:1870-1876 ################################################### samp <- pData(scanAnnot)[scanAnnot$race == "YRI", c("family", "subjectID", "father", "mother", "sex")] dim(samp) names(samp) <- c("family", "individ", "father", "mother", "sex") samp[1:5, ] pedigreeClean(samp) ################################################### ### code chunk number 76: DataCleaning.Rnw:1886-1891 ################################################### dups <- pedigreeFindDuplicates(samp) dups uni.samp <- pedigreeDeleteDuplicates(samp, dups$dups.match) fc <- pedigreeCheck(uni.samp) (subf <- fc$subfamilies.ident) ################################################### ### code chunk number 77: DataCleaning.Rnw:1897-1900 ################################################### uni.samp <- uni.samp[!(uni.samp$family %in% subf$family),] table(uni.samp$family) pedigreeCheck(uni.samp) ################################################### ### code chunk number 78: DataCleaning.Rnw:1906-1927 ################################################### rels <- pedigreePairwiseRelatedness(uni.samp) length(rels$inbred.fam) relprs <- rels$relativeprs relprs[1:5,] table(relprs$relation) relprs$s12 <- paste(relprs$Individ1, relprs$Individ2) ibd$subjectID <- scanAnnot$subjectID[is.element( scanAnnot$scanID, ibd$sample.id)] sample1.temp <- rep(ibd$subjectID, length(ibd$subjectID)) sample1.temp <- matrix(sample1.temp, nrow = dim(ibd$k0)[1]) sample2.temp <- t(sample1.temp) ibd$sample1.subjectID <- as.array(sample1.temp) ibd$sample2.subjectID <- as.array(sample2.temp) ibd$s12 <- paste(ibd$sample1.subjectID, ibd$sample2.subjectID) ibd$s21 <- paste(ibd$sample2.subjectID, ibd$sample1.subjectID) ibd$rel <- array("U", length(ibd$s12)) ibd$rel[is.element(ibd$s12, relprs$s12[is.element(relprs$relation, "PO")]) | is.element(ibd$s21, relprs$s12[is.element( relprs$relation, "PO")])] <- "PO" ibd$rel[ibd$sample1.subjectID == ibd$sample2.subjectID] <- "Dup" table(ibd$rel, exclude=NULL) ################################################### ### code chunk number 79: DataCleaning.Rnw:1938-1940 ################################################### ibdPlot(ibd$k0, ibd$k1, relation=ibd$rel, main="HapMap YRI IBD Estimates \nPLINK MOM Approach") ################################################### ### code chunk number 80: DataCleaning.Rnw:1969-1979 ################################################### snp.sel <- snpAnnot$snpID[snpAnnot$missing.n1 < 0.05 & snpAnnot$chromosome < 23] sample.sel <- scanAnnot$scanID[scanAnnot$duplicated == FALSE] gdsobj <- openfn.gds(gdsfile) pca <- snpgdsPCA(gdsobj, sample.id=sample.sel, snp.id=snp.sel) closefn.gds(gdsobj) names(pca) length(pca$eigenval) dim(pca$eigenvect) ################################################### ### code chunk number 81: DataCleaning.Rnw:1985-1993 ################################################### # Calculate the percentage of variance explained # by each principal component. pc.frac <- pca$eigenval/sum(pca$eigenval) lbls <- paste("EV", 1:4, "\n", format(pc.frac[1:4], digits=2), sep="") samp <- pData(scanAnnot)[match(pca$sample.id, scanAnnot$scanID),] cols <- rep(NA, nrow(samp)) cols[samp$race == "CEU"] <- "black" cols[samp$race == "YRI"] <- "red" ################################################### ### code chunk number 82: DataCleaning.Rnw:1998-2000 ################################################### pairs(pca$eigenvect[,1:4], col=cols, labels=lbls, main = "CEU: black, YRI: red") ################################################### ### code chunk number 83: DataCleaning.Rnw:2017-2030 ################################################### par.coord <- pca$eigenvect rangel <- apply(par.coord, 2, function(x) range(x)[1]) rangeh <- apply(par.coord, 2, function(x) range(x)[2]) std.coord <- par.coord for (i in 1:14) std.coord[,i] <- (par.coord[,i] - rangel[i])/(rangeh[i]-rangel[i]) plot(c(0,15), c(0,1), type = 'n', axes = FALSE, ylab = "", xlab = "", main = "Parallel Coordinates Plot CEU: black, YRI: red") for (j in 1:13) for (i in sample(1:nrow(std.coord)) ) lines(c(j,j+1), std.coord[i,c(j,j+1)], col=cols[i], lwd=0.25) axis(1, at = 1:14, labels = paste("PC",1:14, sep = ".")) ################################################### ### code chunk number 84: DataCleaning.Rnw:2038-2039 ################################################### file.remove(gdsfile) ################################################### ### code chunk number 85: DataCleaning.Rnw:2060-2065 ################################################### princomp <- as.data.frame(pca$eigenvect) samples.nodup <- pData(scanAnnot)[!scanAnnot$duplicated,] princomp$scanID <- as.factor(samples.nodup$scanID) princomp$case.ctrl.status <- as.factor(samples.nodup$status) princomp$race <- as.factor(samples.nodup$race) ################################################### ### code chunk number 86: DataCleaning.Rnw:2072-2079 ################################################### pc.percent <- 100 * pca$eigenval[1:32]/sum(pca$eigenval) pc.percent lbls <- paste("EV", 1:3, "\n", format(pc.percent[1:3], digits=2), "%", sep="") table(samples.nodup$status) cols <- rep(NA, nrow(samples.nodup)) cols[samples.nodup$status == 1] <- "green" cols[samples.nodup$status == 0] <- "red" ################################################### ### code chunk number 87: DataCleaning.Rnw:2087-2089 ################################################### pairs(pca$eigenvect[,1:3], col=cols, labels=lbls, main = "First Three EVs by Case-Control Status") ################################################### ### code chunk number 88: DataCleaning.Rnw:2102-2104 ################################################### boxplot(princomp[, 1] ~ princomp$case.ctrl.status, ylab = "PC1", main = "PC1 vs. Case-control Status") ################################################### ### code chunk number 89: DataCleaning.Rnw:2113-2115 ################################################### boxplot(princomp[, 2] ~ princomp$case.ctrl.status, ylab = "PC2", main = "PC2 vs. Case-control Status") ################################################### ### code chunk number 90: DataCleaning.Rnw:2124-2126 ################################################### boxplot(princomp[, 3] ~ princomp$case.ctrl.status, ylab = "PC3", main = "PC3 vs. Case-control Status") ################################################### ### code chunk number 91: DataCleaning.Rnw:2133-2142 ################################################### aov.p1 <- aov(princomp[,1] ~ princomp$race * princomp$case.ctrl.status, princomp) summary(aov.p1) aov.p2 <- aov(princomp[,2] ~ princomp$race * princomp$case.ctrl.status, princomp) summary(aov.p2) aov.p3 <- aov(princomp[,3] ~ princomp$race * princomp$case.ctrl.status, princomp) summary(aov.p3) ################################################### ### code chunk number 92: DataCleaning.Rnw:2163-2165 ################################################### lm.all <- lm(scanAnnot$missing.e1 ~ scanAnnot$status) summary(aov(lm.all)) ################################################### ### code chunk number 93: DataCleaning.Rnw:2170-2172 ################################################### boxplot(scanAnnot$missing.e1 ~ scanAnnot$status, ylab = "Mean missing call rate", main="Mean missing call rate by case status") ################################################### ### code chunk number 94: DataCleaning.Rnw:2197-2211 ################################################### library(GWASTools) library(GWASdata) data(illuminaScanADF) # ScanAnnotationDataFrame scanAnnot <- illuminaScanADF data(illuminaSnpADF) # SnpAnnotationDataFrame snpAnnot <- illuminaSnpADF blfile <- system.file("extdata", "illumina_bl.nc", package="GWASdata") blnc <- NcdfIntensityReader(blfile) blData <- IntensityData(blnc, scanAnnot=scanAnnot, snpAnnot=snpAnnot) genofile <- system.file("extdata", "illumina_geno.nc", package="GWASdata") genonc <- NcdfGenotypeReader(genofile) genoData <- GenotypeData(genonc, scanAnnot=scanAnnot, snpAnnot=snpAnnot) ################################################### ### code chunk number 95: DataCleaning.Rnw:2216-2219 ################################################### baf.sd <- sdByScanChromWindow(blData, genoData, var="BAlleleFreq") med.baf.sd <- medianSdOverAutosomes(baf.sd) low.qual.ids <- med.baf.sd$scanID[med.baf.sd$med.sd > 0.05] ################################################### ### code chunk number 96: DataCleaning.Rnw:2224-2243 ################################################### chrom <- snpAnnot$chromosome pos <- snpAnnot$position build <- getAttribute(blnc, attname="genome_build") if (build == 36) build <- "hg18" if (build == 37) build <- "hg19" hla.df <- get(data(list=paste("HLA", build, sep="."))) hla <- chrom == "6" & pos >= hla.df$start.base & pos <= hla.df$end.base xtr.df <- get(data(list=paste("pseudoautosomal", build, sep="."))) xtr <- chrom == "X" & pos >= xtr.df["X.XTR", "start.base"] & pos <= xtr.df["X.XTR", "end.base"] centromeres <- get(data(list=paste("centromeres", build, sep="."))) gap <- rep(FALSE, nrow(snpAnnot)) for (i in 1:nrow(centromeres)) { ingap <- chrom == centromeres$chrom[i] & pos > centromeres$left.base[i] & pos < centromeres$right.base[i] gap <- gap | ingap } ignore <- snpAnnot$missing.n1 == 1 #ignore includes intensity-only and failed snps snp.exclude <- ignore | hla | xtr | gap snp.ok <- snpAnnot$snpID[!snp.exclude] ################################################### ### code chunk number 97: DataCleaning.Rnw:2249-2254 ################################################### scan.ids <- scanAnnot$scanID[1:10] chrom.ids <- 21:23 baf.seg <- anomSegmentBAF(blData, genoData, scan.ids=scan.ids, chrom.ids=chrom.ids, snp.ids=snp.ok, verbose=FALSE) head(baf.seg) ################################################### ### code chunk number 98: DataCleaning.Rnw:2259-2265 ################################################### baf.anom <- anomFilterBAF(blData, genoData, segments=baf.seg, snp.ids=snp.ok, centromere=centromeres, low.qual.ids=low.qual.ids, verbose=FALSE) names(baf.anom) baf.filt <- baf.anom$filtered head(baf.filt) ################################################### ### code chunk number 99: DataCleaning.Rnw:2280-2286 ################################################### loh.anom <- anomDetectLOH(blData, genoData, scan.ids=scan.ids, chrom.ids=chrom.ids, snp.ids=snp.ok, known.anoms=baf.filt, verbose=FALSE) names(loh.anom) loh.filt <- loh.anom$filtered head(loh.filt) ################################################### ### code chunk number 100: DataCleaning.Rnw:2297-2311 ################################################### # create required data frame baf.filt$method <- "BAF" if (!is.null(loh.filt)) { loh.filt$method <- "LOH" cols <- intersect(names(baf.filt), names(loh.filt)) anoms <- rbind(baf.filt[,cols], loh.filt[,cols]) } else { anoms <- baf.filt } anoms$anom.id <- 1:nrow(anoms) stats <- anomSegStats(blData, genoData, snp.ids=snp.ok, anom=anoms, centromere=centromeres) names(stats) ################################################### ### code chunk number 101: DataCleaning.Rnw:2317-2320 ################################################### snp.not.ok <- snpAnnot$snpID[snp.exclude] anomStatsPlot(blData, genoData, anom.stats=stats[1,], snp.ineligible=snp.not.ok, centromere=centromeres) ################################################### ### code chunk number 102: DataCleaning.Rnw:2333-2335 ################################################### lrr.sd <- sdByScanChromWindow(blData, var="LogRRatio", incl.hom=TRUE) med.lrr.sd <- medianSdOverAutosomes(lrr.sd) ################################################### ### code chunk number 103: DataCleaning.Rnw:2341-2343 ################################################### baf.seg.info <- baf.anom$seg.info loh.seg.info <- loh.anom$base.info[,c("scanID", "chromosome", "num.segs")] ################################################### ### code chunk number 104: DataCleaning.Rnw:2353-2358 ################################################### snpAnnot$eligible <- !snp.exclude baf.low.qual <- anomIdentifyLowQuality(snpAnnot, med.baf.sd, baf.seg.info, sd.thresh=0.1, sng.seg.thresh=0.0008, auto.seg.thresh=0.0001) loh.low.qual <- anomIdentifyLowQuality(snpAnnot, med.lrr.sd, loh.seg.info, sd.thresh=0.25, sng.seg.thresh=0.0048, auto.seg.thresh=0.0006) ################################################### ### code chunk number 105: DataCleaning.Rnw:2363-2365 ################################################### close(blData) close(genoData) ################################################### ### code chunk number 106: DataCleaning.Rnw:2392-2398 ################################################### library(GWASTools) library(GWASdata) data(affyScanADF) # ScanAnnotationDataFrame scanAnnot <- affyScanADF scan.excl <- scanAnnot$scanID[scanAnnot$missing.e1 >= 0.05] length(scan.excl) ################################################### ### code chunk number 107: DataCleaning.Rnw:2409-2444 ################################################### data(affySnpADF) # SnpAnnotationDataFrame snpAnnot <- affySnpADF snp.excl <- snpAnnot$snpID[snpAnnot$missing.n1 == 1] length(snp.excl) genofile <- system.file("extdata", "affy_geno.nc", package="GWASdata") genoNC <- NcdfGenotypeReader(genofile) genoData <- GenotypeData(genoNC, snpAnnot=snpAnnot, scanAnnot=scanAnnot) dupdisc <- duplicateDiscordance(genoData, subjName.col="subjectID", scan.exclude=scan.excl, snp.exclude=snp.excl) names(dupdisc) head(dupdisc$discordance.by.snp) length(dupdisc$discordance.by.subject) dupdisc$discordance.by.subject[[1]] # each entry is a 2x2 matrix, but only one value of each # is important since these are all pairs npair <- length(dupdisc$discordance.by.subject) disc.subj <- rep(NA, npair) subjID <- rep(NA, npair) race <- rep(NA, npair) for (i in 1:npair) { disc.subj[i] <- dupdisc$discordance.by.subject[[i]][1,2] subjID[i] <- names(dupdisc$discordance.by.subject)[i] race[i] <- scanAnnot$race[scanAnnot$subjectID == subjID[i]][1] } dat <- data.frame(subjID=subjID, disc=disc.subj, pop=race, stringsAsFactors=FALSE) summary(dat$disc) # Assign colors for the duplicate samples based on population group. dat$col <- NA dat$col[dat$pop == "CEU"] <- "black" dat$col[dat$pop == "YRI"] <- "red" table(dat$col, exclude=NULL) dat <- dat[order(dat$disc),] dat$rank <- 1:npair ################################################### ### code chunk number 108: DataCleaning.Rnw:2449-2456 ################################################### # Plot the sample discordance rates color coded by ethnicity. plot(dat$disc, dat$rank, xlab="Discordance rate between duplicate samples", ylab="rank", col=dat$col, main="Duplicate Sample Discordance by Continental Ancestry") legend("topleft", unique(dat$pop), pch=rep(1,2), col=unique(dat$col)) ################################################### ### code chunk number 109: DataCleaning.Rnw:2485-2498 ################################################### men.list <- mendelList(scanAnnot$family, scanAnnot$subjectID, scanAnnot$father, scanAnnot$mother, scanAnnot$sex, scanAnnot$scanID) res <- mendelListAsDataFrame(men.list) head(res) dim(res) # Only want to use SNPs with missing.n1 < 0.05 snp.excl <- snpAnnot$snpID[snpAnnot$missing.n1 >= 0.05] length(snp.excl) mend <- mendelErr(genoData, men.list, snp.exclude=snp.excl) names(mend) head(mend$trios) names(mend$snp) ################################################### ### code chunk number 110: DataCleaning.Rnw:2513-2516 ################################################### # Calculate the error rate err <- mend$snp$error.cnt / mend$snp$check.cnt table(err == 0, exclude=NULL) ################################################### ### code chunk number 111: DataCleaning.Rnw:2523-2525 ################################################### plot(err, rank(err), xlab="Error Rate (fraction)", ylab="rank", main="Mendelian Error Rate per SNP, ranked") ################################################### ### code chunk number 112: DataCleaning.Rnw:2535-2550 ################################################### fam <- mend$snp$error.cnt n <- mend$snp$check.cnt fam[is.na(err)] <- NA table(is.na(fam), exclude=NULL) summary(fam) # SNPs with errors length(fam[n > 0 & fam > 0]) # SNPs for which more than one family has an error length(fam[n > 0 & fam > 1]) length(fam[n > 0 & fam > 2]) # Get the SNPs with valid trios for error detection val <- length(fam[n > 0]) noerr <- length(fam[n > 0 & fam == 0]) # Divide to get fraction with no errors noerr / val ################################################### ### code chunk number 113: DataCleaning.Rnw:2557-2573 ################################################### # Write the error rates to the SNP table snp.sel <- match(names(mend$snp$error.cnt), snpAnnot$snpID) snpAnnot$mendel.err.count[snp.sel] <- mend$snp$error.cnt snpAnnot$mendel.err.sampsize[snp.sel] <- mend$snp$check.cnt allequal(snpAnnot$snpID, sort(snpAnnot$snpID)) # The high number of NA values is due to the filtering out of SNPs # before the Mendelian error rate calculation sum(is.na(snpAnnot$mendel.err.count)) sum(is.na(snpAnnot$mendel.err.sampsize)) # Save the updated SNP annotation table with # the new Mendelian error data varMetadata(snpAnnot)["mendel.err.count", "labelDescription"] <- paste("number of Mendelian errors detected in trios averaged over", "multiple combinations of replicate genotyping instances") varMetadata(snpAnnot)["mendel.err.sampsize", "labelDescription"] <- "number of opportunities to detect Mendelian error in trios" ################################################### ### code chunk number 114: DataCleaning.Rnw:2581-2599 ################################################### # Get a vector of SNPs to check snp <- pData(snpAnnot) snp$err.rate <- snp$mendel.err.count / snp$mendel.err.sampsize snp <- snp[order(snp$err.rate, decreasing=TRUE),] snp <- snp[1:3,] xyfile <- system.file("extdata", "affy_qxy.nc", package="GWASdata") xyNC <- NcdfIntensityReader(xyfile) xyData <- IntensityData(xyNC, snpAnnot=snpAnnot, scanAnnot=scanAnnot) pdf(file="DataCleaning-mendel.pdf") par(mfrow = c(3,3)) mtxt <- paste("SNP", snp$rsID, "\nMendelian Error Rate", format(snp$err.rate, digits=5)) genoClusterPlotByBatch(xyData, genoData, snpID=snp$snpID, main.txt=mtxt, plot.type="XY", batchVar="plate", cex.main=0.9) dev.off() close(xyData) ################################################### ### code chunk number 115: DataCleaning.Rnw:2622-2641 ################################################### # Calculate the fraction of SNPs with an error for each trio trios <- mend$trios trios$Mend.err <- trios$Men.err.cnt/trios$Men.cnt summary(trios$Mend.err) # Start by pulling out the vectors needed from `trios' tmp <- trios[, c("fam.id", "Mend.err")]; dim(tmp) # Change fam.id to match the sample annotation column name names(tmp) <- c("family", "Mend.err.rate.fam") # Merge the variables into the sample annotation file scanAnnot$mend.err.rate.fam <- NA for(i in 1:nrow(tmp)) { ind <- which(is.element(scanAnnot$family,tmp$family[i])) scanAnnot$mend.err.rate.fam[ind] <- tmp$Mend.err.rate.fam[i] } allequal(scanAnnot$scanID, sort(scanAnnot$scanID)) head(scanAnnot$mend.err.rate.fam) varMetadata(scanAnnot)["mend.err.rate.fam", "labelDescription"] <- "Mendelian error rate per family" ################################################### ### code chunk number 116: DataCleaning.Rnw:2649-2660 ################################################### # Get the families that have non-NA values for the family # Mendelian error rate fams <- pData(scanAnnot)[!is.na(scanAnnot$mend.err.rate.fam) & !duplicated(scanAnnot$family), c("family", "mend.err.rate.fam", "race")] dim(fams) table(fams$race,exclude=NULL) # Assign colors for the different ethnicities in these families pcol <- rep(NA, nrow(fams)) pcol[fams$race == "CEU"] <- "black" pcol[fams$race == "YRI"] <- "red" ################################################### ### code chunk number 117: DataCleaning.Rnw:2665-2671 ################################################### plot(fams$mend.err.rate.fam*100, rank(fams$mend.err.rate.fam), main="Mendelian Error rate per Family, ranked", xlab="Mendelian error rate per family (percent)", ylab="rank", col=pcol) legend("bottomright", c("CEU", "YRI"), pch=c(1,1), col=c("black", "red")) ################################################### ### code chunk number 118: DataCleaning.Rnw:2697-2706 ################################################### head(pData(scanAnnot)[,c("father", "mother")]) nonfounders <- scanAnnot$father != 0 & scanAnnot$mother != 0 table(nonfounders) scan.excl <- scanAnnot$scanID[scanAnnot$race != "CEU" | nonfounders | scanAnnot$duplicated] length(scan.excl) hwe <- gwasExactHW(genoData, scan.exclude=scan.excl) close(genoData) ################################################### ### code chunk number 119: DataCleaning.Rnw:2717-2728 ################################################### names(hwe) dim(hwe) # Check on sample sizes for autosomes and X chromosome hwe$N <- hwe$nAA + hwe$nAB + hwe$nBB summary(hwe$N[is.element(hwe$chromosome,1:22)]) summary(hwe$N[is.element(hwe$chromosome,23)]) hwe$p.value[1:10] sum(is.na(hwe$p.value[hwe$chromosome == 24])) # XY sum(is.na(hwe$p.value[hwe$chromosome == 23])) # X hwe$MAF[1:10] hwe[1:3, c("nAA", "nAB", "nBB")] ################################################### ### code chunk number 120: DataCleaning.Rnw:2735-2736 ################################################### summary(hwe$f) ################################################### ### code chunk number 121: DataCleaning.Rnw:2741-2743 ################################################### hist(hwe$f, main="Histogram of the Inbreeding Coefficient For CEU Samples", xlab="Inbreeding Coefficient") ################################################### ### code chunk number 122: DataCleaning.Rnw:2749-2752 ################################################### # Check the MAF of those SNPs with f=1 chkf <- hwe[!is.na(hwe$f) & hwe$f==1,]; dim(chkf) summary(chkf$MAF) ################################################### ### code chunk number 123: DataCleaning.Rnw:2759-2773 ################################################### idx <- which(hwe$MAF == 0); length(idx) hwe.0 <- hwe[-idx,]; dim(hwe.0) # Check to makes sure that the correct number of SNPs were removed length(hwe$p.value) - length(hwe.0$p.value) # Only keep the autosomal SNPs for first plot pval <- hwe.0$p.value[is.element(hwe.0$chromosome, 1:22)] length(pval) pval <- pval[!is.na(pval)] length(pval) # X chromosome SNPs for plot 2 pval.x <- hwe.0$p.value[is.element(hwe.0$chromosome, 23)] length(pval.x) pval.x <- pval.x[!is.na(pval.x)] length(pval.x) ################################################### ### code chunk number 124: DataCleaning.Rnw:2775-2786 ################################################### pdf(file = "DataCleaning-hwe.pdf") par(mfrow=c(2,2)) qqPlot(pval=pval, truncate = FALSE, main="Autosomes, all") qqPlot(pval=pval, truncate = TRUE, main="Autosomes, truncated") qqPlot(pval=pval.x, truncate = FALSE, main="X chromosome, all") qqPlot(pval=pval.x, truncate = TRUE, main="X chromosome, truncated") dev.off() ################################################### ### code chunk number 125: DataCleaning.Rnw:2800-2803 ################################################### plot(hwe.0$MAF, -log10(hwe.0$p.value), xlab="Minor Allele Frequency", ylab="-log(p-value)", main="Minor Allele Frequency vs\nP-value") ################################################### ### code chunk number 126: DataCleaning.Rnw:2839-2851 ################################################### library(GWASTools) library(GWASdata) data(affyScanADF) # ScanAnnotationDataFrame scanAnnot <- affyScanADF genofile <- system.file("extdata", "affy_geno.nc", package="GWASdata") genoNC <- NcdfGenotypeReader(genofile) genoData <- GenotypeData(genoNC, scanAnnot=scanAnnot) assoc.file <- "assoc" assocTestRegression(genoData, outcome="status", covar.list=c(""), ivar.list=c(""), model.type="logistic", robust=TRUE, chromosome.set=c(24:26), outfile=assoc.file,) ################################################### ### code chunk number 127: DataCleaning.Rnw:2870-2873 ################################################### assoc <- getobj(paste(assoc.file, ".model.1.additive.chr.24_26.RData", sep="")) names(assoc) ################################################### ### code chunk number 128: DataCleaning.Rnw:2878-2880 ################################################### qqPlot(pval=assoc$model.1.additive.LR.pval.G, truncate=TRUE, main="QQ Plot of Likelihood Ratio Test p-values") ################################################### ### code chunk number 129: DataCleaning.Rnw:2897-2901 ################################################### data(affySnpADF) snpAnnot <- affySnpADF chrom <- getChromosome(snpAnnot) chrom.sel <- chrom %in% 24:26 ################################################### ### code chunk number 130: DataCleaning.Rnw:2906-2909 ################################################### manhattanPlot(assoc$model.1.additive.LR.pval.G, chromosome=chrom[chrom.sel], chrom.labels=c("XY","Y","M")) ################################################### ### code chunk number 131: DataCleaning.Rnw:2929-2947 ################################################### # Identify SNPs with lowest p-values snp <- pData(snpAnnot)[chrom.sel, c("snpID", "rsID")] allequal(snp$snpID, assoc$snpID) snp$pval <- assoc$model.1.additive.LR.pval.G snp <- snp[order(snp$pval),] snp <- snp[1:9,] xyfile <- system.file("extdata", "affy_qxy.nc", package="GWASdata") xyNC <- NcdfIntensityReader(xyfile) xyData <- IntensityData(xyNC, snpAnnot=snpAnnot, scanAnnot=scanAnnot) pdf(file="DataCleaning-cluster.pdf") par(mfrow = c(3,3)) mtxt <- paste("SNP", snp$rsID, "\np =", format(snp$pval, digits=4)) genoClusterPlot(xyData, genoData, snpID=snp$snpID, main.txt=mtxt, plot.type="XY") dev.off() close(xyData) close(genoData) ################################################### ### code chunk number 132: DataCleaning.Rnw:2961-2962 ################################################### unlink(paste(assoc.file, "*.RData", sep=""))