## ---- include = FALSE--------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup-------------------------------------------------------------------- library(SNPfiltR) library(vcfR) ## ----------------------------------------------------------------------------- #load the example vcfR object data(vcfR.example) ### check the metadata present in your vcf vcfR.example vcfR.example@fix[1:10,1:8] vcfR.example@gt[1:10,1:2] #Load the example popmap file. It is a standard two column popmap, where the first column must be named 'id' and contain individual sample identifiers matching the sample identifiers in the vcf file, and the second column must be named 'pop', and contain a population assignment for each sample. data(popmap) popmap ## ---- fig.height= 4, fig.width=6---------------------------------------------- #generate exploratory visualizations of depth and genotype quality for all called genotypes #hard_filter(vcfR=vcfR.example) #hard filter to minimum depth of 5, and minimum genotype quality of 30 vcfR<-hard_filter(vcfR=vcfR.example, depth = 5, gq = 30) ## ---- fig.height= 3, fig.width=4---------------------------------------------- #execute allele balance filter vcfR<-filter_allele_balance(vcfR) ## ---- fig.height= 3, fig.width=4---------------------------------------------- #visualize and pick appropriate max depth cutoff #max_depth(vcfR) #not running here to save space on visualizations #filter vcf by the max depth cutoff you chose vcfR<-max_depth(vcfR, maxdepth = 100) #check vcfR to see how many SNPs we have left vcfR ## ---- fig.height= 3, fig.width=4---------------------------------------------- #run function to visualize samples and return informative data.frame object miss<-missing_by_sample(vcfR=vcfR) #run function to drop samples above the threshold we want from the vcf #here I am setting a relatively lax cutoff vcfR<-missing_by_sample(vcfR=vcfR, cutoff = .9) #remove invariant sites generated by sample trimming and genotype filtering vcfR<-min_mac(vcfR, min.mac = 1) #update popmap by removing samples that have been filtered out popmap<-popmap[popmap$id %in% colnames(vcfR@gt)[-1],] ## ---- fig.height= 3, fig.width=4---------------------------------------------- #visualize missing data by SNP and the effect of various cutoffs on the missingness of each sample missing_by_snp(vcfR) ## ----------------------------------------------------------------------------- #assess missing data effects on clustering assess_missing_data_pca(vcfR = vcfR, popmap = popmap, thresholds = c(.8), clustering = FALSE) assess_missing_data_tsne(vcfR = vcfR, popmap = popmap, thresholds = c(.8), clustering = FALSE) ## ---- fig.height= 3, fig.width=4---------------------------------------------- #show me the samples with the most missing data at an 80% completeness threshold filt<-miss$missing.by.filter[miss$missing.by.filter$filt == .8,] filt[order(filt$snps.retained),] #drop the three samples with an excess of missing data at an 80% SNP completeness threshold vcfR<- vcfR[,colnames(vcfR@gt) != "A_coerulescens_396263" & colnames(vcfR@gt) != "A_woodhouseii_334134" & colnames(vcfR@gt) != "A_coerulescens_396256"] #remove invariant SNPs vcfR<-min_mac(vcfR, min.mac = 1) vcfR #update popmap by removing samples that have been filtered out popmap<-popmap[popmap$id %in% colnames(vcfR@gt)[-1],] ## ---- fig.height= 3, fig.width=4---------------------------------------------- #visualize missing data at various completeness thresholds missing_by_snp(vcfR) #all samples look good at most thresholds, because of the small size of this dataset, I will choose a 60% completeness threshold in order to retain as many SNPs as possible #filter vcfR vcfR<-missing_by_snp(vcfR, cutoff = .6) ## ----------------------------------------------------------------------------- #remove singletons (loci with only a single variant allele which have no phylogenetic signal) vcfR<-min_mac(vcfR = vcfR, min.mac = 2) #linkage filter vcf to thin SNPs to one per 500bp vcfR<-distance_thin(vcfR, min.distance = 500) #look at final stats for our filtered vcf file vcfR ## ----------------------------------------------------------------------------- #write out vcf #vcfR::write.vcf(vcfR, file = "~/Downloads/scrub.jay.example.filtered.vcf.gz")