## ----style, echo = FALSE, results = 'asis'------------------------------------------------------------------ BiocStyle::markdown() ## ----global_options, include=FALSE-------------------------------------------------------------------------- knitr::opts_chunk$set(fig.width=10, fig.height=7, warning=FALSE, message=FALSE) options(width=110) ## ----------------------------------------------------------------------------------------------------------- library(MSstats) library(MSstatsConvert) library(data.table) library(ggplot2) ## ----------------------------------------------------------------------------------------------------------- # Spectronaut quant file spectronaut_raw = system.file( "tinytest/raw_data/Spectronaut/spectronaut_quality_input.csv", package = "MSstatsConvert") spectronaut_raw = data.table::fread(spectronaut_raw) # Annotation file annotation = system.file( "tinytest/raw_data/Spectronaut/annotation.csv", package = "MSstatsConvert") annotation = data.table::fread(annotation) # Create temporal run ordering data.table spectronaut_raw[, `R.Run Date (Formatted)` := as.POSIXct( `R.Run Date (Formatted)`, format = "%m/%d/%Y %I:%M:%S %p")] run_order = unique(spectronaut_raw[, c("R.Run Date (Formatted)", "R.FileName")]) run_order = run_order[order(`R.Run Date (Formatted)`)] run_order$Order = 1:nrow(run_order) setnames(run_order, c("R.FileName"), c("Run")) ## ----------------------------------------------------------------------------------------------------------- msstats_input = MSstatsConvert::SpectronauttoMSstatsFormat( spectronaut_raw, annotation, intensity = 'PeakArea', excludedFromQuantificationFilter = TRUE, # Pre-filtering calculateAnomalyScores=TRUE, # Key parameter to use MSstats+ anomalyModelFeatures=c("FGShapeQualityScore(MS2)", "FGShapeQualityScore(MS1)", "EGDeltaRT"), # Quality metrics anomalyModelFeatureTemporal=c( "mean_decrease", "mean_decrease", "dispersion_increase"), # Temporal direction of quality metrics runOrder=run_order, # Temporal order of MS runs n_trees=100, max_depth="auto", # iForest parameter (depth of trees) numberOfCores=1) # Cores to use for parallel processing (Mac or Linux only) head(msstats_input) ## ----------------------------------------------------------------------------------------------------------- health_info = CheckDataHealth(msstats_input) skew_score = health_info[[2]] ggplot(skew_score, aes(x = skew)) + geom_histogram(fill = "#E69F00", color = "black", bins=12) + geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = 1.5) + theme_minimal(base_size = 16) + labs( title="Distribution of skewness of anomaly scores", x = "Pearson's moment coefficient of skewness", y = "Count" ) + theme( axis.text.x = element_text(size = 16), axis.text.y = element_text(size = 16), axis.title.x = element_text(size = 18), axis.title.y = element_text(size = 18) ) ## ----------------------------------------------------------------------------------------------------------- # Extract quality metrics from PSM output quality_join_cols = c("R.FileName", "PG.ProteinGroups", "EG.ModifiedSequence", "FG.Charge", "FG.ShapeQualityScore (MS1)", "FG.ShapeQualityScore (MS2)", "EG.DeltaRT") quality_join_df = spectronaut_raw[, ..quality_join_cols] quality_join_df$EG.ModifiedSequence = stringr::str_replace_all( quality_join_df$EG.ModifiedSequence, "\\_", "") quality_join_df = unique(quality_join_df) # In case there are duplicates (not common) quality_join_df = quality_join_df[ , lapply(.SD, mean, na.rm = TRUE), by = .(R.FileName, PG.ProteinGroups, EG.ModifiedSequence, FG.Charge), .SDcols = c("FG.ShapeQualityScore (MS1)", "FG.ShapeQualityScore (MS2)", "EG.DeltaRT") ] # Extract run order spectronaut_raw[, `R.Run Date (Formatted)` := as.POSIXct( `R.Run Date (Formatted)`, format = "%m/%d/%Y %I:%M:%S %p")] run_order = unique(spectronaut_raw[, c("R.Run Date (Formatted)", "R.FileName")]) run_order = run_order[order(`R.Run Date (Formatted)`)] run_order$Order = 1:nrow(run_order) setnames(run_order, c("R.FileName"), c("Run")) # Run standard MSstats converter without anomaly detection msstats_input = MSstatsConvert::SpectronauttoMSstatsFormat( spectronaut_raw, annotation, intensity = 'PeakArea', excludedFromQuantificationFilter = TRUE, # Pre-filtering calculateAnomalyScores=FALSE) # Turn anomaly detection off # Join in quality metrics msstats_input = merge(as.data.table(msstats_input), as.data.table(quality_join_df), all.x=TRUE, all.y=FALSE, by.x=c("Run", "ProteinName", "PeptideSequence", "PrecursorCharge"), by.y=c("R.FileName", "PG.ProteinGroups", "EG.ModifiedSequence", "FG.Charge")) # Run Anomaly Model separately input = MSstatsConvert::MSstatsAnomalyScores( msstats_input, c("FG.ShapeQualityScore (MS1)", "FG.ShapeQualityScore (MS2)", "EG.DeltaRT"), c("mean_decrease", "mean_decrease", "dispersion_increase"), .5, 100, run_order, 100, "auto", 1) ## ----------------------------------------------------------------------------------------------------------- summarized = dataProcess(msstats_input, normalization=FALSE, # no normalization MBimpute=TRUE, # Use imputation summaryMethod="linear" # Key MSstats+ parameter ) head(summarized$ProteinLevelData) ## ----------------------------------------------------------------------------------------------------------- comparison = matrix(c(-1,1),nrow=1) row.names(comparison) = "Condition2-Condition1" colnames(comparison) = c("Condition1", "Condition2") comparison_result = groupComparison( contrast.matrix = comparison, data=summarized) head(comparison_result$ComparisonResult) ## ----------------------------------------------------------------------------------------------------------- base_msstats_input = MSstatsConvert::SpectronauttoMSstatsFormat( spectronaut_raw, annotation, intensity = 'PeakArea', excludedFromQuantificationFilter = TRUE, # Pre-filtering calculateAnomalyScores=FALSE) # No anomaly model base_summarized = dataProcess(base_msstats_input, normalization=FALSE, # no normalization MBimpute=TRUE, # Use imputation summaryMethod="TMP" # Tukey median polish summarization ) base_comparison_result = groupComparison( contrast.matrix = comparison, data=base_summarized) head(base_comparison_result$ComparisonResult) ## ----------------------------------------------------------------------------------------------------------- comparison_result$ComparisonResult$Model = "MSstats+" base_comparison_result$ComparisonResult$Model = "MSstats" merged_results = rbindlist(list( comparison_result$ComparisonResult, base_comparison_result$ComparisonResult )) ggplot(data=merged_results) + geom_col(aes(x=Protein, y=abs(log2FC), fill=Model), position="dodge") + geom_hline(yintercept=1, linetype="dashed", color="black", linewidth=1) + theme_bw(base_size =16) + labs(title="Log fold change comparison", x="MSstats+", y="Base MSstats") ## ----------------------------------------------------------------------------------------------------------- ggplot(data=merged_results) + geom_col(aes(x=Protein, y=SE, fill=Model), position="dodge") + theme_bw(base_size =16) + labs(title="Standard error comparison", x="MSstats+", y="Base MSstats") ## ----eval=FALSE, include=TRUE------------------------------------------------------------------------------- # library(MSstatsBig) # # # Define file path to data # spectronaut_raw = system.file( # "tinytest/raw_data/Spectronaut/spectronaut_quality_input.csv", # package = "MSstatsConvert") # # # Run MSstatsBig converter # converted_data = bigSpectronauttoMSstatsFormat( # input_file=spectronaut_raw, # output_file_name="output_file.csv", # intensity = "F.PeakHeight", # filter_by_excluded = TRUE, # filter_by_identified = FALSE, # filter_by_qvalue = TRUE, # qvalue_cutoff = 0.01, # filter_unique_peptides = TRUE, # aggregate_psms = TRUE, # filter_few_obs = FALSE, # remove_annotation = FALSE, # calculateAnomalyScores=TRUE, # anomalyModelFeatures=c("FG.ShapeQualityScore (MS1)", # "FG.ShapeQualityScore (MS2)", # "EG.ApexRT"), # backend="arrow") # converted_data = dplyr::collect(converted_data) # # # Prepare info for anomaly model # # Get temporal ordering from input file (can also be done externally) # temporal = fread(spectronaut_raw)[ # , .SD[1], by = .(`R.Run Date (Formatted)`, R.FileName)] # # temporal = temporal[, Run.TimeParsed := as.POSIXct( # `R.Run Date (Formatted)`, format = "%m/%d/%Y %I:%M:%S %p")][ # order(Run.TimeParsed)][ # , !"Run.TimeParsed"] # # temporal$Order = 1:nrow(temporal) # temporal$Run = temporal$R.FileName # temporal = temporal[, c("Run", "Order")] # # # Specify anomaly model parameters # anomalyModelFeatures=c("FG.ShapeQualityScore (MS2)", # "FG.ShapeQualityScore (MS1)", # "EG.ApexRT") # anomalyModelFeatures = MSstatsConvert:::.standardizeColnames(anomalyModelFeatures) # # anomalyModelFeatureTemporal=c("mean_decrease", # "mean_decrease", # "dispersion_increase") # # n_trees=100 # max_depth="auto" # numberOfCores=1 # # # Run anomaly detection model # msstats_input = MSstatsConvert::MSstatsAnomalyScores( # as.data.table(converted_data), anomalyModelFeatures, # anomalyModelFeatureTemporal, .5, 100, temporal, n_trees, # max_depth, numberOfCores) # head(msstats_input) #