## ----load_libraries----------------------------------------------------------- suppressPackageStartupMessages({ library("QFeatures") library("dplyr") library("tidyr") library("ggplot2") library("msqrob2") library("stringr") library("ExploreModelMatrix") library("kableExtra") library("ComplexHeatmap") library("scater") }) ## ----file_location------------------------------------------------------------ precursorFile <- "https://github.com/statOmics/msqrob2data/raw/refs/heads/main/dia/spikeinStaes/spikein248-staesetal2024.parquet" ## ----import_data-------------------------------------------------------------- precursors <- arrow::read_parquet(precursorFile) # function from the arrow package ## For older versions of Spectronaut, where the results are stored as tsv files. ## Note that the precursorFile then would point to "report.tsv" ## precursors <- data.table::fread(precursorFile, check.names = TRUE, integer64 = "double") knitr::kable(head(precursors)) ## ----select_variables--------------------------------------------------------- precursors <- precursors |> select( Run, Precursor.Id, Modified.Sequence, Stripped.Sequence, Precursor.Charge, Protein.Group, Protein.Names, Protein.Ids, Genes, Precursor.Quantity, Precursor.Normalised, Normalisation.Factor, Ms1.Area, Ms1.Normalised, PG.MaxLFQ, Q.Value, Lib.Q.Value, PG.Q.Value, Lib.PG.Q.Value, Proteotypic, Decoy, # Not available in older versions of DIA-NN RT) ## ----add_species_info--------------------------------------------------------- precursors <- precursors |> mutate(species = grepl(pattern = "UPS",Protein.Group) |> as.factor() |> recode("TRUE"="ups","FALSE" = "yeast")) precursors |> pull(species) |> table() ## ----check_intensity_distribution--------------------------------------------- precursors |> ggplot(aes(x = log2(Precursor.Quantity))) + geom_density() + theme_minimal() ## ----create_metadata---------------------------------------------------------- (annot <- data.frame(runCol = precursors |> pull(Run) |> unique() # 1. ) |> mutate(sampleId = gsub(x = runCol, pattern = "_DIA", replacement = "") |> #2.a str_split("UPS2_") |> #2.b sapply(`[`, 2) #2.c ) |> mutate( condition = gsub("ratio", "", sampleId) |> #3.a str_split("_") |> #3.b sapply(`[`, 1) |> #3.c as.numeric() |> #3.d as.factor(), #3.e rep = sampleId |> str_split("_") |> #4.a sapply(`[`, 2) |> #4.b replace_na(replace = "1") |> #4.c as.factor(), #4.d ratio = condition |> as.character() |> as.double() ) ) ## ----convert_to_QFeatures----------------------------------------------------- (qf <- readQFeatures(assayData = precursors, colData = annot, quantCols = "Precursor.Quantity", runCol = "Run", fnames = "Precursor.Id")) ## ----convert0_to_NA----------------------------------------------------------- qf <- zeroIsNA(qf, names(qf)) ## ----qvalue_filtering--------------------------------------------------------- qf <- qf |> filterFeatures(~ Q.Value <= 0.01 & #1. PG.Q.Value <= 0.01 & #1. Lib.Q.Value <= 0.01 & #1. Lib.PG.Q.Value <= 0.01 & #1. Precursor.Id != "" & #2. Decoy == 0 & #3. (Note, that this filter is not available with previous versions of DIA-NN, as the report.tsv file did not include a Decoy column. So other strategies are needed if Decoys are in the output file.) Proteotypic == 1) #4. ## ----assay_joining------------------------------------------------------------ (qf <- joinAssays( x = qf, i = names(qf), fcol = "Precursor.Id", name = "precursors" )) ## ----filter_features_with_many_NA--------------------------------------------- nObs <- 4 n <- ncol(qf[["precursors"]]) (qf <- filterNA(qf, i = "precursors", pNA = (n - nObs) / n)) ## ----filter_one_hit_wonders--------------------------------------------------- # Filter for peptides per protein pepsPerProtDf <- qf[["precursors"]] |> rowData() |> data.frame() |> dplyr::select("Stripped.Sequence", "Protein.Group") |> group_by(Protein.Group) |> mutate(pepsPerProt = Stripped.Sequence |> unique() |> length() ) #1. rowData(qf[["precursors"]])$pepsPerProt <- pepsPerProtDf$pepsPerProt #2. qf <- filterFeatures(qf, ~ pepsPerProt > 1, keep = TRUE) #3. ## ----log_transformation------------------------------------------------------- qf <- logTransform(qf, base = 2, i = "precursors", name = "precursors_log") ## ----normalisation_needed----------------------------------------------------- qf[, , "precursors_log"] |> #1. longForm(colvars = c( "rep", "condition")) |> #2. data.frame() |> filter(!is.na(value)) |> ggplot() + #3. aes(x = value, colour = condition, group = colname) + geom_density() + theme_minimal() ## ----normalize_peptides------------------------------------------------------- qf <- sweep( #4. Subtract log2 norm factor column-by-column (MARGIN = 2) qf, MARGIN = 2, STATS = nfLogMedian(qf,"precursors_log"), i = "precursors_log", name = "precursors_norm" ) ## ----assess_normalisation----------------------------------------------------- qf[, , "precursors_norm"] |> longForm(colvars = colnames(colData(qf))) |> data.frame() |> filter(!is.na(value)) |> ggplot() + aes(x = value, colour = condition, group = colname) + geom_density() + labs(subtitle = "Normalised log2 precursor intensities") + theme_minimal() ## ----summarize_precursors_to_proteins, warning=FALSE-------------------------- (qf <- aggregateFeatures( qf, i = "precursors_norm", name = "proteins", fcol = "Protein.Group", fun = function(X) iq::maxLFQ(X)$estimate #fun = MsCoreUtils::medianPolish, na.rm = TRUE )) ## ----qc_normalisation_peptides------------------------------------------------ qf[, , "precursors_norm"] |> longForm(colvars = colnames(colData(qf))) |> data.frame() |> filter(!is.na(value)) |> ggplot() + aes(x = value, colour = condition, group = colname) + geom_density() + theme_minimal() + labs(subtitle = "Normalised log2 precursor intensities") ## ----qc_normalisation_peptides_boxplot---------------------------------------- qf[, , "precursors_norm"] |> longForm(colvars = colnames(colData(qf))) |> data.frame() |> filter(!is.na(value)) |> ggplot() + aes(x = sampleId, y = value, colour = condition, group = colname) + xlab("sample") + geom_boxplot() + theme_minimal() + labs(subtitle = "Normalised log2 precursor intensities") ## ----qc_normalisation_proteins------------------------------------------------ qf[, , "proteins"] |> longForm(colvars = colnames(colData(qf))) |> data.frame() |> filter(!is.na(value)) |> ggplot() + aes(x = value, colour = condition, group = colname) + geom_density() + theme_minimal() + labs(subtitle = "Normalised log2 protein intensities") ## ----qc_normalisation_proteins_boxplot---------------------------------------- qf[, , "proteins"] |> longForm(colvars = colnames(colData(qf))) |> data.frame() |> filter(!is.na(value)) |> ggplot() + aes(x = sampleId, y = value, colour = condition, group = colname) + xlab("sample") + geom_boxplot() + theme_minimal() + labs(subtitle = "Normalised log2 protein intensities") ## ----qc_charge_state---------------------------------------------------------- qf[, , "precursors_norm"] |> longForm(colvars = colnames(colData(qf)), rowvars = "Precursor.Charge") |> as.data.frame() |> filter(!is.na(value)) |> filter(Precursor.Charge<=4) |> ggplot(aes(x = sampleId)) + geom_bar(aes(fill = factor(Precursor.Charge, levels = 4:1)), colour = "black") + labs(title = "Peptide types per sample", x = "Sample", fill = "Charge state") + theme_bw() ## ----qc_identifications------------------------------------------------------- qf[,,"precursors_norm"] |> longForm(colvars = colnames(colData(qf)), rowvars= c("Precursor.Id", "Protein.Group", "Stripped.Sequence", "Precursor.Charge")) |> data.frame() |> filter(!is.na(value)) |> mutate(cond_rep = paste0("ratio", condition, "_", rep)) |> group_by(condition, cond_rep) |> summarise(Precursors = length(unique(Precursor.Id)), `Protein Groups` = length(unique(Protein.Group))) |> pivot_longer(-(1:2), names_to = "Feature", values_to = "IDs") |> ggplot(aes(x = cond_rep, y = IDs, fill = condition)) + geom_col() + #scale_fill_observable() + facet_wrap(~Feature, scales = "free_y") + labs(title = "Precursor and protein group identificiations per sample", x = "Sample", y = "Identifications") + theme_bw() + theme(axis.text.x = element_text(angle = 90)) ## ----qc_mds_proteins---------------------------------------------------------- getWithColData(qf, "proteins") |> as("SingleCellExperiment") |> runMDS(exprs_values = 1) |> plotMDS(colour_by = "condition", shape_by = "rep", point_size = 3) ## ----qc_correlation_proteins-------------------------------------------------- corMat <- qf[["proteins"]] |> assay() |> cor(method = "spearman", use = "pairwise.complete.obs") colnames(corMat) <- qf$sampleId rownames(corMat) <- qf$sampleId corMat |> ggcorrplot::ggcorrplot() + scale_fill_viridis_c() + labs(title = "Correlation matrix", fill = "Correlation") + theme(axis.text.x = element_text(angle = 90)) ## ----define_model------------------------------------------------------------- model <- ~ condition ## ----fit_models, warning=FALSE------------------------------------------------ qf <- msqrob( qf, i = "proteins", formula = model, robust = TRUE) ## ----print_models------------------------------------------------------------- models <- rowData(qf[["proteins"]])[["msqrobModels"]] models[1:3] ## ----explore_model_matrix----------------------------------------------------- vd <- ExploreModelMatrix::VisualizeDesign( sampleData = colData(qf), designFormula = model, textSizeFitted = 4 ) vd$plotlist ## ----define_H0---------------------------------------------------------------- (allHypotheses <- createPairwiseContrasts( model, colData(qf), "condition" ) ) ## ----make_contrasts----------------------------------------------------------- (L <- makeContrast( allHypotheses, parameterNames = colnames(vd$designmatrix) )) ## ----hypthesis_tests---------------------------------------------------------- qf <- hypothesisTest(qf, i = "proteins", contrast = L) ## ----show_tables_qf----------------------------------------------------------- qf[["proteins"]] |> rowData() |> names() ## ----collect_results---------------------------------------------------------- inferences <- msqrobCollect(qf[["proteins"]], L) head(inferences) ## ----add_species_info_results------------------------------------------------- inferences <- inferences |> mutate(species = rep(rowData(qf[["proteins"]])$species, ncol(L))) ## ----significant_tables, results='asis'--------------------------------------- alpha <- 0.05 #1. for (contrasti in colnames(L)) #2. { sigList <- inferences |> filter(contrast == contrasti & adjPval < alpha) #3. cat("**Contrast:**", contrasti, "= 0 (", nrow(sigList), " significant proteins)\n\n") #4. print(kable(sigList) |> kable_styling(full_width = FALSE) |> scroll_box(height = "250px") ) #4. cat("\n\n\n---\n\n") #4. } ## ----volcanoplots------------------------------------------------------------- volcanoplots <- inferences |> plotVolcano() + facet_wrap(~contrast) volcanoplots ## ----heatmaps----------------------------------------------------------------- heatmaps <- lapply( colnames(L), #1. function(contrast, se, alpha) { sig <- rowData(se)[[contrast]] |> #2. filter(adjPval < alpha) |> rownames() quants <- t(scale(t(assay(se[sig,])))) #3 colnames(quants) <- paste0("con", se$condition,"_rep",se$rep) #4. annotations <- columnAnnotation(condition = se$condition) #4. set.seed(1234) ## annotation colours are randomly generated by default return( #.5 Heatmap( quants, name = "log2 intensity", top_annotation = annotations, column_title = paste0(contrast, " = 0") ) ) }, se = getWithColData(qf, "proteins"), #6. alpha = alpha) #7. heatmaps ## ----define_alpha------------------------------------------------------------- alpha <- 0.05 ## ----real_logFC--------------------------------------------------------------- (realLogFC <- data.frame( logFC = colData(qf)$condition |> levels() |> # 1. as.double() |> # 2. log2() |> # 3. combn(m=2, FUN = diff), #4. contrast = colData(qf)$condition |> levels() |> combn(m=2, FUN= function(x) paste0(x[2]," - ",x[1])) |> recode("4 - 2" = "condition4", "8 - 2" = "condition8", "8 - 4" = "condition8 - condition4") ) ) ## ----assess_logFC------------------------------------------------------------- logFC <- inferences |> filter(!is.na(logFC)) |> ggplot(aes(x = species, y = logFC, color= species)) + #1. geom_boxplot() + #2. theme_bw() + scale_color_manual( values = c("grey20", "firebrick"), #3. name = "", labels = c("yeast", "ups") ) + geom_hline(yintercept = 0, color="grey20") + # 4. facet_wrap(~contrast) + geom_hline(aes(yintercept = logFC), color="firebrick", data=realLogFC) #5. logFC ## ----assess_TP_FP_FDP--------------------------------------------------------- tpFpTable <- group_by(inferences, contrast) |> filter(adjPval < alpha) |> summarise("TP" = sum(species == "ups"), "FP" = sum(species != "ups"), "FDP" = mean(species != "ups")) tpFpTable ## ----FDR_TPR_functions-------------------------------------------------------- computeFDP <- function(pval, tp) { ord <- order(pval) fdp <- cumsum(!tp[ord]) / 1:length(tp) fdp[order(ord)] } computeTPR <- function(pval, tp, nTP = NULL) { if (is.null(nTP)) nTP <- sum(tp) ord <- order(pval) tpr <- cumsum(tp[ord]) / nTP tpr[order(ord)] } ## ----calculate_performance---------------------------------------------------- performance <- inferences |> group_by(contrast) |> na.exclude() |> mutate(tpr = computeTPR(pval, species == "ups"), fdp = computeFDP(pval, species == "ups")) |> arrange(fdp) ## ----calculate_workpoints----------------------------------------------------- workPoints <- performance |> group_by(contrast) |> filter(adjPval < 0.05) |> slice_max(pval) ## ----plot_tpr_fdp------------------------------------------------------------- ggplot(performance) + aes( y = fdp, x = tpr, ) + geom_line() + geom_point(data = workPoints, size = 3) + geom_hline(yintercept = 0.05, linetype = 2) + facet_wrap( ~ contrast) + coord_flip(ylim = c(0, 0.2)) + theme_minimal() ## ----volcanoplots_with_groundtruth-------------------------------------------- inferences |> plotVolcano() + facet_wrap(~contrast) + aes(color = species, shape = adjPval < alpha) + scale_color_manual( values = c("grey20", "firebrick"), name = "species", labels = c("yeast", "ups") ) + labs(shape="FDR < 0.05") + geom_vline(aes(xintercept = logFC), data = realLogFC , col ="firebrick") + geom_hline(aes(yintercept = -log10(adjAlpha)), data = inferences |> group_by(contrast) |> summarize(adjAlpha = alpha *mean(adjPval < alpha, na.rm=TRUE), .groups="drop")) ## ----sessionInfo-------------------------------------------------------------- sessionInfo()