MsBackendRawFileReader
BackendMsBackendRawFileReader 1.2.0
## Attach the packages used throughout this document. Start-up messages are
## silenced; a package that cannot be attached aborts with an error.
suppressMessages({
  stopifnot(require(Spectra))
  stopifnot(require(MsBackendRawFileReader))
  stopifnot(require(tartare))
  stopifnot(require(BiocParallel))
})
assemblies aka Common Intermediate Language bytecode
The download and installation can be done on all platforms using the command:
rawrr::installRawFileReaderDLLs()
# Install the RawFileReader DLLs only when the check below returns FALSE.
if (isFALSE(rawrr::.checkDllInMonoPath())){
rawrr::installRawFileReaderDLLs()
}
## removing files in directory '/home/biocbuild/.cache/R/rawrr/rawrrassembly'
## ThermoFisher.CommonCore.Data.dll
## 0
## ThermoFisher.CommonCore.MassPrecisionEstimator.dll
## 0
## ThermoFisher.CommonCore.RawFileReader.dll
## 0
# Install the rawrr executable only when the file reported by
# rawrr:::.rawrrAssembly() does not exist yet.
if (isFALSE(file.exists(rawrr:::.rawrrAssembly()))){
rawrr::installRawrrExe()
}
## MD5 fa6eb2f896d7c893e68a6c5596175e47 /home/biocbuild/.cache/R/rawrr/rawrrassembly/rawrr.exe
## [1] 0
# Look up the 'tartare' records available through ExperimentHub
library(ExperimentHub)
eh <- ExperimentHub::ExperimentHub()
query(eh, "tartare")
## ExperimentHub with 5 records
## # snapshotDate(): 2022-04-19
## # $dataprovider: Functional Genomics Center Zurich (FGCZ)
## # $species: NA
## # $rdataclass: Spectra
## # additional mcols(): taxonomyid, genome, description,
## # coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
## # rdatapath, sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["EH3219"]]'
##
## title
## EH3219 | Q Exactive HF-X mzXML
## EH3220 | Q Exactive HF-X raw
## EH3221 | Fusion Lumos mzXML
## EH3222 | Fusion Lumos raw
## EH4547 | Q Exactive HF Orbitrap raw
The RawFileReader libraries require file names to end with the extension .raw.
## Return `path` with a ".raw" suffix appended, making sure that file exists.
##
## The file is created as a hard link to `path`. file.link() reports failure
## through its return value (FALSE), so that value is checked and a plain copy
## is used as a fallback; if neither works an error is raised here rather
## than later when the backend tries to open the file.
##
## @param path character(1), path of an existing file.
## @return character(1), `path` with ".raw" appended.
.link_as_raw <- function(path) {
  raw_path <- paste0(path, ".raw")
  if (!file.exists(raw_path)) {
    linked <- file.link(path, raw_path)
    if (!isTRUE(linked) && !isTRUE(file.copy(path, raw_path))) {
      stop("could not create '", raw_path, "' from '", path, "'",
           call. = FALSE)
    }
  }
  raw_path
}

EH3220 <- normalizePath(eh[["EH3220"]])
(rawfileEH3220 <- .link_as_raw(EH3220))
## [1] "/var/cache/biocbuild_cache/R/ExperimentHub/29d1dbe636e2_3236.raw"
EH3222 <- normalizePath(eh[["EH3222"]])
(rawfileEH3222 <- .link_as_raw(EH3222))
## [1] "/var/cache/biocbuild_cache/R/ExperimentHub/29d1d474c36e0_3238.raw"
EH4547 <- normalizePath(eh[["EH4547"]])
(rawfileEH4547 <- .link_as_raw(EH4547))
## [1] "/var/cache/biocbuild_cache/R/ExperimentHub/29d1dbf5187d_4590.raw"
Call the constructor
## Initialise one backend object from the three raw files
beRaw <- MsBackendRawFileReader::MsBackendRawFileReader() |>
  Spectra::backendInitialize(
    files = c(rawfileEH3220, rawfileEH3222, rawfileEH4547)
  )
Call the print method
beRaw
## MsBackendRawFileReader with 32497 spectra
## msLevel rtime scanIndex
## <integer> <numeric> <integer>
## 1 1 0.215 1
## 2 1 0.714 2
## 3 1 1.108 3
## 4 1 1.503 4
## 5 1 1.897 5
## ... ... ... ...
## 32493 2 2099.70 21876
## 32494 2 2099.78 21877
## 32495 2 2099.87 21878
## 32496 2 2099.95 21879
## 32497 2 2100.04 21880
## ... 21 more variables/columns.
##
## file(s):
## 29d1dbe636e2_3236.raw
## 29d1d474c36e0_3238.raw
## 29d1dbf5187d_4590.raw
Here we reproduce Figure 2 of the rawrr publication (Kockmann and Panse 2021).
The MsBackendRawFileReader ships with a
filterScan
method using functionality provided by the C# libraries by
Thermo Fisher Scientific (Shofstahl 2016).
# Select the scans with the given scan filter string, keep the 437th of
# them and plot it
S <- filterScan(
  beRaw,
  "FTMS + c NSI Full ms2 487.2567@hcd27.00 [100.0000-1015.0000]"
)[437]
plotSpectra(S)
# supposed to be scanIndex 9594
S
## MsBackendRawFileReader with 1 spectra
## msLevel rtime scanIndex
## <integer> <numeric> <integer>
## 1 2 925.225 9594
## ... 21 more variables/columns.
##
## file(s):
## 29d1dbf5187d_4590.raw
# overlay the first eight y-ions of LGGNEQVTR on the current plot
(yIonSeries <- protViz::fragmentIon("LGGNEQVTR")[[1]]$y[1:8])
## [1] 175.1190 276.1666 375.2350 503.2936 632.3362 746.3791 803.4006 860.4221
names(yIonSeries) <- paste0("y", seq_along(yIonSeries))
abline(v = yIonSeries, col = "#DDDDDD88", lwd = 5)
axis(side = 3, at = yIonSeries, labels = names(yIonSeries))
For demonstration reasons, we extend the MsBackend
class by a filter method.
The filterIons
function returns spectra if and only if all fragment ions,
given as argument, match. We use protViz::findNN
binary search
method for determining the nearest mZ peak for each ion.
If the mass error between an ion and an mz value is less than the given mass
tolerance, an ion is considered a hit.
## Generic: keep only those spectra of `object` that contain a given set of
## ions.
setGeneric("filterIons", function(object, ...) standardGeneric("filterIons"))
## [1] "filterIons"

## filterIons method for MsBackend objects.
##
## A spectrum is kept if and only if every value in `mZ` has a matching peak.
## For each ion the nearest peak is located with protViz::findNN(); it counts
## as a hit when the absolute mass error is below `tol` and the peak intensity
## is at or above the 90% intensity quantile of that spectrum.
##
## @param object an MsBackend object.
## @param mZ numeric, the ion m/z values that all have to be matched.
## @param tol numeric, mass tolerance applied to the absolute mass error.
## @param ... currently unused.
## @return `object` subsetted to the matching spectra.
setMethod("filterIons", "MsBackend",
  function(object, mZ = numeric(), tol = numeric(), ...) {
    keep <- vapply(peaksData(object, BPPARAM = bpparam()),
      FUN = function(x) {
        # a spectrum without peaks cannot match any requested ion
        if (nrow(x) == 0L) {
          return(length(mZ) == 0L)
        }
        NN <- protViz::findNN(mZ, x[, 1])
        # the mass error is signed: without abs() every peak heavier than
        # the ion would pass the tolerance test, however far away it is
        hit <- abs(mZ - x[NN, 1]) < tol & x[NN, 2] >= quantile(x[, 2], .9)
        # isTRUE() maps an NA count (e.g. NA intensities) to "no match"
        isTRUE(sum(hit) == length(mZ))
      },
      FUN.VALUE = logical(1),
      USE.NAMES = FALSE)
    object[keep]
  })
The lines below implement a simple targeted peptide search engine.
The R code snippet takes as input a MsBackendRawFileReader
object
containing 32497 spectra and y-fragment-ion mZ values determined
for LGGNEQVTR++.
# Take wall-clock timestamps immediately before and after the pipeline.
start_time <- Sys.time()
# Restrict to a single scan type, keep the spectra in which all values of
# yIonSeries are matched, and extract the peak data of the remaining spectra.
X <- beRaw |>
MsBackendRawFileReader::filterScan("FTMS + c NSI Full ms2 487.2567@hcd27.00 [100.0000-1015.0000]") |>
filterIons(yIonSeries, tol = 0.005) |>
Spectra::Spectra() |>
Spectra::peaksData()
end_time <- Sys.time()
The defined filterIons
method runs on
995 input spectra and returns 4 spectra.
The runtime is shown below.
end_time - start_time
## Time difference of 4.218687 secs
Next, we define and apply a method for graphing LGGNEQVTR
peptide spectrum
matches. Also, the function returns some statistics of the match.
## Draw a peptide spectrum match for the LGGNEQVTR peptide.
##
## `x` is a peak matrix with m/z values in column 1 and intensities in
## column 2. The peaks are drawn as vertical lines, the y1..y8 ion positions
## are marked, and a data.frame with the nearest peak (m/z and intensity)
## for each y-ion is returned.
.plot.LGGNEQVTR <- function(x) {
  # y1..y8 fragment ions of the peptide, labelled by ion name
  y_ions <- protViz::fragmentIon("LGGNEQVTR")[[1]]$y[1:8]
  names(y_ions) <- paste0("y", seq_along(y_ions))

  plot(x, type = "h", xlim = range(y_ions))
  abline(v = y_ions, col = "#DDDDDD88", lwd = 5)
  axis(side = 3, at = y_ions, labels = names(y_ions))

  # index of the peak closest to each y-ion
  nearest <- protViz::findNN(y_ions, x[, 1])
  data.frame(
    ion = names(y_ions),
    mZ.yIon = y_ions,
    mZ = x[nearest, 1],
    intensity = x[nearest, 2]
  )
}
# NOTE(review): `XC` is not assigned anywhere in the visible code; presumably
# it holds the row-bound data frames returned by .plot.LGGNEQVTR() for each
# peak matrix in `X` — confirm against the full source.
stats::aggregate(mZ ~ ion, data = XC, FUN = base::mean)
## ion mZ
## 1 y1 175.1190
## 2 y2 276.1665
## 3 y3 375.2349
## 4 y4 503.2936
## 5 y5 632.3362
## 6 y6 746.3791
## 7 y7 803.4003
## 8 y8 860.4216
stats::aggregate(intensity ~ ion, data = XC, FUN = base::max)
## ion intensity
## 1 y1 1505214
## 2 y2 2583122
## 3 y3 2364014
## 4 y4 3179124
## 5 y5 2286947
## 6 y6 1236341
## 7 y7 4586484
## 8 y8 12894520
For the sake of demonstration we apply the Spectra::combinePeaks
method and
aggregate the 4 spectra into a single peak matrix.
The statistics returned by .plot.LGGNEQVTR()
should be identical with the
output of the aggregation code snippet above.
# Merge the peak matrices in `X` (10 ppm, maximum intensity per group) and
# plot the combined spectrum
.plot.LGGNEQVTR(
  Spectra::combinePeaks(X, ppm = 10, intensityFun = base::max)
)
## ion mZ.yIon mZ intensity
## y1 y1 175.1190 175.1190 1505214
## y2 y2 276.1666 276.1665 2583122
## y3 y3 375.2350 375.2349 2364014
## y4 y4 503.2936 503.2936 3179124
## y5 y5 632.3362 632.3362 2286947
## y6 y6 746.3791 746.3791 1236341
## y7 y7 803.4006 803.4003 4586484
## y8 y8 860.4221 860.4216 12894520
Below we demonstrate the interaction with the MsBackendMgf package while composing a Mascot Generic Format mgf file which is compatible for conducting an MS/MS Ions Search using Mascot Server (>=2.7) Perkins et al. (1999).
## Translation table from Spectra variable names (vector names) to the
## field names written to the Mascot Generic Format file (vector values).
## NOTE(review): msLevel is written as CHARGE here; precursorCharge looks
## like the natural source for that field — confirm this is intended.
map <- c(
  custom      = "TITLE",
  msLevel     = "CHARGE",
  scanIndex   = "SCANS",
  precursorMz = "PEPMASS",
  rtime       = "RTINSECONDS"
)
## Compose custom TITLE
## The scan index is taken from `beRaw` itself so that every title carries
## the index of its own spectrum, element-wise aligned with
## `beRaw$dataOrigin`. Using the scan index of another, shorter object would
## be recycled across all spectra.
beRaw$custom <- paste0("File: ", beRaw$dataOrigin, " ; SpectrumID: ", beRaw$scanIndex)
(mgf <- tempfile(fileext = '.mgf'))
## [1] "/tmp/RtmpXjpSkS/file8a8f44311eeac.mgf"
# Export the 437th scan of the selected scan type to the mgf file, keeping
# only the spectra variables listed below
filterScan(
  beRaw,
  "FTMS + c NSI Full ms2 487.2567@hcd27.00 [100.0000-1015.0000]"
)[437] |>
  Spectra::Spectra() |>
  Spectra::selectSpectraVariables(
    c("rtime", "precursorMz", "precursorCharge", "msLevel", "scanIndex",
      "custom")
  ) |>
  MsBackendMgf::export(
    backend = MsBackendMgf::MsBackendMgf(),
    file = mgf,
    map = map
  )
readLines(mgf) |> head(12)
## [1] "BEGIN IONS"
## [2] "CHARGE=2+"
## [3] "RTINSECONDS=925.225"
## [4] "SCANS=9594"
## [5] "PEPMASS=487.256713867188"
## [6] "TITLE=File: /var/cache/biocbuild_cache/R/ExperimentHub/29d1dbf5187d_4590.raw ; SpectrumID: 9594"
## [7] "101.071502685547 74105.4765625"
## [8] "102.05549621582 105530.4765625"
## [9] "115.05054473877 158732.1875"
## [10] "115.086776733398 75867.9140625"
## [11] "124.144035339355 45457.22265625"
## [12] "127.050369262695 295541.8125"
readLines(mgf) |> tail()
## [1] "862.427612304688 154045.78125" "870.404846191406 159569.8125"
## [3] "871.395141601563 196302.6875" "880.671020507813 65916"
## [5] "END IONS" ""
To extract all tandem spectra you can use the code snippets below
# Spectra object backed by the single raw file EH4547
S <- Spectra(
  Spectra::backendInitialize(
    MsBackendRawFileReader::MsBackendRawFileReader(),
    files = rawfileEH4547
  )
)
S
## MSn data (Spectra) with 21880 spectra in a MsBackendRawFileReader backend:
## msLevel rtime scanIndex
## <integer> <numeric> <integer>
## 1 1 0.155 1
## 2 2 0.412 2
## 3 2 0.497 3
## 4 2 0.583 4
## 5 2 0.668 5
## ... ... ... ...
## 21876 2 2099.70 21876
## 21877 2 2099.78 21877
## 21878 2 2099.87 21878
## 21879 2 2099.95 21879
## 21880 2 2100.04 21880
## ... 21 more variables/columns.
##
## file(s):
## 29d1dbf5187d_4590.raw
# Write all spectra of `S` to the mgf file using the variable mapping `map`
MsBackendMgf::export(
  S,
  backend = MsBackendMgf::MsBackendMgf(),
  file = mgf,
  map = map
)
Next, we generate an mgf file for each scan type. This is helpful, e.g., for optimizing the search settings of tandem mass spectrometry sequence database search tools such as Comet (Eng, Jahan, and Hoopmann 2012) or Mascot Server (Perkins et al. 1999).
## Regular expressions, one per scan type, that are matched against the
## scan type strings of the spectra. The list names are used as labels.
scanTypePattern <- list(
  EThcD.lowres  = "ITMS.+sa Full ms2.+@etd.+@hcd.+",
  ETciD.lowres  = "ITMS.+sa Full ms2.+@etd.+@cid.+",
  CID.lowres    = "ITMS[^@]+@cid[^@]+$",
  HCD.lowres    = "ITMS[^@]+@hcd[^@]+$",
  EThcD.highres = "FTMS.+sa Full ms2.+@etd.+@hcd.+",
  HCD.highres   = "FTMS[^@]+@hcd[^@]+$"
)
## Initialise the backend from the sample raw file shipped with rawrr.
## A single initialisation is sufficient; repeating the identical call only
## reads the file a second time.
beRaw <- Spectra::backendInitialize(
  MsBackendRawFileReader::MsBackendRawFileReader(),
  files = rawrr::sampleFilePath())
## Custom TITLE: data origin (with the given path prefix removed) plus the
## scan index of each spectrum.
beRaw$custom <- paste0("File: ", gsub("/srv/www/htdocs/Data2San/", "", beRaw$dataOrigin), " ; SpectrumID: ", beRaw$scanIndex)
## Export all scans of `beRaw` whose scan type matches `pattern` to an mgf
## file.
##
## Reads the objects `beRaw` and `map` from the calling environment.
##
## @param ext character(1), label inserted into the output file name
##   (`<raw file base name>.<ext>.mgf`).
## @param pattern character(1), regular expression matched against
##   `beRaw$scanType`.
## @param dir output directory, defaults to the session temp directory.
## @param ... accepted for compatibility, currently unused.
## @return the mgf file path, or NULL when no scan type matches `pattern`.
.generate_mgf <- function(ext, pattern, dir = tempdir(), ...) {
  # strip the extension only at the end of the file name
  stem <- sub("\\.raw$", "", unique(basename(beRaw$dataOrigin)))
  mgf <- file.path(dir, paste0(stem, ".", ext, ".mgf"))

  # full argument names: no reliance on partial matching of `pattern`
  idx <- grepl(pattern = pattern, x = beRaw$scanType)
  if (!any(idx)) return(NULL)

  message(paste0("Extracting ", sum(idx), " ",
                 pattern, " scans\n\t to file ", mgf, " ..."))

  beRaw[which(idx)] |>
    Spectra::Spectra() |>
    Spectra::selectSpectraVariables(c("rtime", "precursorMz",
      "precursorCharge", "msLevel", "scanIndex", "custom")) |>
    MsBackendMgf::export(backend = MsBackendMgf::MsBackendMgf(),
                         file = mgf,
                         map = map)
  mgf
}
#mapply(ext = names(scanTypePattern),
# scanTypePattern,
# FUN = .generate_mgf) |>
# lapply(FUN = function(f){if (file.exists(f)) {readLines(f) |> head()}})
Suppose we want to filter an MS2 peak list recorded on an Orbitrap device so that only the top peak within each 100 Da mass window is kept. The following code snippet demonstrates a solution.
## Define a function that takes a matrix as input and derives
## the top n most intense peaks within a mass window.
## Of note, here, we require centroided data. (no profile mode!)
MsBackendRawFileReader:::.top_n
## function (x, n = 10, mass_window = 100, ...)
## {
## if (nrow(x) < n) {
## return(x)
## }
## idx <- unlist(lapply(seq(0, 2000, by = mass_window), function(mZ) {
## i <- which((mZ < x[, 1] & x[, 1] <= mZ + mass_window))
## r <- i[order(x[, 2][i], decreasing = TRUE)]
## if (length(x[, 2]) > length(i))
## return(r[1:n])
## return(r)
## }, ...))
## x[sort(idx[!is.na(idx)]), ]
## }
## <bytecode: 0x563d67aa6820>
## <environment: namespace:MsBackendRawFileReader>
We add our custom code to the processing queue of the Spectra object.
Of note, we use n = 1 here;
in practice, n = 10
for a 100 Da mass window seems to be a practical choice.
S_2 <- Spectra::addProcessing(S, MsBackendRawFileReader:::.top_n, n = 1)
The plot below displays a visual control of the custom filter function top_n.
On the top is the original spectrum, and the filtered one on the bottom. A point indicates peaks that match.
Spectra::plotSpectraMirror(S[9594], S_2[9594], ppm = 50)
The next snippet prints the values of the filtered peaklist and the mZ values of the y-ions.
S_2[9594] |> mz() |> unlist()
## [1] 171.1129 276.1667 375.2351 486.2656 503.2942 632.3369 746.3797 860.4223
yIonSeries
## y1 y2 y3 y4 y5 y6 y7 y8
## 175.1190 276.1666 375.2350 503.2936 632.3362 746.3791 803.4006 860.4221
When reading spectra the
MsBackendRawFileReader:::.RawFileReader_read_peaks
method is calling the
rawrr::readSpectrum
method.
The figure below displays the time performance for reading a single spectrum as a function of the chunk size (how many spectra are read in one function call) for different overall numbers of spectra read.
## Benchmark results shipped with the package (read from extdata/specs.csv)
ioBm <- file.path(system.file(package = 'MsBackendRawFileReader'),
                  'extdata', 'specs.csv') |>
  read.csv2(header = TRUE)
# perform and include a local IO benchmark
ioBmLocal <- ioBenchmark(1000, c(32, 64, 128, 256), rawfile = rawfileEH4547)
## One panel per value of `n`, one series per host. The grouping argument is
## spelled out in full (`groups`) instead of relying on partial argument
## matching.
lattice::xyplot((1 / as.numeric(time)) * workers ~ size | factor(n),
                groups = host,
                data = rbind(ioBm, ioBmLocal),
                horizontal = FALSE,
                scales = list(y = list(log = 10)),
                auto.key = TRUE,
                layout = c(3, 1),
                ylab = 'spectra read in one second',
                xlab = 'number of spectra / file')
We compare the output of the Thermo Fisher Scientific raw files versus
their corresponding mzXML files using Spectra::MsBackendMzR
relying on the
mzR package.
mzXMLEH3219 <- normalizePath(eh[["EH3219"]])
## see ?tartare and browseVignettes('tartare') for documentation
## loading from cache
mzXMLEH3221 <- normalizePath(eh[["EH3221"]])
## see ?tartare and browseVignettes('tartare') for documentation
## loading from cache
## Compare summed intensities of the first 100 spectra read from the mzXML
## file and from the corresponding raw file. Runs only when mzR is available.
## Note that `beRaw` is re-assigned here.
if (require(mzR)){
  beMzXML <- Spectra::backendInitialize(
    Spectra::MsBackendMzR(),
    files = c(mzXMLEH3219))
  beRaw <- Spectra::backendInitialize(
    MsBackendRawFileReader::MsBackendRawFileReader(),
    files = c(rawfileEH3220))
  # vapply() guarantees one numeric value per spectrum, whatever the input
  intensity.xml <- vapply(intensity(beMzXML[1:100]), sum, numeric(1))
  intensity.raw <- vapply(intensity(beRaw[1:100]), sum, numeric(1))
  plot(intensity.xml ~ intensity.raw, log = 'xy', asp = 1,
       pch = 16, col = rgb(0.5, 0.5, 0.5, alpha = 0.5), cex = 2)
  # linear fit of mzXML against raw summed intensities
  abline(lm(intensity.xml ~ intensity.raw),
         col = 'red')
}
Are all scans of the raw file in the mzXML file?
if (require(mzR)){
table(scanIndex(beRaw) %in% scanIndex(beMzXML))
}
##
## FALSE TRUE
## 112 1764
sessionInfo()
## R version 4.2.0 RC (2022-04-19 r82224)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.15-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.15-bioc/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] mzR_2.30.0 Rcpp_1.0.8.3
## [3] tartare_1.9.0 ExperimentHub_2.4.0
## [5] AnnotationHub_3.4.0 BiocFileCache_2.4.0
## [7] dbplyr_2.1.1 MsBackendRawFileReader_1.2.0
## [9] Spectra_1.6.0 ProtGenerics_1.28.0
## [11] BiocParallel_1.30.0 S4Vectors_0.34.0
## [13] BiocGenerics_0.42.0 BiocStyle_2.24.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-7 fs_1.5.2
## [3] bit64_4.0.5 filelock_1.0.2
## [5] httr_1.4.2 GenomeInfoDb_1.32.0
## [7] tools_4.2.0 bslib_0.3.1
## [9] utf8_1.2.2 R6_2.5.1
## [11] DBI_1.1.2 withr_2.5.0
## [13] tidyselect_1.1.2 bit_4.0.4
## [15] curl_4.3.2 compiler_4.2.0
## [17] cli_3.3.0 Biobase_2.56.0
## [19] bookdown_0.26 sass_0.4.1
## [21] rappdirs_0.3.3 stringr_1.4.0
## [23] digest_0.6.29 rmarkdown_2.14
## [25] XVector_0.36.0 pkgconfig_2.0.3
## [27] htmltools_0.5.2 fastmap_1.1.0
## [29] highr_0.9 rlang_1.0.2
## [31] RSQLite_2.2.12 shiny_1.7.1
## [33] jquerylib_0.1.4 generics_0.1.2
## [35] jsonlite_1.8.0 dplyr_1.0.8
## [37] RCurl_1.98-1.6 magrittr_2.0.3
## [39] GenomeInfoDbData_1.2.8 fansi_1.0.3
## [41] MsCoreUtils_1.8.0 lifecycle_1.0.1
## [43] stringi_1.7.6 yaml_2.3.5
## [45] MASS_7.3-57 zlibbioc_1.42.0
## [47] grid_4.2.0 blob_1.2.3
## [49] parallel_4.2.0 promises_1.2.0.1
## [51] crayon_1.5.1 lattice_0.20-45
## [53] Biostrings_2.64.0 KEGGREST_1.36.0
## [55] magick_2.7.3 knitr_1.38
## [57] pillar_1.7.0 codetools_0.2-18
## [59] rawrr_1.4.0 glue_1.6.2
## [61] BiocVersion_3.15.2 evaluate_0.15
## [63] BiocManager_1.30.17 png_0.1-7
## [65] vctrs_0.4.1 httpuv_1.6.5
## [67] purrr_0.3.4 clue_0.3-60
## [69] assertthat_0.2.1 MsBackendMgf_1.4.0
## [71] cachem_1.0.6 xfun_0.30
## [73] mime_0.12 protViz_0.7.3
## [75] xtable_1.8-4 later_1.3.0
## [77] ncdf4_1.19 tibble_3.1.6
## [79] AnnotationDbi_1.58.0 memoise_2.0.1
## [81] IRanges_2.30.0 cluster_2.1.3
## [83] ellipsis_0.3.2 interactiveDisplayBase_1.34.0
Eng, Jimmy K., Tahmina A. Jahan, and Michael R. Hoopmann. 2012. “Comet: An Open-Source MS/MS Sequence Database Search Tool.” PROTEOMICS 13 (1): 22–24. https://doi.org/10.1002/pmic.201200439.
Kockmann, Tobias, and Christian Panse. 2021. “The Rawrr R Package: Direct Access to Orbitrap Data and Beyond.” Journal of Proteome Research. https://doi.org/10.1021/acs.jproteome.0c00866.
Perkins, David N., Darryl J. C. Pappin, David M. Creasy, and John S. Cottrell. 1999. “Probability-Based Protein Identification by Searching Sequence Databases Using Mass Spectrometry Data.” Electrophoresis 20 (18): 3551–67. https://doi.org/10.1002/(sici)1522-2683(19991201)20:18<3551::aid-elps3551>3.0.co;2-2.
Shofstahl, Jim. 2016. “New Rawfilereader from Thermo Fisher Scientific.” 2016. https://planetorbitrap.com/rawfilereader.