specL 1.12.0
!!!Caution work in progress!!!
Function optimizes Extraction windows so we have the same number of precursor per window. To do it uses spectral library or nonredundant blib.
specL contains a function specL::cdsw.
## Loading required package: DBI## Loading required package: protViz## Loading required package: RSQLite## Loading required package: seqinr## 
## Attaching package: 'specL'## The following objects are masked from 'package:protViz':
## 
##     plot.psm, plot.psmSet, summary.psmSetquantile# moves the windows start and end to regions where no peaks are observed
.makenewfromto <- function( windfrom, empty , isfrom=TRUE){
  newfrom <- NULL
  for(from in windfrom){
    idx <- which.min(abs(from - empty))
    startmass <- 0
    if(isfrom){
      if(empty[idx] < from) {
        startmass <- empty[idx]
      } else {
        startmass <- empty[idx-1]
      }
    }else{
      if(empty[idx] > from) {
        startmass <- empty[idx]
      } else {
        startmass <- empty[idx+1]
      }
    }
    newfrom <- c(newfrom, round(startmass,digits=1))
  }
  return(newfrom)
}
.cdsw_compute_breaks <- 
  function(xx, nbins){
    q <- quantile(xx, seq(0, 1, length = nbins + 1))
    q[1] <- q[1] - 0.5
    q[length(q)] <- q[length(q)] + 0.5
    q <- round(q)
  }
cdsw <-
  function(x, massrange = c(300,1250), n = 20, overlap = 1.0, FUN, ...) {
    if (class(x) == "psmSet") {
      x <- unlist(lapply(x, function(x) {
        x$pepmass
      }))
    } else if (class(x) == 'specLSet') {
      x <- unlist(lapply(x@ionlibrary, function(xx) {
        xx@q1
      }))
    }
    # x should be numeric
    if (class(x) != "numeric") {
      warning("can not compute quantils. 'x' is not numeric.")
      return (NULL)
    }
    
    x <- x[x > massrange[1] & x < massrange[2]]
    
    q <- FUN(xx=x, nbins=n)
    
    idx <- 1:n
    from <- q[idx] - overlap * 0.5
    to <- q[idx + 1] + overlap * 0.5
    width <- 0.5 * (to - from)
    mid <- from + width
    h <- hist(x, breaks = q, ...)
    data.frame(from, to, mid, width, counts = h$counts)
    
    
  }cdsw(exampledata, 
     freq=TRUE, 
     overlap = 0, 
     main = "peptideStd", xlab='pepmass', FUN=.cdsw_compute_breaks)##     from   to    mid width counts
## 0%   301  384  342.5  41.5    586
## 5%   384  420  402.0  18.0    591
## 10%  420  449  434.5  14.5    583
## 15%  449  476  462.5  13.5    599
## 20%  476  500  488.0  12.0    565
## 25%  500  524  512.0  12.0    604
## 30%  524  547  535.5  11.5    566
## 35%  547  572  559.5  12.5    598
## 40%  572  598  585.0  13.0    591
## 45%  598  625  611.5  13.5    577
## 50%  625  651  638.0  13.0    603
## 55%  651  682  666.5  15.5    575
## 60%  682  713  697.5  15.5    594
## 65%  713  745  729.0  16.0    574
## 70%  745  783  764.0  19.0    595
## 75%  783  825  804.0  21.0    572
## 80%  825  881  853.0  28.0    592
## 85%  881  948  914.5  33.5    583
## 90%  948 1045  996.5  48.5    592
## 95% 1045 1250 1147.5 102.5    586.cdsw_objective <- function(splits, data){
  counts <- hist(data, breaks=splits,plot=FALSE)$counts
  nbins<-length(splits)-1
  optimumN <- length(data)/(length(splits)-1)
  optimumN<-rep(optimumN,nbins)
  
  score2 <-sqrt(sum((counts - optimumN)^2))
  score1 <- sum(abs(counts - round(optimumN)))
  return(list(score1=score1,score2 = score2, counts=counts, optimumN=round(optimumN)))
}
.cdsw_hardconstrain <- function(splits, minwindow = 5, maxwindow=50){
  
  difsp<-diff(splits)
  return(sum(difsp >= minwindow) == length(difsp) & sum(difsp <= maxwindow) == length(difsp))
}.cdsw_compute_sampling_breaks <- function(xx, nbins=35, maxwindow=150, minwindow = 5, plot=TRUE){
  breaks <- nbins+1
  #xx <- x
  
  #xx<-xx[xx >=310 & xx<1250]
  # TODO(wew): there is something insconsitent with the nbins parameter
  qqs <- quantile(xx,probs = seq(0,1,by=1/(nbins)))
  
  plot(1:breaks, qqs, type="b" , 
       sub=".cdsw_compute_sampling_breaks")
  legend("topleft", legend = c(paste("maxwindow = ", maxwindow),
                               paste("nbins = ", breaks) ))
  
  
  # equidistant spaced bins
  unif <- seq(min(xx),max(xx),length=(breaks))
  lines(1:breaks,unif,col=2,type="b")
  
  if(!.cdsw_hardconstrain(unif,minwindow = 5, maxwindow)){
    warning("there is no way to generate bins given minwindow " , minwindow, "maxwindow", maxwindow, " breaks" , breaks, "\n")
  }else{
    .cdsw_hardconstrain(qqs,minwindow = 5, maxwindow)
  }
  
  mixeddata <- xx
  it_count <- 0
  error <- 0
  
  while(!.cdsw_hardconstrain(qqs,minwindow = 5, maxwindow)){
    it_count <- it_count + 1
    uniformdata<-runif(round(length(xx)/20), min=min(xx), max=max(xx))
    mixeddata<-c(mixeddata,uniformdata)
    
    qqs <- quantile(mixeddata,probs = seq(0,1,by=1/(nbins)))
    lines(1:breaks,qqs,type="l", col="#00DD00AA")
    error[it_count] <-.cdsw_objective(qqs, xx)$score1
    
  }
  
  lines(1:breaks,qqs,type="b", col="#FF1111AA")
  plot(error, xlab="number of iteration", sub=".cdsw_compute_sampling_breaks")
  
  
  qqs <- as.numeric(sort(round(qqs)))
  qqs[1]  <- qqs[1] - 0.5
  qqs[length(qqs)]  <- qqs[length(qqs)] + 0.5
  
  round(qqs, 1)
}op <- par(mfrow=c(2,2))
par(mfrow=c(3,1))
wind <- cdsw(exampledata, 
             freq=TRUE,
             plot=TRUE,
             overlap = 0, 
             n=35,
             massrange = c(350,1250),
             sub='sampling based',
             main = "peptideStd", xlab='pepmass', FUN=function(...){.cdsw_compute_sampling_breaks(...,maxwindow = 50)})## Warning in plot.histogram(r, freq = freq1, col = col, border = border, angle
## = angle, : the AREAS in the plot are wrong -- rather use 'freq = FALSE'readjustWindows <- function(wind ,ms1data, breaks=10000, maxbin = 5){
  res <- hist(ms1data, breaks=breaks)
  abline(v=wind$from,col=2,lty=2)
  abline(v=wind$to,col=3,lty=2)
  
  empty <- res$mids[which(res$counts < maxbin )]
  newfrom <- .makenewfromto(wind$from , empty)
  newto <- .makenewfromto(wind$to , empty , isfrom=FALSE )
  
  plot(res,xlim=c(1060,1065))
  abline(v = newfrom,lwd=0.5,col="red")
  abline(v = newto , lwd=0.5,col="green")
  plot(res,xlim=c(520,550))
  abline(v = newfrom,lwd=0.5,col="red")
  abline(v = newto , lwd=0.5,col="green")
  
  width <- (newto - newfrom) * 0.5
  mid <- (newfrom + newto)*0.5
  newCounts <- NULL
  for(i in 1:length(newfrom))
  {
    newCounts <- c(newCounts,sum(ms1data >= newfrom[i] & ms1data <= newto[i]))
  }
  data.frame(newfrom, newto, mid, width, counts =newCounts)
}
readjustWindows(wind,exampledata)##    newfrom  newto     mid width counts
## 1    349.5  377.1  363.30 13.80    282
## 2    377.0  401.1  389.05 12.05    346
## 3    401.0  423.1  412.05 11.05    381
## 4    423.0  441.1  432.05  9.05    382
## 5    441.0  461.1  451.05 10.05    401
## 6    461.0  479.1  470.05  9.05    424
## 7    479.0  496.1  487.55  8.55    414
## 8    496.0  513.0  504.50  8.50    423
## 9    513.0  530.0  521.50  8.50    402
## 10   530.0  547.0  538.50  8.50    431
## 11   547.0  565.0  556.00  9.00    434
## 12   565.0  582.0  573.50  8.50    397
## 13   582.0  600.0  591.00  9.00    405
## 14   600.0  619.0  609.50  9.50    413
## 15   619.0  636.0  627.50  8.50    384
## 16   636.0  655.0  645.50  9.50    422
## 17   655.0  677.0  666.00 11.00    389
## 18   677.0  696.0  686.50  9.50    398
## 19   696.0  717.0  706.50 10.50    383
## 20   717.0  738.0  727.50 10.50    371
## 21   738.0  759.0  748.50 10.50    376
## 22   759.0  784.2  771.60 12.60    359
## 23   784.0  810.0  797.00 13.00    362
## 24   810.0  837.0  823.50 13.50    338
## 25   837.0  867.0  852.00 15.00    313
## 26   867.0  898.0  882.50 15.50    285
## 27   898.0  929.1  913.55 15.55    273
## 28   929.0  960.1  944.55 15.55    261
## 29   960.0  995.1  977.55 17.55    218
## 30   995.0 1031.0 1013.00 18.00    200
## 31  1031.0 1069.0 1050.00 19.00    192
## 32  1069.0 1108.0 1088.50 19.50    174
## 33  1108.0 1152.0 1130.00 22.00    121
## 34  1152.0 1201.0 1176.50 24.50    103
## 35  1201.0 1249.5 1225.25 24.25     69cdsw(exampledata, 
     freq=TRUE,
     plot=TRUE,
     n=35,
     overlap = 0, 
     sub='quantile based',
     main = "peptideStd", xlab='pepmass', FUN=.cdsw_compute_breaks)## Warning in plot.histogram(r, freq = freq1, col = col, border = border, angle
## = angle, : the AREAS in the plot are wrong -- rather use 'freq = FALSE'##           from   to    mid width counts
## 0%         301  363  332.0  31.0    332
## 2.857143%  363  390  376.5  13.5    343
## 5.714286%  390  410  400.0  10.0    328
## 8.571429%  410  429  419.5   9.5    331
## 11.42857%  429  445  437.0   8.0    339
## 14.28571%  445  462  453.5   8.5    347
## 17.14286%  462  476  469.0   7.0    339
## 20%        476  489  482.5   6.5    315
## 22.85714%  489  503  496.0   7.0    333
## 25.71429%  503  517  510.0   7.0    353
## 28.57143%  517  531  524.0   7.0    338
## 31.42857%  531  544  537.5   6.5    316
## 34.28571%  544  557  550.5   6.5    337
## 37.14286%  557  572  564.5   7.5    341
## 40%        572  586  579.0   7.0    337
## 42.85714%  586  601  593.5   7.5    324
## 45.71429%  601  617  609.0   8.0    350
## 48.57143%  617  632  624.5   7.5    332
## 51.42857%  632  646  639.0   7.0    321
## 54.28571%  646  663  654.5   8.5    342
## 57.14286%  663  682  672.5   9.5    340
## 60%        682  698  690.0   8.0    336
## 62.85714%  698  716  707.0   9.0    323
## 65.71429%  716  736  726.0  10.0    350
## 68.57143%  736  754  745.0   9.0    328
## 71.42857%  754  776  765.0  11.0    335
## 74.28571%  776  800  788.0  12.0    336
## 77.14286%  800  825  812.5  12.5    327
## 80%        825  855  840.0  15.0    344
## 82.85714%  855  891  873.0  18.0    330
## 85.71429%  891  928  909.5  18.5    334
## 88.57143%  928  969  948.5  20.5    340
## 91.42857%  969 1029  999.0  30.0    337
## 94.28571% 1029 1097 1063.0  34.0    332
## 97.14286% 1097 1250 1173.5  76.5    336op <-par(mfrow=c(4,3))
res <- lapply(c(75,150,300, 800), function(mws){
  cdsw(exampledata, 
       freq=TRUE,
       plot=TRUE,
       overlap = 0, 
       main = paste("max window size", mws), xlab='pepmass', 
       FUN=function(...){
         .cdsw_compute_sampling_breaks(...,maxwindow = mws)
       })
})op <-par(mfrow=c(4,3))
res <- lapply(c(20,25,30, 40), function(nbins){
  cdsw(exampledata, 
       freq=TRUE,
       plot=TRUE,
       n=nbins,
       overlap = 0, 
       main = paste("nr bins", nbins), xlab='pepmass', 
       FUN=function(...){
         .cdsw_compute_sampling_breaks(...,maxwindow = 100)
       })
})Here is the output of sessionInfo() on the system on which this document was compiled:
## R version 3.4.2 (2017-09-28)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.3 LTS
## 
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.6-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.6-bioc/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] specL_1.12.0    seqinr_3.4-5    RSQLite_2.0     protViz_0.2.37 
## [5] DBI_0.7         BiocStyle_2.6.0
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.13    bookdown_0.5    digest_0.6.12   rprojroot_1.2  
##  [5] backports_1.1.1 magrittr_1.5    evaluate_0.10.1 rlang_0.1.2    
##  [9] stringi_1.1.5   blob_1.1.0      rmarkdown_1.6   tools_3.4.2    
## [13] ade4_1.7-8      bit64_0.9-7     stringr_1.2.0   bit_1.1-12     
## [17] yaml_2.1.14     compiler_3.4.2  memoise_1.1.0   htmltools_0.3.6
## [21] knitr_1.17      tibble_1.3.4