## ----logo, echo = FALSE, out.width = '20%', fig.align = 'right', out.extra='style= "background-color: #FFFFFF; border: 10px solid transparent; padding:0px; display: inline-block; float: right;margin: 50px 0px 0px 20px;"'----
knitr::include_graphics("logo.jpg")

## ----eval = TRUE--------------------------------------------------------------
citation(package = "prospectr")

## ----NIRsoil, tidy = TRUE, message = FALSE------------------------------------
library(prospectr)
data(NIRsoil)
# NIRsoil is a data.frame with 825 obs and 5 variables: 
# Nt (Total Nitrogen), Ciso (Carbon), CEC (Cation Exchange Capacity), 
# train (vector of 0,1 indicating training (1) and validation (0) samples),
# spc (spectral matrix)
str(NIRsoil)

## ----movin, fig.cap = "Effect of a moving average with window size of 11 bands on a raw spectrum", fig.height = 4, fig.width = 6, fig.retina = 1, out.extra='style= "background-color: #FFFFFF; border: 10px solid transparent; padding:0px"'----
# add some noise
noisy <- NIRsoil$spc + rnorm(length(NIRsoil$spc), 0, 0.001) 
# Plot the first spectrum
plot(x = as.numeric(colnames(NIRsoil$spc)),
     y = noisy[1, ],
     type = "l",
     lwd = 1.5,
     xlab = "Wavelength", 
     ylab = "Absorbance") 
X <- movav(X = noisy, w = 11) # window size of 11 bands
# Note that the 5 first and last bands are lost in the process
lines(x = as.numeric(colnames(X)), y = X[1,], lwd = 1.5, col = "red")
grid()
legend("topleft", 
       legend = c("raw", "moving average"), 
       lty = c(1, 1), col = c("black", "red"))

## ----savits, tidy=TRUE--------------------------------------------------------
# p = polynomial order
# w = window size (must be odd)
# m = m-th derivative (0 = smoothing)
# The function accepts vectors, data.frames or matrices.
# For a matrix input, observations should be arranged row-wise
sgvec <- savitzkyGolay(X = NIRsoil$spc[1,], p = 3, w = 11, m = 0) 
sg <- savitzkyGolay(X = NIRsoil$spc, p = 3, w = 11, m = 0) 
# note that bands at the edges of the spectral matrix are lost !
dim(NIRsoil$spc)
dim(sg)

## ----d1,fig.cap="Effect of first derivative and second derivative", fig.height = 4, fig.width = 6, fig.retina = 1, out.extra='style= "background-color: #FFFFFF; border: 10px solid transparent; padding:0px"'----
# X = wavelength
# Y = spectral matrix
# n = order
d1 <- t(diff(t(NIRsoil$spc), differences = 1)) # first derivative
d2 <- t(diff(t(NIRsoil$spc), differences = 2)) # second derivative
plot(as.numeric(colnames(d1)), 
     d1[1,], 
     type = "l", 
     lwd = 1.5, 
     xlab = "Wavelength", 
     ylab = "")
lines(as.numeric(colnames(d2)), d2[1,], lwd = 1.5, col = "red")
grid()
legend("topleft", 
       legend = c("1st der", "2nd der"), 
       lty = c(1, 1),
       col = c("black", "red"))

## ----gapder,fig.cap="Effect of 1st-order gap derivative ", fig.height = 4, fig.width = 6, fig.retina = 1, out.extra='style= "background-color: #FFFFFF; border: 10px solid transparent; padding:0px"'----
# first derivative with a gap of 10 bands
gd1 <- t(diff(t(NIRsoil$spc), differences = 1, lag = 10)) 

## ----gapseg,fig.cap="Effect of 1st-order gap-segment derivative ", fig.height = 4, fig.width = 6, fig.retina = 1, out.extra='style= "background-color: #FFFFFF; border: 10px solid transparent; padding:0px"'----
# m = order of the derivative
# w = gap size
# s = segment size
# first derivative with a gap of 5 bands
gsd1 <- gapDer(X = NIRsoil$spc, m = 1, w = 11, s = 5) 
plot(as.numeric(colnames(d1)), 
     d1[1,], 
     type = "l", 
     lwd = 1.5, 
     xlab = "Wavelength", 
     ylab = "")
lines(as.numeric(colnames(gsd1)), gsd1[1,], lwd = 1.5, col = "red")
grid()
legend("topleft",
       legend = c("1st der","gap-segment 1st der"), 
       lty = c(1,1), 
       col = c("black", "red"))

## ----snv, eval=TRUE, fig.cap="Effect of SNV on raw spectra", fig.height = 4, fig.width = 6, fig.retina = 1, out.extra='style= "background-color: #FFFFFF; border: 10px solid transparent; padding:0px"'----
snv <- standardNormalVariate(X = NIRsoil$spc)

## ----msc, fig.cap="Effect of MSC on raw spectra", fig.height = 4, fig.width = 6, fig.retina = 1, out.extra='style= "background-color: #FFFFFF; border: 10px solid transparent; padding:0px"'----
# X = input spectral matrix
msc_spc <- msc(X = NIRsoil$spc, ref_spectrum = colMeans(NIRsoil$spc))

plot(as.numeric(colnames(NIRsoil$spc)), 
     NIRsoil$spc[1,], 
     type = "l", 
     xlab = "Wavelength, nm", ylab = "Absorbance", 
     lwd = 1.5)
lines(as.numeric(colnames(NIRsoil$spc)), 
      msc_spc[1,], 
      lwd = 1.5, col = "red")
axis(4, col = "red")
grid()
legend("topleft", 
       legend = c("raw", "MSC signal"), 
       lty = c(1, 1),
       col = c("black", "red"))
par(new = FALSE)

## ----eval = FALSE-------------------------------------------------------------
# # a reference set of spectra
# Xr <- NIRsoil$spc[NIRsoil$train == 1, ]
# 
# # an "unseen" set of spectra
# Xu <- NIRsoil$spc[NIRsoil$train == 0, ]
# 
# # apply msc to Xr
# Xr_msc <- msc(Xr)
# 
# # apply the same msc to Xu
# attr(Xr_msc, "Reference spectrum") # use this info from the previous object
# 
# Xu_msc <- msc(Xu, ref_spectrum = attr(Xr_msc, "Reference spectrum"))

## ----detrend, fig.cap="Effect of SNV-Detrend on raw spectra",fig.height = 4, fig.width = 6, fig.retina = 1, out.extra='style= "background-color: #FFFFFF; border: 10px solid transparent; padding:0px"'----
# X = input spectral matrix
# wav = band centers
dt <- detrend(X = NIRsoil$spc, wav = as.numeric(colnames(NIRsoil$spc)))
plot(as.numeric(colnames(NIRsoil$spc)), 
     NIRsoil$spc[1,], 
     type = "l", 
     xlab = "Wavelength", 
     ylab = "Absorbance", 
     lwd = 1.5)
par(new = TRUE)
plot(dt[1,], 
     xaxt = "n", 
     yaxt = "n",
     xlab = "", 
     ylab = "", 
     lwd = 1.5, 
     col = "red", 
     type = "l")
axis(4, col = "red")
grid()
legend("topleft", 
       legend = c("raw", "detrend signal"), 
       lty = c(1, 1),
       col = c("black", "red"))
par(new = FALSE)

## ----baselined, fig.cap = "Absorbance spectra, baseline spectra and baseline corrected spectra", fig.height = 4, fig.width = 6, fig.retina = 1, out.extra='style= "background-color: #FFFFFF; border: 10px solid transparent; padding:0px"'----
data(NIRsoil)

wav <- as.numeric(colnames(NIRsoil$spc))
# plot of the 3 first absorbance spectra
matplot(wav,
        t(NIRsoil$spc[1:3, ]),
        type = "l",
        ylim = c(0, .6),
        xlab = "Wavelength /nm",
        ylab = "Absorbance")
grid()
 
bs <- baseline(NIRsoil$spc, wav)
matlines(wav, t(bs[1:3, ]))
 
fitted_baselines <- attr(bs, "baselines")
matlines(wav, t(fitted_baselines[1:3, ]))

## ----bscale, tidy = TRUE------------------------------------------------------
# X = spectral matrix
# type = "soft" or "hard"
# The ouptut is a list with the scaled matrix (Xscaled) and the divisor (f)
bs <- blockScale(X = NIRsoil$spc, type = "hard")$Xscaled
sum(apply(bs, 2, var)) # this works!

## ----bnorm, tidy = TRUE-------------------------------------------------------
# X = spectral matrix
# targetnorm = desired norm for X
bn <- blockNorm(X = NIRsoil$spc,targetnorm = 1)$Xscaled
sum(bn^2) # this works!

## ----cr, fig.cap = "Absorbance and continuum-removed absorbance spectra", fig.height = 4, fig.width = 6, fig.retina = 1, out.extra='style= "background-color: #FFFFFF; border: 10px solid transparent; padding:0px"'----
# type of data: 'R' for reflectance (default), 'A' for absorbance
cr <- continuumRemoval(X = NIRsoil$spc, type = "A")
# plot of the 10 first abs spectra
matplot(as.numeric(colnames(NIRsoil$spc)),
        t(NIRsoil$spc[1:3,]),
        type = "l",
        lty = 1,
        ylim = c(0,.6),
        xlab="Wavelength /nm", 
        ylab="Absorbance")
matlines(as.numeric(colnames(NIRsoil$spc)), lty = 1, t(cr[1:3, ]))
grid()

## ----naes, fig.cap = "Selection of 5 samples by k-means sampling", fig.height = 4.5, fig.width = 4, fig.retina = 1, fig.align ='center', out.extra='style= "background-color: #FFFFFF; border: 10px solid transparent; padding:0px"'----
# X = the input matrix
# k = number of calibration samples to be selected
# pc = if pc is specified, k-mean is performed in the pc space 
# (here we will use only the two 1st pcs)
# iter.max =  maximum number of iterations allowed for the k-means clustering.
kms <- naes(X = NIRsoil$spc, k = 5, pc = 2, iter.max = 100)
# Plot the pcs scores and clusters
plot(kms$pc, col = rgb(0, 0, 0, 0.3), pch = 19, main = "k-means") 
grid()
# Add the selected points
points(kms$pc[kms$model, ], col = "red", pch = 19)

## ----ken, fig.cap="Selection of 40 calibration samples with the Kennard-Stone algorithm", fig.height = 4.5, fig.width = 4, fig.retina = 1, fig.align ='center', out.extra='style= "background-color: #FFFFFF; border: 10px solid transparent; padding:0px"'----
# Create a dataset for illustrating how the calibration sampling 
# algorithms work
X <- data.frame(x1 = rnorm(1000), x2 = rnorm(1000))
plot(X, col = rgb(0, 0, 0, 0.3), pch = 19, main = "Kennard-Stone (synthetic)") 
grid()
# kenStone produces a list with row index of the points selected for calibration
ken <- kenStone(X, k = 40) 
# plot selected points
points(X[ken$model,], col = "red", pch = 19, cex = 1.4) 

## ----ken2, fig.cap="Kennard-Stone sampling on the NIRsoil dataset", fig.height = 4.5, fig.width = 4, fig.retina = 1, fig.align = 'center', out.extra='style= "background-color: #FFFFFF; border: 10px solid transparent; padding:0px"'----

# Test with the NIRsoil dataset
# one can also use the mahalanobis distance (metric argument)
# computed in the pc space (pc argument)
ken_mahal <- kenStone(X = NIRsoil$spc, k = 20, metric = "mahal", pc = 2)
# The pc components in the output list stores the pc scores
plot(ken_mahal$pc[,1], 
     ken_mahal$pc[,2], 
     col = rgb(0, 0, 0, 0.3), 
     pch = 19, 
     xlab = "PC1",
     ylab = "PC2",
     main = "Kennard-Stone") 
grid()
# This is the selected points in the pc space
points(ken_mahal$pc[ken_mahal$model, 1], 
       ken_mahal$pc[ken_mahal$model,2], 
       pch = 19, col = "red") 

## ----ksinitalization, tidy = TRUE---------------------------------------------
# Indices of the initialization samples
initialization_ind <- c(486, 702, 722, 728) 
ken_mahal_init <- kenStone(
  X = NIRsoil$spc, 
  k = 20, 
  metric = "mahal", 
  pc = 2, 
  init = initialization_ind)

ken_mahal_init$model

## ----ken3, fig.cap="Kennard-Stone sampling with initialization samples", fig.height = 4.5, fig.width = 4, fig.retina = 1, fig.align = 'center', out.extra='style= "background-color: #FFFFFF; border: 10px solid transparent; padding:0px"'----

# The pc components in the output list stores the pc scores
plot(ken_mahal_init$pc[,1], 
     ken_mahal_init$pc[,2], 
     col = rgb(0, 0, 0, 0.3), 
     pch = 19, 
     xlab = "PC1",
     ylab = "PC2",
     main = "Kennard-Stone with 4 initialization samples") 
grid()
# This is the selected points in the pc space
points(ken_mahal$pc[ken_mahal$model, 1], 
       ken_mahal$pc[ken_mahal$model, 2], 
       pch = 19, col = "red") 

# Our initialization samples
points(ken_mahal$pc[initialization_ind, 1], 
       ken_mahal$pc[initialization_ind, 2], 
       pch = 19, cex = 1.5, col = "blue") 


## ----duplex, fig.cap="Selection of 15 calibration and validation samples with the DUPLEX algorithm", fig.height = 4.5, fig.width = 4, fig.retina = 1, fig.align = 'center', out.extra='style= "background-color: #FFFFFF; border: 10px solid transparent; padding:0px"'----
dup <- duplex(X = X, k = 15) # k is the number of selected samples
plot(X, col = rgb(0, 0, 0, 0.3), pch = 19, main = "DUPLEX") 
grid()
# calibration samples
points(X[dup$model, 1], X[dup$model, 2], col = "red", pch = 19) 
# validation samples
points(X[dup$test,1], X[dup$test,2], col = "dodgerblue", pch = 19) 
legend("topright", 
       legend = c("calibration", "validation"), 
       pch = 19, 
       col = c("red", "dodgerblue"))

## ----shenk, fig.cap="Selection of samples with the SELECT algorithm", fig.height = 4.5, fig.width = 4, fig.retina = 1, fig.align = 'center', out.extra='style= "background-color: #FFFFFF; border: 10px solid transparent; padding:0px"'----
shenk <- shenkWest(X = NIRsoil$spc, d.min = 0.6, pc = 2)
plot(shenk$pc, col = rgb(0, 0, 0, 0.3), pch = 19, main = "SELECT") 
grid()
points(shenk$pc[shenk$model,], col = "red", pch = 19)

## ----puchwein, fig.cap="Samples selected by the Puchwein algorithm", fig.height = 4.5, fig.width = 4, fig.retina = 1, fig.align = 'center', out.extra='style= "background-color: #FFFFFF; border: 10px solid transparent; padding:0px"'----
pu <- puchwein(X = NIRsoil$spc, k = 0.2, pc =2)
plot(pu$pc, col = rgb(0, 0, 0, 0.3), pch = 19, main = "puchwein") 
grid()
points(pu$pc[pu$model,],col = "red", pch = 19) # selected samples

## ----puchwein2, fig.cap="How to find the optimal loop", eval = FALSE----------
# par(mfrow = c(2, 1))
# plot(pu$leverage$removed,pu$leverage$diff,
#      type = "l",
#      xlab = "# samples removed",
#      ylab="Difference between th. and obs sum of leverages")
# # This basically shows that the first loop is optimal
# plot(pu$leverage$loop,nrow(NIRsoil) - pu$leverage$removed,
#      xlab = "# loops",
#      ylab = "# samples kept", type = "l")
# par(mfrow = c(1, 1))

## ----honigs, fig.cap="Spectra selected with the Honigs algorithm and bands used", fig.height = 4.5, fig.width = 4, fig.retina = 1, fig.align = 'center', out.extra='style= "background-color: #FFFFFF; border: 10px solid transparent; padding:0px"'----
# type = "A" is for absorbance data
ho <- honigs(X = NIRsoil$spc, k = 10, type = "A") 
# plot calibration spectra
matplot(as.numeric(colnames(NIRsoil$spc)),
        t(NIRsoil$spc[ho$model,]),
        type = "l",
        xlab = "Wavelength", ylab = "Absorbance")
# add bands used during the selection process
abline(v = as.numeric(colnames(NIRsoil$spc))[ho$bands], lty = 2)