## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  out.width = "100%"
)

## ----setup--------------------------------------------------------------------
### Load library and example data ----
library(realTimeloads)
InputData <- realTimeloads::ExampleData
?realTimeloads::ExampleData

## ----Process and regress data-------------------------------------------------
### Assign list items to variables used in functions ------------------------ 
Site <- InputData$Site
ADCP <- InputData$ADCP
Echo_Intensity_Beam_1 <- InputData$Echo_Intensity
Echo_Intensity_Beam_2 <- InputData$Echo_Intensity # example code assumes backscatter is equal across beams; however, backscatter between beams will differ in practice 
Sonde <- InputData$Sonde
Height <- InputData$Height
Discharge <- InputData$Discharge
Sediment_Samples <- InputData$Sediment_Samples

### Process acoustic backscatter data  --------------------------------------
ADCPOutput <- realTimeloads::acoustic_backscatter_processing(Site,ADCP,Height,
                                                             Sonde,Echo_Intensity_Beam_1,Echo_Intensity_Beam_2)

### Compute estimate of analyte timeseries using https://doi.org/10.1029/2007WR006088 --------

# threshold (minutes) for interpolation of Surrogate to Analyte timeseries
threshold <- 30

# Surrogate "X" (e.g, acoustic backscatter and/or turbidity)
tx <- ADCPOutput$time
x <-ADCPOutput$mean_sediment_corrected_backscatter_beam_1_and_beam_2_dB

# Analyte "y" (e.g., TSS, NOx, etc)
ty <- Sediment_Samples$time
y <- Sediment_Samples$SSCpt_mg_per_liter

# interpolate surrogate onto analyte timeseries
# Randomly sample timeseries n times to simulate sampling that would occur in the field 
n <- 50
ind_calibration <-sample(1:length(ty),n,replace=FALSE)
calibration <- realTimeloads::surrogate_to_analyte_interpolation(tx,x,
                                                            ty[ind_calibration],y[ind_calibration],30)

# Specify calibration variable names for later storage in regression data
names(calibration)[grepl('surrogate',names(calibration))] <- 'SNR_dB'
names(calibration)[grepl('analyte',names(calibration))] <- 'SSCpt_mg_per_liter'

# compute bootstrap regression parameters per https://doi.org/10.1029/2007WR006088
fun_eq <- "log10(SSCpt_mg_per_liter) ~ SNR_dB"

Regression <- realTimeloads::bootstrap_regression(calibration,fun_eq)

# predict concentration timeseries with uncertainty
Surrogate <- data.frame(ADCPOutput$time,
                        ADCPOutput$mean_sediment_corrected_backscatter_beam_1_and_beam_2_dB)
colnames(Surrogate) <- c('time','SNR_dB')

Estimated_Concentration <- realTimeloads::estimate_timeseries(Surrogate,Regression)

## ----Compute load-------------------------------------------------------------
### Interpolate discharge onto surrogate time series (e.g., Backscatter t.s) ----
# All data in ExampleData are on the same time-step, however in practice this may not be the case
Q <- realTimeloads::linear_interpolation_with_time_limit(Discharge$time,
                                                         Discharge$Discharge_m_cubed_per_s,tx,threshold = 1)$x_interpolated

# compute dt (second) for load integration
dt =  c()
dt[2:length(tx)] <- as.numeric(difftime(tx[2:length(tx)],tx[1:length(tx)-1],units = "secs"))
dt[1] = median(dt,na.rm=TRUE) # assume time step 1 using median dt

# compute load (L) (kt) at each time "t"
# Estimated_Concentration$estimated_timeseries is a matrix with simulated timeseries drawn from Monte Carlo simulations on the bootstrapped regression parameters. Rows are simulations, columns are time. Methods are from https://doi.org/10.1029/2007WR006088. 
iters <- nrow(Regression$regression_parameters) # number of Monte Carlo simulations
Qsi =  matrix(NA, nrow = iters, ncol = length(x))
for (i in 1:iters) {
  Qsi[i,] <- Estimated_Concentration$estimated_timeseries[i,]*Q*dt*1e-9 # assumes concentration is mg/l and discharge is cubic meters per sec, dt is in seconds
}

## ----Compute uncertainty------------------------------------------------------
### Compute uncertainty on concentration and load timeseries ---------------
cQs = rowSums(Qsi,na.rm=TRUE) # total load for each iteration
quants <- c(0.0527, 0.1587, 0.5, 0.8414, 0.9473) # +/- 1 and 2 sigma and median (i.e., reported) estimate

Total_load_kt <- data.frame(t(quantile(cQs,quants,na.rm=TRUE)))
colnames(Total_load_kt) = c('minus_two_sigma_confidence','minus_one_sigma_confidence','median_confidence','plus_one_sigma_confidence','plus_two_sigma_confidence')

Reported_real_time_load_estimate <- Total_load_kt$median_confidence

# explicitly name timeseries with uncertainty for plotting below 
Analyte_concentration_timeseries_mg_per_liter <- Estimated_Concentration$estimated_timeseries_quantiles

Analyte_flux_timeseries_kt <- data.frame(t(apply(Qsi, 2 , quantile , probs = quants , 
                                                 na.rm = TRUE )))

colnames(Analyte_flux_timeseries_kt) = c('minus_two_sigma_confidence','minus_one_sigma_confidence','median_confidence',
                                         'plus_one_sigma_confidence','plus_two_sigma_confidence')

## ----Plot timeseries,fig.dim = c(8,6),dpi=300---------------------------------
### Compare estimate of concentration to actual ----

# actual loads (kt)
SSCxs <- Sediment_Samples$SSCxs_mg_per_liter
SSCpt <- Sediment_Samples$SSCpt_mg_per_liter
Qspt <- SSCpt*Q*dt*1e-9 # load from SSC measured at a point
Qsxs <- SSCxs*Q*dt*1e-9 # load from depth-averaged, velocity-weighted SSC

indp <- seq(from = 1, to = nrow(ADCP), by = 1) # vector for plotting subset of timeseries if desired 

plot(tx[indp],Analyte_concentration_timeseries_mg_per_liter$median_confidence[indp],
     col='red',type = "l",lwd= 2,xlab = "time (AEST)",ylab="Analyte concentration (mg/l)",
     main = "Estimated versus actual concentration",ylim = c(0,1500))
lines(tx[indp],Analyte_concentration_timeseries_mg_per_liter$minus_two_sigma_confidence[indp],
      col='grey',lty = c(5))
lines(tx[indp],Analyte_concentration_timeseries_mg_per_liter$plus_two_sigma_confidence[indp],
      col='grey',lty = c(5))
points(ty[ind_calibration],y[ind_calibration],
       col = 'black',pch = c(21))
lines(tx,Sediment_Samples$SSCpt_mg_per_liter,
      col='black',lwd= 1.5)

legend("topright",legend = c("Estimated concentration","Estimation uncertainty","Actual concentration used in regression","Actual concentration timeseries"),
       lty = c(1, 5, 0,1),col = c('red', 'grey', 'black', 'black'),pch = c(-1,-1,21,-1))

###  Compare estimate of flux to actual ----
plot(tx[indp],Analyte_flux_timeseries_kt$median_confidence[indp]/dt[indp]*1e3,
     col='red',type = "l",lwd= 2,xlab = "time (AEST)",ylab="Analyte flux (ton per second)",
     main = "Estimated versus actual flux",ylim = c(0,20))
lines(tx[indp],Analyte_flux_timeseries_kt$minus_two_sigma_confidence[indp]/dt[indp]*1e3,
      col='grey')
lines(tx[indp],Analyte_flux_timeseries_kt$plus_two_sigma_confidence[indp]/dt[indp]*1e3,
      col='grey')
points(tx[ind_calibration],Qspt[ind_calibration]/dt[ind_calibration]*1e3,col = 'black',
       pch = c(21))
lines(tx,Qspt/dt*1e3,col = 'black',lwd= 1.5)
legend("topright",legend = c("Estimated flux","Estimation uncertainty","Actual flux of data used in regression","Actual flux timeseries"),
       lty = c(1, 5, 0,1),col = c('red', 'grey', 'black', 'black'),pch = c(-1,-1,21,-1))

# check estimated load / modeled load
#mean(cQs)/sum(Qspt,na.rm=TRUE) # relative to Cpt (biased) load
#mean(cQs)/sum(Qsxs,na.rm=TRUE) # relative to Cxs (actual) load

Output <- list("time" = tx,"surrogate_timeseries_used_for_prediction_of_analyte"= df,"regression_data" = calibration,"regression_parameters_estimated_from_bootstrap_resampling" = Regression,"Analyte_concentration_timeseries_mg_per_liter"= Analyte_concentration_timeseries_mg_per_liter,"Dicharge" = Discharge,"Analyte_flux_timeseries_kt" =Analyte_flux_timeseries_kt)

## ----Compare total loads,fig.dim = c(8,8),dpi=300-----------------------------
### Compare total loads in barplot ----
# check ratios of estimated load to modeled load
#Total_load_kt$median_confidence/sum(Qspt,na.rm=TRUE) # estimated load relative to load at Cpt
#Total_load_kt$median_confidence/sum(Qsxs,na.rm=TRUE) # estimate total load relative to actual load from Cxs

loads <- c('Estimated load from SSCpt(dB)'= Total_load_kt$median_confidence,'Load from SSCpt' = sum(Qspt,na.rm=TRUE),'Load from SSCxs' = sum(Qsxs,na.rm=TRUE))

y <- c(Total_load_kt$median_confidence)
y0 <- c(Total_load_kt$plus_two_sigma_confidence)
y1 <- c(Total_load_kt$minus_two_sigma_confidence)
  
hb <- barplot(loads,main="Total load comparison",
        ylab="Total load (kt)",border=NA,ylim = c(0,8000))
arrows(x0=hb[1],y0=y0,y1=y1,angle=90,code=3,length=0.1)


## ----simulate missing data and impute data,fig.dim = c(8,8),dpi=300-----------

### Impute data ----
xo <- ADCPOutput$mean_sediment_corrected_backscatter_beam_1_and_beam_2_dB
# simulate 50% missing data
idata <- sample(which(is.finite(xo)),round(sum(is.finite(xo))*0.50),replace=FALSE)
x <- rep(NA,length(xo))
x[idata] <- xo[idata]
flow_ratio <- imputeTS::na_interpolation(Q/x)
Xreg <- cbind(Q,flow_ratio)
# random sampling of training set in impute_data() can generate warnings in decision tree algorithm 
# In impute_data(), ptrain = 1 can be set to preclude uncertainty estimation and decrease code run time. 
# ptrain=0.8 indicates that 80% of the finite data in x will be used in training regression trees and 20% of the data will be held-out for computing validation accuracy
suppressWarnings(Imputation <- impute_data(tx,x,Xreg,MC=10,ptrain=0.8))

ximputed <- Imputation$Imputed_data$x
ximputed[is.finite(x)] <- NA

plot(xo,ximputed,xlab = 'Backscatter (dB)', ylab = 'Imputed backscatter (dB)',ylim = c(min(xo,na.rm = T),max(xo,na.rm = T)),xlim = c(min(xo,na.rm = T),max(xo,na.rm = T)))
points(xo,Imputation$Imputed_data$x_at_minus_two_sigma_confidence,col='red')
points(xo,Imputation$Imputed_data$x_at_plus_two_sigma_confidence,col='blue')
lines(c(min(xo,na.rm=TRUE),max(xo,na.rm=TRUE)))
legend("topleft",legend = c("Median imputed value","Plus 2-sigma uncertainty on imputed value","Minus 2-sigma uncertainty on imputed value"), lty = c(0, 0,0),col = c('black', 'red', 'blue'),pch = c(21,21,21))