--- title: "Gravity model" dev: "png" fig_caption: true output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Gravity model} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, echo=FALSE, include=TRUE, eval=TRUE} knitr::opts_chunk$set( echo = TRUE, eval = TRUE, cache = TRUE, tidy = FALSE, warning = FALSE, error = FALSE ) library(rlist) library(data.table) library(foreach) ``` ```{r, echo=FALSE} # The good old original R code to run SIM that was used before cppSim ### this file contains the functions used to run the gravity model cost_function <- function(d, beta, type = "exp") { # d is the od distance matrix, taking the exponential # means doing this operation for each individual element # with an exponent beta # the type parameter allows to set it to either exp or pow. # pow means we use a power function as cost, rather than exponential if (type == "exp") { exp(-beta * d) } else if (type == "pow") { d^(-beta) } else { print("provide a type of functino to compute") } } r_2 <- function(d, f) { cor( d |> as.numeric(), f |> as.numeric() )^2 } r <- function(d, f) { cor( d |> as.numeric(), f |> as.numeric() ) } rmse <- function(d, f) sum((d - f)^2) calibration <- function(cost_fun, O, D, delta = 0.05) { B <- rep_len(1, nrow(cost_fun)) eps <- abs(sum(B)) e <- NULL i <- 0 while ((eps > delta) & (i < 50)) { A_new <- 1 / (apply(cost_fun, function(x) sum(B * D * x), MARGIN = 1)) B_new <- 1 / (apply(cost_fun, function(x) sum(A_new * O * x), MARGIN = 2)) eps <- abs(sum(B_new - B)) e <- append(e, eps) A <- A_new B <- B_new i <- i + 1 } list( "A" = A, "B" = B, "e" = e ) } run_model <- function( flows, distance, beta = 0.25, type = "exp" # ,cores = 3 ) { F_c <- cost_function(d = {{ distance }}, beta = {{ beta }}, type = type) print("cost function computed") O <- apply(flows, sum, MARGIN = 1) |> as.integer() D <- apply(flows, sum, MARGIN = 2) |> as.integer() A_B <- calibration( cost_fun = F_c, O = O, D = D, delta = .001 ) print("calibration: over") A <- A_B$A B <- A_B$B flows_model <- foreach( j = c(1:nrow(F_c)), .combine = rbind ) %do% { round(A[j] * B * O[j] * D * F_c[j, ]) } print("model run: over") e_sor <- e_sorensen(flows, flows_model) |> as.numeric() print(paste0("E_sor = ", e_sor)) r2 <- r_2(flows_model, flows) |> as.numeric() print(paste0("r2 = ", r2)) RMSE <- rmse(flows_model, flows) |> as.numeric() print(paste0("RMSE = ", RMSE)) list( "values" = flows_model, "r2" = r2, "rmse" = RMSE, "calib" = A_B$e, "e_sor" = e_sor ) } ### Validation e_sorensen <- function(data, fit) { 2 * sum(apply(cbind( data |> c(), fit |> c() ), MARGIN = 1, FUN = min)) / (sum(data) + sum(fit)) } ``` ```{r load_data, echo=FALSE} library(cppSim) data("distance_test") data("flows_test") distance_test <- (distance_test / 1000 / 14) |> round() ``` This file will cover the process of building a local version of the gravity model used to predict cycling and/or walking flows across London. # What model to use ? The general version of the doubly constrained gravity model looks the following way: $$ T_{ij} = A_iB_jO_iD_jf(c_{ij}) $$ where $O_i$ is the working population of the origin, and $D_j$ is the available workplaces at the destination location: $$ O_i = \sum_j T_{ij} $$ $$ D_j = \sum_i T_{ij} $$ The terms $A_i$,$B_j$ are factors for each location. The derivation of these factors is based on the relation: $$A_i = [\sum_j B_jD_jf(c_{ij})]^{-1}$$ $$B_j = [\sum_i A_iO_if(c_{ij})]^{-1}$$ with the derivation made from a recursive chain with initial values 1. 
Let's refer to the parameters above as vectors $\vec{A}$, $\vec{B}$, $\vec{O}$, $\vec{D}$, and to the cost function and flows as matrices **F** and **T**, such that $F_{ij} = f(c_{ij})$ and $T_{ij}$ is the flow from $i$ to $j$.

```{r}
# creating the O, D vectors: O_i is the i-th row sum of the flow
# matrix, D_j the j-th column sum
O <- apply(flows_test, sum, MARGIN = 1) |> c()
D <- apply(flows_test, sum, MARGIN = 2) |> c()
```

```{r}
F_c <- cost_function(distance_test, beta = 1, type = "exp")
```

Next, we run the recursive procedure until the values stabilise. We introduce the threshold $\delta$ at which we stop the recursion: it bounds the change of the balancing factors with respect to the previous iteration.

To calibrate $\beta$, we run the model for a range of values and record the goodness of fit of each run:

```{r}
beta_calib <- foreach::foreach(i = 28:33, .combine = rbind) %do% {
  beta <- 0.1 * (i - 1)
  print(paste0("RUNNING MODEL FOR beta = ", beta))
  run <- run_model(flows = flows_test
                   ,distance = distance_test
                   ,beta = beta
                   ,type = "exp"
  )
  cbind(beta, run$r2, run$rmse)
}
```

```{r}
plot(beta_calib[, 1]
     ,beta_calib[, 2]
     ,xlab = "beta value"
     ,ylab = "goodness of fit, r^2"
     ,main = "influence of beta on the goodness of fit"
     ,pch = 19
     ,cex = 0.5
     ,type = "b")
```

We keep the $\beta$ with the highest $r^2$ and plot the corresponding cost function:

```{r}
beta_best_fit <- beta_calib[which.max(beta_calib[, 2]), 1]
x <- seq_len(100) / 20
plot(x
     ,exp(-beta_best_fit * x)
     ,main = "cost function"
     ,xlab = "distance, km"
     ,ylab = "decay factor"
     ,pch = 19
     ,cex = 0.5
     ,type = "l")
```

```{r}
run_best_fit <- run_model(flows = flows_test
                          ,distance = distance_test
                          ,beta = beta_best_fit
                          ,type = "exp"
)
```

```{r}
plot(seq_along(run_best_fit$calib)
     ,run_best_fit$calib
     ,xlab = "iteration"
     ,ylab = "error"
     ,main = "calibration of balancing factors"
     ,pch = 19
     ,cex = 0.5
     ,type = "b"
)
```

```{r}
plot(flows_test
     ,run_best_fit$values
     ,ylab = "modelled flows"
     ,xlab = "observed flows"
     ,log = "xy"
     ,pch = 19
     ,cex = 0.5)
lines(seq_len(max(run_best_fit$values))
      ,seq_len(max(run_best_fit$values))
      ,col = "darkred"
      ,lwd = 2)
```

# Model using a GLM and the Poisson distribution

An alternative is to fit the gravity model as a Poisson regression with a log link, using origin and destination fixed effects and distance as predictors. The coefficient on `distance` then plays the role of $-\beta$ in the exponential cost function; a short sketch for recovering it is given after the support functions below.

```{r}
# flows_london <- rlist::list.load("flows_london.rds")
data(flows_london)

# fix the seed so the sampled OD subset is reproducible
set.seed(1)
sample_od <- sample(unique(flows_london$workplace), 100)
flows_grav <- flows_london[(workplace %in% sample_od) & (residence %in% sample_od), ]
flows_grav[, O := sum(bike), by = from_id]
flows_grav[, D := sum(bike), by = to_id]

model <- glm(bike ~ workplace + residence + distance - 1
             ,data = flows_grav
             ,family = poisson(link = "log")
)

# distribution of the residuals of the fitted GLM
(model$fitted.values - flows_grav$bike) |> hist(breaks = 100)

r2 <- r_2(flows_grav$bike, model$fitted.values)
r2
```

# Support functions

```{r}
print("cost function:")
cost_function

print("calibration function:")
calibration

print("model run:")
run_model
```
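Coming back to the Poisson GLM: with a log link, the fitted model has the form $\log T_{ij} = \alpha_i + \gamma_j + \theta d_{ij}$, so the coefficient $\theta$ on `distance` corresponds to $-\beta$ of the exponential cost function. A minimal sketch for recovering it, assuming the `model` object fitted above is still in scope:

```{r}
# the distance coefficient of the Poisson GLM is the negative of the
# decay parameter beta in exp(-beta * d)
beta_glm <- -coef(model)[["distance"]]
beta_glm
```

Note that `beta_glm` is only directly comparable with `beta_best_fit` if both distance variables use the same units; `distance_test` was rescaled in the data-loading chunk, so a comparison would require matching the scales first.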