--- title: "Gravity model" dev: "png" fig_caption: true output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Gravity model} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, echo=FALSE, include=TRUE, eval=TRUE} knitr::opts_chunk$set( echo = TRUE, eval = TRUE, cache = TRUE, tidy = FALSE, warning = FALSE, error = FALSE ) library(rlist) library(data.table) library(foreach) ``` ```{r, echo=FALSE} # The good old original R code to run SIM that was used before cppSim ### this file contains the functions used to run the gravity model cost_function <- function(d, beta, type = "exp") { # d is the od distance matrix, taking the exponential # means doing this operation for each individual element # with an exponent beta # the type parameter allows to set it to either exp or pow. # pow means we use a power function as cost, rather than exponential if (type == "exp") { exp(-beta * d) } else if (type == "pow") { d^(-beta) } else { print("provide a type of functino to compute") } } r_2 <- function(d, f) { cor( d |> as.numeric(), f |> as.numeric() )^2 } r <- function(d, f) { cor( d |> as.numeric(), f |> as.numeric() ) } rmse <- function(d, f) sum((d - f)^2) calibration <- function(cost_fun, O, D, delta = 0.05) { B <- rep_len(1, nrow(cost_fun)) eps <- abs(sum(B)) e <- NULL i <- 0 while ((eps > delta) & (i < 50)) { A_new <- 1 / (apply(cost_fun, function(x) sum(B * D * x), MARGIN = 1)) B_new <- 1 / (apply(cost_fun, function(x) sum(A_new * O * x), MARGIN = 2)) eps <- abs(sum(B_new - B)) e <- append(e, eps) A <- A_new B <- B_new i <- i + 1 } list( "A" = A, "B" = B, "e" = e ) } run_model <- function( flows, distance, beta = 0.25, type = "exp" # ,cores = 3 ) { F_c <- cost_function(d = {{ distance }}, beta = {{ beta }}, type = type) print("cost function computed") O <- apply(flows, sum, MARGIN = 1) |> as.integer() D <- apply(flows, sum, MARGIN = 2) |> as.integer() A_B <- calibration( cost_fun = F_c, O = O, D = D, delta = .001 ) print("calibration: over") A <- A_B$A B <- A_B$B flows_model <- foreach( j = c(1:nrow(F_c)), .combine = rbind ) %do% { round(A[j] * B * O[j] * D * F_c[j, ]) } print("model run: over") e_sor <- e_sorensen(flows, flows_model) |> as.numeric() print(paste0("E_sor = ", e_sor)) r2 <- r_2(flows_model, flows) |> as.numeric() print(paste0("r2 = ", r2)) RMSE <- rmse(flows_model, flows) |> as.numeric() print(paste0("RMSE = ", RMSE)) list( "values" = flows_model, "r2" = r2, "rmse" = RMSE, "calib" = A_B$e, "e_sor" = e_sor ) } ### Validation e_sorensen <- function(data, fit) { 2 * sum(apply(cbind( data |> c(), fit |> c() ), MARGIN = 1, FUN = min)) / (sum(data) + sum(fit)) } ``` ```{r load_data, echo=FALSE} library(cppSim) data("distance_test") data("flows_test") distance_test <- (distance_test / 1000 / 14) |> round() ``` This file will cover the process of building a local version of the gravity model used to predict cycling and/or walking flows across London. # What model to use ? The general version of the doubly constrained gravity model looks the following way: $$ T_{ij} = A_iB_jO_iD_jf(c_{ij}) $$ where $O_i$ is the working population of the origin, and $D_j$ is the available workplaces at the destination location: $$ O_i = \sum_j T_{ij} $$ $$ D_j = \sum_i T_{ij} $$ The terms $A_i$,$B_j$ are factors for each location. The derivation of these factors is based on the relation: $$A_i = [\sum_j B_jD_jf(c_{ij})]^{-1}$$ $$B_j = [\sum_i A_iO_if(c_{ij})]^{-1}$$ with the derivation made from a recursive chain with initial values 1. 
Let's refer to the parameters above as vectors $\vec{A}$, $\vec{B}$, $\vec{O}$, $\vec{D}$, and to the cost function and flows as matrices **F** and **T**, such that $F_{ij} = f(c_{ij})$ and $T_{ij}$ is the flow from $i$ to $j$.

```{r}
# creating the O, D vectors: O_i is the i-th row sum of the flow
# matrix, D_j the j-th column sum
O <- apply(flows_test, sum, MARGIN = 1) |> c()
D <- apply(flows_test, sum, MARGIN = 2) |> c()
```

```{r}
F_c <- cost_function(distance_test, beta = 1, type = "exp")
```

Next, we run the recursive procedure until the values stabilise. We introduce the threshold $\delta$ at which we stop the recursion: it bounds the change of the balancing factors with respect to the previous iteration.

To calibrate $\beta$, we run the model for a range of values and record the goodness of fit of each run:

```{r}
beta_calib <- foreach::foreach(i = 28:33, .combine = rbind) %do% {
  beta <- 0.1 * (i - 1)
  print(paste0("RUNNING MODEL FOR beta = ", beta))
  run <- run_model(flows = flows_test
                   ,distance = distance_test
                   ,beta = beta
                   ,type = "exp"
  )
  cbind(beta, run$r2, run$rmse)
}
```

```{r}
plot(beta_calib[, 1]
     ,beta_calib[, 2]
     ,xlab = "beta value"
     ,ylab = "goodness of fit, r^2"
     ,main = "influence of beta on the goodness of fit"
     ,pch = 19
     ,cex = 0.5
     ,type = "b")
```

We keep the $\beta$ with the highest $r^2$ and plot the corresponding cost function:

```{r}
beta_best_fit <- beta_calib[which.max(beta_calib[, 2]), 1]
x <- seq_len(100) / 20
plot(x
     ,exp(-beta_best_fit * x)
     ,main = "cost function"
     ,xlab = "distance, km"
     ,ylab = "decay factor"
     ,pch = 19
     ,cex = 0.5
     ,type = "l")
```

```{r}
run_best_fit <- run_model(flows = flows_test
                          ,distance = distance_test
                          ,beta = beta_best_fit
                          ,type = "exp"
)
```

```{r}
plot(seq_along(run_best_fit$calib)
     ,run_best_fit$calib
     ,xlab = "iteration"
     ,ylab = "error"
     ,main = "calibration of balancing factors"
     ,pch = 19
     ,cex = 0.5
     ,type = "b"
)
```

```{r}
plot(flows_test
     ,run_best_fit$values
     ,ylab = "modelled flows"
     ,xlab = "observed flows"
     ,log = "xy"
     ,pch = 19
     ,cex = 0.5)
lines(seq_len(max(run_best_fit$values))
      ,seq_len(max(run_best_fit$values))
      ,col = "darkred"
      ,lwd = 2)
```

# Model using a GLM and the Poisson distribution

An alternative is to fit the gravity model as a Poisson regression with a log link, using origin and destination fixed effects and distance as predictors. The coefficient on `distance` then plays the role of $-\beta$ in the exponential cost function; a short sketch for recovering it is given after the support functions below.

```{r}
# flows_london <- rlist::list.load("flows_london.rds")
data(flows_london)

# fix the seed so the sampled OD subset is reproducible
set.seed(1)
sample_od <- sample(unique(flows_london$workplace), 100)
flows_grav <- flows_london[(workplace %in% sample_od) & (residence %in% sample_od), ]
flows_grav[, O := sum(bike), by = from_id]
flows_grav[, D := sum(bike), by = to_id]

model <- glm(bike ~ workplace + residence + distance - 1
             ,data = flows_grav
             ,family = poisson(link = "log")
)

# distribution of the residuals of the fitted GLM
(model$fitted.values - flows_grav$bike) |> hist(breaks = 100)

r2 <- r_2(flows_grav$bike, model$fitted.values)
r2
```

# Support functions

```{r}
print("cost function:")
cost_function

print("calibration function:")
calibration

print("model run:")
run_model
```
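Coming back to the Poisson GLM: with a log link, the fitted model has the form $\log T_{ij} = \alpha_i + \gamma_j + \theta d_{ij}$, so the coefficient $\theta$ on `distance` corresponds to $-\beta$ of the exponential cost function. A minimal sketch for recovering it, assuming the `model` object fitted above is still in scope:

```{r}
# the distance coefficient of the Poisson GLM is the negative of the
# decay parameter beta in exp(-beta * d)
beta_glm <- -coef(model)[["distance"]]
beta_glm
```

Note that `beta_glm` is only directly comparable with `beta_best_fit` if both distance variables use the same units; `distance_test` was rescaled in the data-loading chunk, so a comparison would require matching the scales first.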