\documentclass{article}
\usepackage{amsmath}
\usepackage{natbib}
\usepackage{indentfirst}
\DeclareMathOperator{\logit}{logit}
% \VignetteIndexEntry{smallarea}
\begin{document}
\title{An overview of the Fay-Herriot model with our package smallarea}
\author{Abhishek Nandy}
\maketitle
\section{The Fay-Herriot Model}
\subsection{Model Notation}
The Fay-Herriot model can be written as follows:
\begin{equation}
y_{i}=x_{i}^T\beta +v_{i} +e_{i} \ \ \ \ i=1,\ldots,m
\end{equation}
where $x_{i}$ is a vector of known covariates, $\beta$ is a vector of unknown regression coefficients, the $v_{i}$'s are area specific random effects and the $e_{i}$'s represent sampling errors. It is assumed that $v_{i} \sim N(0,\psi)$ and $e_{j} \sim N(0,D_{j})$ are independent for all pairs $(i,j)$. We further assume that $\psi$ is unknown but the $D_{i}$'s are known. Each index $i$ corresponds to a small area. The ultimate goal of this package is to fit the Fay-Herriot model to given data and return estimates of several parameters, most notably the small area means and the mean squared error of the small area means. Below we give the definitions of the quantities of interest.
\subsection{Vector Notation of the Model}
\begin{equation}
y=X\beta + v +\epsilon
\end{equation}
where $v \sim N_{m}(0,\psi I)$ and $\epsilon \sim N_{m}(0,D)$, where $N_{m}$ stands for the $m$-dimensional multivariate normal distribution, $I$ is the $m \times m$ identity matrix and $D$ is an $m \times m$ diagonal matrix with elements $D_{1},\ldots,D_{m}$. We will denote the variance covariance matrix of $y$ by $V$, a diagonal matrix with $i$-th diagonal element $\psi + D_{i}$.
\subsection{Variance Component}
There is a more general definition of a variance component, but for our purposes $\psi$, the variance of the random effects, is the variance component.
\subsection{Estimates of the Variance Component}
There are four popular ways of estimating the variance component:
\begin{itemize}
\item The Prasad-Rao estimator, denoted by $\widehat{\psi}_{PR}$, has the form $\frac{y^T(I-P_{X})y - tr((I-P_{X})D)}{m-p}$, where $P_{X}=X(X^TX)^{-1}X^T$ is the projection matrix onto the column space of $X$.
\item The Fay-Herriot estimator, denoted by $\widehat{\psi}_{FH}$, is obtained by solving the equation
\begin{equation}
\frac{y^TQ(\psi)y}{m-p}=1
\end{equation}
iteratively, where
\begin{equation*}
y^TQ(\psi)y=\sum_{i=1}^m\frac{(y_{i}-x_{i}^T\beta)^2}{\psi +D_{i}}
\end{equation*}
We use the \texttt{uniroot} function in the \texttt{stats} package of R to solve the equation.
\item The maximum likelihood estimator, denoted by $\widehat{\psi}_{ML}$, is obtained by maximizing the likelihood function. Note that our model assumptions already incorporate normality, so the likelihood is a normal likelihood. Hence it should be ensured, while fitting the model, that the normality assumptions are met.
\item The REML estimator (Residual Maximum Likelihood, also known as Restricted Maximum Likelihood) addresses the main drawback of the likelihood approach by adjusting for the degrees of freedom involved in estimating the fixed effects. In broad generality, by multiplying both sides of equation (2) by vectors $k$ satisfying $k^TX=0$ (i.e., $k$ orthogonal to the column space of $X$), we get
\begin{equation*}
k^Ty=k^Tv + k^T\epsilon
\end{equation*}
This makes us vulnerable to the possibility that the estimator might depend on the choice of the vector $k$. However, that problem is resolved by finding the REML estimators as a solution of the REML equations. For details see Searle, Casella and McCulloch (Wiley, 2006), Section 6.6. We then use the scoring method to solve the REML equations. Without going into the general theory of REML, we write down the equation for the implementation of the scoring algorithm here. The iterates in Fisher scoring are given by the following formula:
\begin{equation}
\psi^{(n+1)}=\psi^{(n)} + [I_{R}(\psi^{(n)})]^{-1}s_{R}(\psi^{(n)})
\end{equation}
where
\begin{eqnarray*}
I_{R}(\psi)&=&\frac{1}{2}tr(PBPB)\\
s_{R}(\psi)&=&-\frac{1}{2}tr(PB) + \frac{1}{2}y^TPBPy\\
P&=&V^{-1}-V^{-1}X(X^TV^{-1}X)^{-1}X^TV^{-1}
\end{eqnarray*}
and $V$ is the variance covariance matrix of $y$, the diagonal matrix with $i$-th diagonal element $\psi + D_{i}$. There is a more general definition of $B$ (see the references), but for our purposes $B$ is the identity matrix.
\end{itemize}
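To make the Prasad-Rao formula concrete, the following minimal sketch (not part of the package) computes $\widehat{\psi}_{PR}$ directly from the closed form above. The toy vectors \texttt{y}, \texttt{X} and \texttt{D} are hypothetical, chosen purely for illustration; they match the toy data used in the examples of Section 2.
<<>>=
# Minimal sketch: Prasad-Rao estimate computed from its closed form.
# The data below are hypothetical, for illustration only.
y=c(1,2,3,4,5)                                    # direct estimates
X=cbind(1,c(1,2,4,4,1),c(2,1,3,1,5))              # m x p design matrix
D=c(0.5,0.7,0.8,0.4,0.5)                          # known sampling variances
m=nrow(X)
p=ncol(X)
PX=X %*% solve(crossprod(X)) %*% t(X)             # projection onto col space of X
IP=diag(m)-PX
psi.pr=(drop(t(y) %*% IP %*% y) - sum(diag(IP)*D))/(m-p)  # tr((I-P_X)D) = sum(diag(IP)*D)
psi.pr
@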
\subsection{Estimates of the Regression Coefficients}
The estimate of the regression coefficients is given by the formula $(X^TV^{-1}X)^{-1}X^TV^{-1}y$ and is denoted by $\tilde{\beta}$. Note, however, that $V$ is a function of the unknown $\psi$ and the known $D_{i}$'s, so $\psi$ has to be estimated, as discussed in Section 1.4. We will often denote $\tilde{\beta}$ by $\tilde{\beta}(\psi)$ when the $V$ in the above formula has not been estimated from data, whereas if the $V$ in the above formula has been estimated from data (denote it by $\widehat{V}$), it will be denoted by $\tilde{\beta}(\widehat{\psi})$ and sometimes by $\widehat{\beta}$. In the Fay-Herriot model the expression for $\tilde{\beta}$ is $(\sum\limits_{i=1}^m\frac{x_{i}x_{i}^T}{\psi + D_{i}})^{-1}(\sum\limits_{i=1}^m\frac{x_{i}y_{i}}{\psi + D_{i}})$, and $\widehat{\beta}$ is the same expression with $\psi$ replaced by $\widehat{\psi}$.
\subsection{Small Area Mean}
The small area mean is, for our purposes, the quantity $\eta=X \beta + v$.\\
The small area mean for the $i$-th small area is thus $\eta_{i}=x_{i}^T \beta + v_{i}$.
\subsection{The Estimated Small Area Mean: The BLUP and EBLUP}
If $\beta$ and $\psi$ are both known, the best predictor of $\eta$ is given by
\begin{eqnarray*}
E(\eta|y)&=&X\beta + E(v|y)\\
&=&X\beta +\begin{pmatrix} \frac{\psi}{\psi + D_{1}}(y_{1} - x_{1}^T\beta)\\ \vdots\\ \frac{\psi}{\psi + D_{m}}(y_{m} - x_{m}^T\beta) \end{pmatrix}
\end{eqnarray*}
However, if only $\beta$ is unknown, it is customary to replace $\beta$ in the above expression by $\tilde{\beta}$, and the result is called the BLUP. Further, if both $\beta$ and $\psi$ are unknown, we replace $\psi$ by $\widehat{\psi}$ and $\beta$ by $\widehat{\beta}$, and the result is called the EBLUP.
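The following small sketch illustrates these formulas by computing $\tilde{\beta}(\psi)$ and the resulting BLUP for a hypothetical, known value of $\psi$. It is a hand transcription of the expressions above, not the package's code; the data are the same hypothetical toy data as before.
<<>>=
# Sketch: GLS coefficients and BLUP for a known psi (hypothetical data).
y=c(1,2,3,4,5)
X=cbind(1,c(1,2,4,4,1),c(2,1,3,1,5))
D=c(0.5,0.7,0.8,0.4,0.5)
psi=1                                   # treat the variance component as known
w=1/(psi+D)                             # diagonal of V^{-1}
beta.tilde=solve(t(X) %*% (w*X), t(X) %*% (w*y))  # (X^T V^-1 X)^-1 X^T V^-1 y
beta.tilde                              # tilde(beta)(psi)
fixed=drop(X %*% beta.tilde)
blup=fixed + (psi/(psi+D))*(y-fixed)    # shrinkage form of the BLUP
blup
@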
\subsection{Estimate of the Mean Squared Prediction Error (MSPE) of the EBLUP}
Another feature of our package is that it gives a second order unbiased estimator of the mean squared prediction error (MSPE) of the EBLUP of the small area means. The MSPE of the EBLUP of the $i$-th small area is given by
\begin{eqnarray*}
MSPE(\widehat{\eta}_{i})&=&E(\tilde{\eta}_{B,i}-\eta_{i})^2 + E(\tilde{\eta}_{i}-\tilde{\eta}_{B,i})^2 + E(\widehat{\eta}_{i} - \tilde{\eta}_{i})^2\\
&&\text{where} \ \ \ \tilde{\eta}_{B,i}=\frac{\psi}{\psi + D_{i}}y_{i} +\frac{D_{i}}{\psi + D_{i}}x_{i}^T\beta \\
&=&g_{1i}(\psi)+g_{2i}(\psi)+E(\widehat{\eta}_{i} - \tilde{\eta}_{i})^2
\end{eqnarray*}
Analytic expressions can be obtained for the first two terms:
\begin{eqnarray*}
g_{1i}(\psi)&=&\frac{\psi D_{i}}{\psi + D_{i}}\\
g_{2i}(\psi)&=&(\frac{D_{i}}{\psi + D_{i}})^2 x_{i}^T[\sum\limits_{j=1}^m(\frac{x_{j}x_{j}^T}{\psi + D_{j}})]^{-1}x_{i}
\end{eqnarray*}
Note that $g_{1i}(\psi) = O(1)$ and $g_{2i}(\psi)= O(m^{-1})$. The third term can be written as
\begin{equation*}
E(\widehat{\eta}_{i} - \tilde{\eta}_{i})^2 = g_{3i}(\psi) + o(m^{-1})
\end{equation*}
The expression for $g_{3i}$ varies from case to case. We list the different expressions below:
\begin{eqnarray*}
g_{3i}(\psi)&=&\frac{2D_{i}^2}{(\psi + D_{i})^3 m^2}\sum\limits_{j=1}^m (\psi + D_{j})^2 \ \ \ \ \text{for Prasad-Rao}\\
&=&\frac{2D_{i}^2}{(\psi + D_{i})^3}[\sum\limits_{j=1}^m (\psi + D_{j})^{-2}]^{-1} \ \ \ \ \text{for ML and REML}\\
&=& \frac{2D_{i}^2m}{(\psi + D_{i})^3}[\sum\limits_{j=1}^m (\psi + D_{j})^{-1}]^{-2} \ \ \ \text{for Fay-Herriot}
\end{eqnarray*}
Finally we list below the second order approximated estimates of the MSPE described above:
\begin{eqnarray*}
\widehat{MSPE(\widehat{\eta}_{i})} &=& g_{1i}(\widehat{\psi}_{PR}) + g_{2i}(\widehat{\psi}_{PR}) + 2g_{3i,PR}(\widehat{\psi}_{PR}) \ \ \ \text{for Prasad-Rao}\\
\widehat{MSPE(\widehat{\eta}_{i})} &=& g_{1i}(\widehat{\psi}_{REML}) + g_{2i}(\widehat{\psi}_{REML}) + 2g_{3i,REML}(\widehat{\psi}_{REML}) \ \ \ \text{for REML}\\
\widehat{MSPE(\widehat{\eta}_{i})} &=& g_{1i}(\widehat{\psi}_{ML}) + g_{2i}(\widehat{\psi}_{ML}) + 2g_{3i,ML}(\widehat{\psi}_{ML}) \\
&& -(\frac{D_{i}}{\widehat{\psi}_{ML}+ D_{i}})^2 \lbrace\sum\limits_{j=1}^m (\widehat{\psi}_{ML}+D_{j})^{-2}\rbrace^{-1} \\
&& \times tr\lbrace (\sum\limits_{j=1}^m \frac{x_{j}x_{j}^T}{\widehat{\psi}_{ML}+D_{j}})^{-1} \sum\limits_{j=1}^m\frac{x_{j}x_{j}^T}{(\widehat{\psi}_{ML}+ D_{j})^2} \rbrace \ \ \ \ \text{for ML}\\
\widehat{MSPE(\widehat{\eta}_{i})}&=& g_{1i}(\widehat{\psi}_{FH}) + g_{2i}(\widehat{\psi}_{FH}) + 2g_{3i,FH}(\widehat{\psi}_{FH}) \\
&&-2(\frac{D_{i}}{\widehat{\psi}_{FH}+D_{i}})^2 \lbrace \sum\limits_{j=1}^m(\widehat{\psi}_{FH} + D_{j})^{-1} \rbrace^{-3} \\
&& \times [m\sum\limits_{j=1}^m(\widehat{\psi}_{FH} + D_{j})^{-2} - \lbrace \sum\limits_{j=1}^m(\widehat{\psi}_{FH} + D_{j})^{-1} \rbrace^2] \ \ \ \text{for Fay-Herriot}
\end{eqnarray*}
For the details of the derivation of these results please see the references.
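As a concrete illustration, the sketch below transcribes $g_{1i}$, $g_{2i}$ and the ML/REML form of $g_{3i}$ into R and evaluates the second order MSPE approximation at a hypothetical value $\psi=1$; in practice $\psi$ would be replaced by one of the estimates of Section 1.4. This is a hand transcription of the formulas above, not the package's internal code.
<<>>=
# Sketch: the g-terms of the MSPE approximation, at a hypothetical psi.
g1=function(psi,D) psi*D/(psi+D)
g2=function(psi,D,X){
  A=solve(t(X) %*% (X/(psi+D)))     # [sum_j x_j x_j^T/(psi+D_j)]^{-1}
  (D/(psi+D))^2 * apply(X,1,function(x) drop(t(x) %*% A %*% x))
}
g3.reml=function(psi,D) 2*D^2/((psi+D)^3*sum((psi+D)^(-2)))  # ML/REML case
X=cbind(1,c(1,2,4,4,1),c(2,1,3,1,5))  # hypothetical design
D=c(0.5,0.7,0.8,0.4,0.5)              # hypothetical sampling variances
# second order MSPE approximation g1 + g2 + 2*g3, evaluated at psi = 1
g1(1,D) + g2(1,D,X) + 2*g3.reml(1,D)
@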
\section{Examples}
Now let us look at some simple examples.
\subsection{First Example: prasadraoest}
In the first example we demonstrate what our function \texttt{prasadraoest} does. This is an auxiliary function of our package, so it has not been designed to be very user friendly; its main purpose is to compute the Prasad-Rao estimate of the variance component $\psi$ mentioned in Section 1.4. The arguments that this function takes are as follows:
\begin{itemize}
\item response: the $y$ vector. This must be a numeric vector.
\item designmatrix: the matrix $X$ (the matrix of covariates, whose first column needs to have all entries equal to one). This must be a numeric matrix.
\item sampling.var: the vector consisting of the $D_{i}$ values. This must be a numeric vector.
\end{itemize}
<<>>=
library(smallarea)
response=c(1,2,3,4,5) # response vector
designmatrix=cbind(c(1,1,1,1,1),c(1,2,4,4,1),c(2,1,3,1,5))
# designmatrix with 5 rows and 3 columns,
# the first column has all entries equal to one
sampling.var=c(0.5,0.7,0.8,0.4,0.5) # the vector of sampling variances
answer=prasadraoest(response,designmatrix,sampling.var)
answer
answer$estimate
@
This function returns a list with only one element in it: the estimate of the variance component.
\subsection{Second Example: fayherriot}
This function is pretty similar to the \texttt{prasadraoest} function in appearance, i.e.\ it takes the same arguments (example 1 describes the arguments in detail) and returns a list with only one element, the estimate of the variance component, except that the method of estimation is the one proposed by Fay and Herriot; the formula is given in Section 1.4. That is, the estimate is obtained by numerically solving equation (3) using the \texttt{uniroot} function in the \texttt{stats} package in R. \texttt{uniroot} searches for a root of the equation in an interval, which we have specified as $(0,\widehat{\psi}_{PR}+3\sqrt{m})$. If no root is found in that interval, we truncate our $\widehat{\psi}_{FH}$ at 0.0001, as suggested by Datta, Rao and Smith (2005). To demonstrate the working of this function, we use the same data as for \texttt{prasadraoest}.
<<>>=
response=c(1,2,3,4,5)
designmatrix=cbind(c(1,1,1,1,1),c(1,2,4,4,1),c(2,1,3,1,5))
sampling.var=c(0.5,0.7,0.8,0.4,0.5)
fayherriot(response,designmatrix,sampling.var)
@
\subsection{Third Example: maximlikelihood}
This example demonstrates the function \texttt{maximlikelihood}. The arguments are the same as in the last two examples. It returns a list that has three elements:
\begin{itemize}
\item estimate: $\widehat{\psi}_{ML}$.
\item reg.coefficients: a vector of the MLEs of the regression coefficients, i.e.\ the $\beta$ vector.
\item loglikeli.optimum: the value of the log likelihood function at its maximum.
\end{itemize}
We have used the \texttt{optim} function in the \texttt{stats} package in R with the BFGS algorithm to minimize the negative log likelihood. The maximum number of iterations of the algorithm is 100 (the default in \texttt{optim}). Since the likelihood function is a normal likelihood, care should be taken to check the normality assumptions of the data. Here the response has been generated from a normal distribution with mean 3 and standard deviation 1.5. The design matrix and the sampling variances are kept the same as in the last two examples.
<<>>=
set.seed(55)
response=rnorm(5,3,1.5)
designmatrix=cbind(c(1,1,1,1,1),c(1,2,4,4,1),c(2,1,3,1,5))
sampling.var=c(0.5,0.7,0.8,0.4,0.5)
maximlikelihood(response,designmatrix,sampling.var)
@
\subsection{Fourth Example: resimaxilikelihood}
The setup of this function is again very similar to the first two examples, except that here there is one additional argument, \texttt{maxiter}, in which the user can specify the maximum number of Fisher scoring iterations. Using Fisher's scoring iteration described in equation (4) in Section 1.4, this function finds the REML estimate of the variance component. A description of the data used is already given in the third example. It returns a list of two elements:
\begin{itemize}
\item estimate: $\widehat{\psi}_{REML}$
\item iterations: the number of Fisher scoring iterations
\end{itemize}
<<>>=
set.seed(55)
response=rnorm(5,3,1.5)
designmatrix=cbind(c(1,1,1,1,1),c(1,2,4,4,1),c(2,1,3,1,5))
sampling.var=c(0.5,0.7,0.8,0.4,0.5)
resimaxilikelihood(response,designmatrix,sampling.var,maxiter=100)
@
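For intuition about what \texttt{resimaxilikelihood} iterates, here is a minimal sketch of a single Fisher scoring update, a direct transcription of equation (4) with $B=I$; it is not the package's internal implementation. The data are the same hypothetical data as in the chunk above.
<<>>=
# Sketch: one Fisher scoring update for psi, per equation (4), with B = I.
scoring.step=function(psi,y,X,D){
  Vinv=diag(1/(psi+D))
  P=Vinv - Vinv %*% X %*% solve(t(X) %*% Vinv %*% X) %*% t(X) %*% Vinv
  IR=0.5*sum(diag(P %*% P))                                  # I_R(psi)
  sR=-0.5*sum(diag(P)) + 0.5*drop(t(y) %*% P %*% P %*% y)    # s_R(psi)
  psi + sR/IR                                                # updated iterate
}
set.seed(55)
y=rnorm(5,3,1.5)
X=cbind(1,c(1,2,4,4,1),c(2,1,3,1,5))
D=c(0.5,0.7,0.8,0.4,0.5)
scoring.step(1,y,X,D)   # a single update starting from psi = 1
@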
\subsection{Fifth Example: smallareafit}
This is the main function in our library. It takes the following arguments:
\begin{itemize}
\item formula: a formula similar in appearance to that of the \texttt{lm} function in R. You have to make sure that the data contain a column of the sampling variances, and that when specifying the formula the name of the variable containing the sampling variances precedes the covariates. For instance, in the following example \texttt{response$\sim$D+x1+x2} is a correct way of specifying the formula, whereas \texttt{response$\sim$x1+D+x2} is not. (Here D is the variable containing the sampling variances, and x1 and x2 are covariates.) In general, the first variable on the right hand side of $\sim$ will be treated as the vector of sampling variances.
\item data: an optional data.frame. In the absence of this argument our function will accept variables from the global environment.
\item method: one of the four methods discussed in this article, given as one of the options "PR", "FH", "ML", "REML".
\end{itemize}
For each of the four methods, the function returns a list with the following components:
\begin{itemize}
\item smallmean.est: a numeric vector of the EBLUPs of the small area means discussed in Section 1.7.
\item smallmean.mse: a numeric vector of the estimated mean squared prediction errors $\widehat{MSPE(\widehat{\eta}_{i})}$ discussed in Section 1.8.
\item var.comp: an estimate of the variance component discussed in Section 1.4.
\item est.coef: a numeric vector containing the estimates of the regression coefficients discussed in Section 1.5 and denoted by $\widehat{\beta}$, beginning with the intercept, in the order specified in the formula.
\end{itemize}
The following example illustrates what we have just discussed. Also note that the maximum number of Fisher scoring iterations has been set to 100 for the REML procedure.
<<>>=
data=data.frame(response=rnorm(5,3,1.5),
                x1=c(1,2,4,4,1),x2=c(2,1,3,1,5),D=c(0.5,0.7,0.8,0.4,0.5))
data
ans=smallareafit(response~D+x1+x2,data,method="FH")
ans1=smallareafit(response~D+x1+x2,data,method="REML")
ans2=smallareafit(response~D+x1+x2,data,method="PR")
ans3=smallareafit(response~D+x1+x2,data,method="ML")
ans # FH method
ans1 # REML method
ans2 # PR method
ans3 # ML method
@
\subsection{An example where smallareafit accepts variables from the global environment}
The next example shows \texttt{smallareafit} accepting variables from the global environment.
<<>>=
data=data.frame(response=rnorm(5,3,1.5),
                x1=c(1,2,4,4,1),D=c(0.5,0.7,0.8,0.4,0.5))
attach(data)
ans=smallareafit(response~D+x1,method="FH")
ans1=smallareafit(response~D+x1,method="REML")
ans2=smallareafit(response~D+x1,method="PR")
ans3=smallareafit(response~D+x1,method="ML")
ans
ans1
ans2
ans3
@
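As a sanity check, one can rebuild the EBLUP of Section 1.7 by hand from the documented components of a fit. The sketch below does this for the FH fit \texttt{ans} from the previous chunk; it assumes the components \texttt{var.comp}, \texttt{est.coef} and \texttt{smallmean.est} behave exactly as documented above, in which case the two columns should agree.
<<>>=
# Hypothetical cross-check: rebuild the EBLUP from the documented
# components of the FH fit `ans` obtained above.
psi.hat=ans$var.comp
beta.hat=ans$est.coef                    # intercept first, then x1
fixed=drop(cbind(1,x1) %*% beta.hat)     # fitted fixed part x_i^T beta.hat
eblup=fixed + (psi.hat/(psi.hat+D))*(response-fixed)
cbind(manual=eblup, package=ans$smallmean.est)
@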
\subsection{A final example}
Now we consider a more interesting example, taken from Datta, Rao and Smith (2005) (see the References). We take the total number of small areas to be $m=15$, $\psi=1$, and the sampling variance pattern $D_{i}$: 0.7, 0.6, 0.5, 0.4, 0.3. There are five groups $G_{1},\ldots,G_{5}$ with three small areas in each group, and the sampling variances $D_{i}$ are the same for areas within the same group. We investigate, through simulation, the area level model without covariates, $x_{i}^T\beta = \mu$. Since the mean squared error is translation invariant, we set $\mu=0$ without loss of generality. However, to account for the estimation of unknown regression parameters that arises in applications, we still estimate this zero mean. The random effects $v_{i}$ are generated from $N(0,1)$, and the sampling errors $e_{i}$ from $N(0, D_{i})$ for the $D_{i}$ specified above.
<<>>=
set.seed(55)
# the sampling variances
D=c(rep(0.7,3),rep(0.6,3),rep(0.5,3),rep(0.4,3),rep(0.3,3))
# generating the sampling errors
e1=rnorm(3,0,sqrt(D[1]))
e2=rnorm(3,0,sqrt(D[4]))
e3=rnorm(3,0,sqrt(D[7]))
e4=rnorm(3,0,sqrt(D[10]))
e5=rnorm(3,0,sqrt(D[13]))
e=c(e1,e2,e3,e4,e5)
psi=1
# generating the random small area effects
# (note that rnorm takes the standard deviation, hence sqrt(psi))
v=rnorm(15,0,sqrt(psi))
# response
y=v+e
data1=data.frame(response=y,D=D)
head(data1)
fit1.pr=smallareafit(response~D,data1,method="PR")
fit1.pr
@
We have used only the Prasad-Rao method in this last example; the other methods can be specified similarly.
\section{References}
\begin{itemize}
\item Datta, G. S., Rao, J. N. K. and Smith, D. D. (2005). On measuring the variability of small area estimators under a basic area level model. \textit{Biometrika} 92, 183--196.
\item Jiang, J. \textit{Large Sample Techniques for Statistics}. Springer Texts in Statistics.
\item Jiang, J. (1996). REML estimation: asymptotic behavior and related topics. \textit{Annals of Statistics} 24, 255--286.
\item Rao, J. N. K. (2003). \textit{Small Area Estimation}. Wiley.
\item Searle, S. R., Casella, G. and McCulloch, C. E. (2006). \textit{Variance Components}. Wiley Series in Probability and Statistics.
\end{itemize}
\section{Acknowledgments}
\begin{itemize}
\item \textbf{Snigdhansu Chatterjee, School of Statistics, University of Minnesota.}
\item \textbf{Galin Jones, School of Statistics, University of Minnesota.}
\item \textbf{Charles Geyer, School of Statistics, University of Minnesota.}
\end{itemize}
\end{document}