\documentclass[a4paper]{article} \usepackage{Sweave} \usepackage{lscape} \usepackage{color} \usepackage{colordvi} \usepackage{graphicx} \usepackage[round]{natbib}
\title{Nonparametric maximum likelihood estimation for random effect models in R\\ \vspace{1cm} \large Vignette to R package {\bf npmlreg}}
\vspace{2cm}
\author{Jochen Einbeck and John Hinde}
% R CMD build npmlreg --compact-vignettes
% in R:
% library(utils)
% Sweave("npml-R2.Stex")
% in WinEdt:
% latex npml-R2.tex
% in R:
% Stangle("npmlreg.Stex")
%\VignetteIndexEntry{NPML estimation for random effect models in R}
%\VignetteDepends{npmlreg, MASS, nlme}
%\VignetteKeyword{NPML}
\begin{document}
\begin{landscape}
\pagestyle{plain}
\maketitle \thispagestyle{empty}
\newpage
\tableofcontents
\newpage

\section{Introduction}

Nonparametric maximum likelihood (NPML) estimation is an attractive tool for the fitting of generalized linear models with random effects, which can be considered a special case of generalized linear mixed models (GLMMs). One crucial advantage of the NPML approach is that the random effect distribution does not need to be specified a priori, whereas the huge body of literature on GLMMs is restricted almost exclusively to normally distributed random effects. Further, complicated integrations are avoided by approximating the marginal likelihood by a simple finite mixture, for which standard fitting algorithms based on EM exist and can be applied.

NPML for generalized linear models with random effects was previously implemented by Aitkin \& Francis (1995) in the GLIM4 language, which is, however, no longer widely used. The main functions of this package, {\tt alldist} (for overdispersion) and {\tt allvc} (for variance component models), are modified and extended versions of their GLIM4 counterparts of the same name, and were originally translated to R by Ross Darnell. In this handbook, the concept of NPML estimation is briefly explained (Section \ref{method}), and a variety of data examples illustrating the functionality of {\tt alldist} and {\tt allvc} is given (Section \ref{examples}). The R package {\bf npmlreg} is available for download on CRAN at
\begin{center}
{\tt http://cran.r-project.org}.
\end{center}
\vspace{0.3cm}

{\em Key Words:} Varying coefficient models, random effect models, mixed models, mixture models, Gaussian quadrature, EM algorithm, two-level models, exponential family regression models.

\section{Random effect modelling with exponential family mixtures}\label{method}

Assume we are given a set of explanatory vectors $x_1, \ldots, x_n$ and a set of observations $y_1,\ldots,y_n$ sampled from an exponential family distribution\footnote{In the present implementation of {\tt alldist}, Gaussian, Poisson, Binomial, Gamma, and Inverse Gaussian distributed responses are supported.} $f(y_i|\beta,\phi_i)$ with dispersion parameter $\phi_i$. In a generalized linear model, predictors and response are assumed to be related through a link function $h$,
\[ \mu_i\equiv E(y_i|\beta,\phi_i)= h(\eta_i)\equiv h(x_i^{\prime}\beta), \]
and the variance $Var(y_i|\beta,\phi_i)= \phi_i v(\mu_i)$ depends on a function $v(\mu_i)$ which is entirely determined by the choice of the particular exponential family. However, the actual variance in the data is often larger than that implied by this strict mean-variance relationship. This effect is commonly called overdispersion. Possible reasons for overdispersion include correlation in the data or important explanatory variables that are not included in the model.
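To illustrate this effect numerically (a small base-R simulation of our own, not part of the package code), consider Poisson counts whose log-mean contains an unobserved random term; the marginal variance then clearly exceeds the marginal mean:
\begin{Scode}
set.seed(1)
z <- rnorm(1000, sd=0.5)        # unobserved heterogeneity between observations
y <- rpois(1000, exp(1 + z))    # Poisson counts with a random term in the log-mean
c(mean=mean(y), var=var(y))     # overdispersion: variance exceeds the mean
\end{Scode}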
In order to account for additional unexplained variability of the individual observations, a random effect $z_i$ with density $g(z)$ is included in the linear predictor\footnote{We refer to a model defined in this manner as a {\it generalized linear model with random effect}, or shorter, a {\it random effect model}, whereas the more general linear predictor $\eta_i=\beta'x_i+\gamma'_i\tilde{x}_i$, with $\gamma_i$ random and $\tilde{x}_i$ typically being a subvector of $x_i$, entails a {\it generalized linear mixed model}.}
\[ \eta_i=\beta'x_i+z_i. \]
The marginal likelihood can now be written as
\begin{equation} \label{marginal} L=\prod_{i=1}^n\int f(y_i|z_i,\beta,\phi_i)g(z_i)\,dz_i \end{equation}
and can be approximated by a finite mixture
\[ \prod_{i=1}^n\left\{\sum_{k=1}^Kf(y_i|z_k,\beta,\phi_k)\pi_k\right\} \equiv \prod_{i=1}^n\left\{\sum_{k=1}^Kf_{ik}\pi_k\right\}, \]
where $z_k$ are the mass points and $\pi_k$ their masses. The log-likelihood is then given by
\begin{equation} \label{ell} \ell= \sum_{i=1}^n\log\left\{\sum_{k=1}^K\pi_kf_{ik}\right\}. \end{equation}
The score equations
\begin{equation} \label{score} \frac{\partial \ell}{\partial z_k}=0,\quad \frac{\partial \ell}{\partial \beta}=0, \quad\frac{\partial \ell}{\partial \phi_k}=0, \end{equation}
turn out to be weighted versions of the single-distribution score equations, with weights
\begin{equation} \label{weights} w_{ik}=\frac{\pi_kf_{ik}}{\sum_{\ell}\pi_{\ell}f_{i\ell}}. \end{equation}
The weights $w_{ik}$ can be interpreted as posterior probabilities that the observation $y_i$ comes from component $k$. The score equation for the mixture proportions, obtained by maximizing $\ell$ subject to the constraint $\sum_k\pi_k=1$ with Lagrange multiplier $\lambda$,
\[ \frac{\partial}{\partial \pi_k}\left\{\ell-\lambda\Big(\sum_k\pi_k-1\Big)\right\}=0, \]
gives the ML estimate
\begin{equation} \label{pik} \hat{\pi}_k=\frac{1}{n}\sum_iw_{ik}, \end{equation}
which can be nicely interpreted as the average posterior probability for component $k$. The parameters $\phi_k$, $\beta$, $z_k$, and $\pi_k$ can now be estimated simultaneously by a standard EM algorithm:
\begin{description}
\item[{\bf Starting points}] Select starting values $\phi^{(0)}_k$, $\beta^{(0)}$, $z^{(0)}_k$, and $\pi^{(0)}_k$, $k=1, \ldots, K$.
\item[{\bf E-Step}] Adjust the weights using formula (\ref{weights}) with the current parameter estimates.
\item[{\bf M-Step}] Update the parameter estimates by fitting a weighted GLM with weights $w_{ik}$, including the mass points as dummy variables.
\end{description}
In the special case of a normally distributed random effect, one can employ tabulated Gauss-Hermite integration points and their corresponding masses for $z_k$ and $\pi_k$, respectively, and consider these values as constants (Hinde, 1982). Otherwise, they have to be calculated simultaneously during the EM algorithm as outlined above (one then usually takes the Gauss-Hermite points/masses as starting points, which are scaled outwards ({\tt tol} $>1$) or inwards ({\tt tol} $<1$) by means of a scaling parameter {\tt tol}). As in this case no parametric specification of the random effect distribution is necessary, one refers to this method as `Nonparametric Maximum Likelihood' (NPML) estimation (Laird, 1978), which was adapted to the framework of overdispersed generalized linear models by Aitkin (1996a). In contrast to the original implementation in GLIM4, we use a `damping' procedure in the initial cycles of the algorithm, which reduces the sensitivity of the EM algorithm to the optimal choice of {\tt tol} for exponential family densities possessing a dispersion parameter (such as the Gaussian or Gamma).
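To make the E-step concrete, the following minimal sketch (our own illustration, not the internal code of {\tt alldist}) computes the weight matrix $(w_{ik})$ of formula (\ref{weights}) from an $n\times K$ matrix of component densities $f_{ik}$ and a vector of masses $\pi_k$; the function name {\tt e.step} and the toy data are hypothetical.
\begin{Scode}
e.step <- function(f, pi.k){       # f: n x K matrix of f_ik; pi.k: masses pi_k
  num <- sweep(f, 2, pi.k, "*")    # numerator pi_k * f_ik
  num/rowSums(num)                 # normalize rows to obtain the weights w_ik
}
y <- c(-1.2, 0.1, 1.4)                        # toy responses
f <- cbind(dnorm(y, -1), dnorm(y, 1))         # two normal components at z = -1, 1
round(e.step(f, pi.k=c(0.5, 0.5)), digits=3)  # posterior probabilities per row
\end{Scode}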
For technical details on the implementation of the algorithm, see Einbeck \& Hinde (2006).

\section{Examples}\label{examples}

\subsection{Finite Gaussian mixtures: The galaxy data}

The data considered in this example are the recession velocities (in $km/s$) of 82 galaxies receding from our own, sampled from six well-separated conic sections of space. The full data were given by Postman et al. (1986). They are part of the R package {\bf MASS} as data set {\tt galaxies}. Note that, in this data set, there is a typo in the 78th observation, which should be 26960 instead of 26690. We correct this to obtain results consistent and comparable with those presented in Aitkin, Francis \& Hinde (2005) and other references.
%1
\begin{Scode}
data(galaxies, package="MASS")
galaxies[78]<-26960
gal<-as.data.frame(galaxies)
rm(galaxies)
\end{Scode}
Next, we construct a new variable {\tt v1000} from {\tt galaxies}, which represents the velocity in units of $10^3\,km/s$:
%2
\begin{Scode}
gal$v1000<- gal$galaxies/1000
gal$v1000
\end{Scode}
\noindent and load the {\bf npmlreg} package:
%3
\begin{Scode}
library(npmlreg)
\end{Scode}
Fitting a simple constant normal model yields
%4
\begin{Scode}
glm(v1000~1,data=gal)
\end{Scode}
\noindent which is the same as NPML estimation with one mass point, fitting a `mixture' of one normal component:
%5
\begin{Scode}
(galaxy.np1 <- alldist(v1000~1,random=~1,random.distribution='np',k=1,data=gal))
\end{Scode}
The option {\tt data} is mandatory, even if the data frame has been {\tt attach}ed to the workspace!
%Note that for the normal distribution with one mass point, the disparity $Disp=-2\log L$ and the deviance $D=-2\sigma^2(\log L- \log L_{sat})$
%have the simple relationship
%\[
%Disp= n\left\{1+\log(2\pi) +\log\left(\frac{D}{n}\right)\right\},
%\]
%where $\log$ is the natural logarithm and $n$ the number of observations. Indeed, defining the deviance/disparity calculator
%{\tt dev2disp<-function(D,n)\{ n*(1+log(2*pi)+log(D/n)) \}}
%one gets
The deviance can be obtained by
%6
\begin{Scode}
galaxy.np1$dev
\end{Scode}
\noindent which is certainly the same as for the GLM. Next, we fit discrete mixtures $\sum_{k=1}^K\pi_kf_k$, where the $f_k$ are normal densities with expectation $\mu_k$ and unknown, but equal, variances $\sigma^2=\sigma^2_k$. Fitting models with $K=2$, 3, and 4 mass points, one obtains
%7
\begin{Scode}
(galaxy.np2 <- alldist(v1000~1,random=~1,random.distribution='np',k=2,data=gal))
(galaxy.np3 <- alldist(v1000~1,random=~1,random.distribution='np',k=3,data=gal))
(galaxy.np4 <- alldist(v1000~1,random=~1,random.distribution='np',k=4,data=gal))
\end{Scode}
and observes a steady decrease in disparity, i.e. $-2\log L$. As a by-product, the {\tt alldist} routine produces a plot showing how the disparity converges (Fig. 1 top), and another plot showing the EM trajectories (Fig. 1 bottom).
%8
\begin{minipage}{21cm}
%\includegraphics[height=13cm,width=11cm]{4forweb.ps}
\begin{Scode}{fig=TRUE, echo=FALSE, height=5}
plot(galaxy.np4, plot.opt=3)
\end{Scode}
Fig. 1: Convergence of the EM algorithm for NPML estimation with 4 mass points. Top: disparities; bottom: EM trajectories.
\end{minipage}
As {\tt random.distribution='np'} is the default setting, it can be omitted.
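For instance, the four mass point model above could equally have been obtained by the shorter call
\begin{Scode}{eval=FALSE}
galaxy.np4 <- alldist(v1000~1, random=~1, k=4, data=gal)
\end{Scode}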
For 5 to 9 mass points, we only report the disparity values
%9
\begin{Scode}
(galaxy.np5 <- alldist(v1000~1,random=~1,k=5,data=gal, verbose=FALSE))$disp
(galaxy.np6 <- alldist(v1000~1,random=~1,k=6,tol=0.2,data=gal,verbose=FALSE))$disp
(galaxy.np7 <- alldist(v1000~1,random=~1,k=7,tol=0.12,data=gal,verbose=FALSE))$disp
(galaxy.np8 <- alldist(v1000~1,random=~1,k=8,tol=0.2,data=gal,verbose=FALSE))$disp
(galaxy.np9 <- alldist(v1000~1,random=~1,k=9,tol=0.06,data=gal,verbose=FALSE))$disp
\end{Scode}
indicating that the disparity stabilizes at about 8 mass points. Note that in some cases it was necessary to modify the optional parameter {\tt tol} to obtain the disparity values given above. The {\tt tol} parameter influences the position of the starting points, where values {\tt tol} $< 1$ yield starting points that are more concentrated than under the default setting (Gaussian quadrature points). The disparity values for 2 and 5 mass points are better than those obtained by Aitkin (2001) with GLIM 4. One reason for this is the damping procedure: as the algorithm is less sensitive to the optimal choice of {\tt tol}, the optimal solutions are found more easily. An assisting tool for the selection of {\tt tol} is the R function {\tt tolfind} included in the package {\bf npmlreg}.

To fit a Gaussian mixture with unequal standard deviations $\sigma_k$, $k=1, \ldots, K$, varying over the components, the possibility of smoothing the standard deviations among components is implemented. Smoothing is performed by means of the discrete kernel
\[ W(x,y|\lambda)=\left\{\begin{array}{lcc}\lambda &\mbox{if} & y=x\\ (1-\lambda)/(K-1) &\mbox{if} &y \not= x\\ \end{array}\right. \]
(Aitchison and Aitken, 1976); a small numerical illustration of this kernel is given below. The setting $\lambda=1/K$ corresponds to the extreme case of `maximal smoothing' (i.e. equal variances $\sigma^2=\sigma^2_k$), while $\lambda = 1$ means that all standard deviations are calculated within the components (i.e. unequal variances $\sigma^2_k$). Statistically sensible settings are only $1/K \le \lambda \le 1$. The default setting $\lambda= 0$ is automatically mapped to $\lambda =1/K$. As an example, we compute the four mass point model with option {\tt lambda=1}
%10
\begin{Scode}
summary(galaxy.np4u <- alldist(v1000~1, random=~1, k=4, tol=0.5, data=gal, lambda=1, verbose=FALSE))
\end{Scode}
One gains deeper insight into the fitted model by looking at diagnostic plots. Calling
%11
\begin{Scode}{fig=FALSE, eval=FALSE, echo=TRUE}
plot(galaxy.np4u, plot.opt=15, height=5)
\end{Scode}
\noindent gives the disparities and EM trajectories as above (Fig. 2 top), and additionally two plots showing the empirical Bayes predictions vs. the true responses, and the component posterior probabilities ($w_{ik}$) against the fixed part residuals ($y_i-x_i\hat{\beta}$), respectively. In the former plot (Fig. 2 bottom left), one sees nicely how the predicted values are `flattened' within the clusters and smoothed between them. In the latter plot (Fig. 2 bottom right), one gets an impression of the discriminatory power of the mixture components. Throughout all plots, each colour corresponds to the same mass point.
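Returning to the discrete kernel introduced above, the following small sketch (our own illustration, not the code used internally by {\tt alldist}; the function name {\tt smooth.var} and the example values are hypothetical) shows how kernel weights of this type average a set of component variances:
\begin{Scode}
smooth.var <- function(sigma2, lambda){    # sigma2: vector of K component variances
  K <- length(sigma2)
  W <- matrix((1-lambda)/(K-1), K, K)      # off-diagonal kernel weights
  diag(W) <- lambda                        # diagonal kernel weight
  as.vector(W %*% sigma2)                  # kernel-weighted averages
}
smooth.var(c(0.5, 1, 2, 4), lambda=1)      # lambda = 1: variances unchanged
smooth.var(c(0.5, 1, 2, 4), lambda=1/4)    # lambda = 1/K: all variances averaged
\end{Scode}
With $\lambda=1$ the component variances are returned unchanged, while $\lambda=1/K$ replaces each of them by their common average, in line with the two extreme cases described above.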
An important remark should be made here: the interpretation (and definition!) of the deviance $D=-2\phi\log L + 2\phi\log L_{saturated}$, provided in {\tt \$deviance}, is not clear when using unequal dispersion parameters. In the present implementation, deviances are in this case calculated in a somewhat makeshift manner, using the {\it equal} dispersion parameter for the $\phi$ preceding the log-likelihood, and the {\it unequal} dispersion parameters within the log-likelihood. Hence, it is strongly recommended to work with the disparity rather than with the deviance in the case of unequal component dispersion parameters, as the disparity does not share this problem. This is also the reason why we prefer to work with disparities in general, and why disparities (and not deviances) are displayed in the summaries.
%12
\begin{minipage}{21cm}
%\includegraphics[height=13cm,width=15cm]{4uforweb.ps}
\begin{Scode}{fig=TRUE, echo=FALSE,height=4}
plot(galaxy.np4u, plot.opt=15)
\end{Scode}
Fig. 2: Diagnostic plots for NPML estimation with unequal variances and 4 mass points.
\end{minipage}
One might fear that for a high number of mass points some component standard deviations could tend to zero. This can indeed be the case. Using 8 instead of 4 mass points in the call above, one gets the error message
\vspace{0.3cm}
{\tt \noindent Error in alldist(v1000 ~ 1, random = ~1, k = 8, tol = 0.5, data = gal, :\\ \indent Singularity or Likelihood-Spike at iteration \#12. \\ \indent\indent Check model specification, enable spike protection or smooth among components. \\ }
%Here {\tt \$sdev} gives the 'overall' standard deviation (calculated over all components given the fitted model), and {\tt \$sdevk} the component specific standard deviation.
%Attention: The 8th mass point is associated with variance zero, resulting in the useless disparity value -373.2. Such 'likelihood spikes' (Aitkin, Francis \& Hinde, 2005, p. 428)
%can produce arbitrarily high likelihoods and consequently arbitrarily small disparities,
%which are certainly not at all meaningful.
This problem may be solved, as a first attempt, by modifying {\tt tol}. In this case, {\tt tol=0.32} gives a likelihood-spike-free solution with disparity 357.8, which is considerably better than the value of 361.0 given in Aitkin (2001). If likelihood spikes occur for any {\tt tol}, one can enable the spike protection ({\tt spike.protect=TRUE}), which stops the algorithm as soon as one component starts to enter a likelihood spike. For instance, running
%13
\begin{Scode}
(galaxy.np8us <- alldist(v1000~1, random=~1, k=8, tol=0.5, data=gal, lambda=1, verbose=FALSE, spike.protect=TRUE))
galaxy.np8us$sdev$sdevk
\end{Scode}
\noindent gives us estimates of the mass points, masses, and standard deviations of the mixture components. These values have to be interpreted with care, as the displayed disparity is normally not correct when the algorithm has not converged. One notices from this output that the 8th mass point is responsible for the likelihood spike. A better approach is to set the smoothing parameter to $\lambda=0.99$, which corresponds to unequal standard deviations with a very small amount of smoothing among components:
%14
\begin{Scode}
(galaxy.np8ud <- alldist(v1000~1, random=~1, k=8, tol=0.5, data=gal, lambda=0.99))
galaxy.np8ud$sdev$sdevk
\end{Scode}
The main motivation for implementing {\tt spike.protect} is to enable {\tt tolfind} to run without breaking down when likelihood spikes occur. Hence, it is switched off by default in {\tt alldist}, and switched on by default in {\tt tolfind}. The result of {\tt tolfind} for the 8 mass point model with unequal variances is shown in Fig. 3:
Red circles correspond to {\tt tol} values where the spike protection had to intervene, and hence the EM algorithm did not converge. Only disparity values associated with green circles are reliable, and the optimal value of {\tt tol} should consequently be chosen from among them.
%15
\begin{minipage}{21cm}
\begin{Scode}{fig=TRUE, height=2, width=4}
par(mfrow=c(1,1), cex=0.65)
tolfind(v1000~1, random=~1, k=8, data=gal, lambda=1, find.in.range=c(0.0,0.6), steps=12, plot.opt=0, verbose=FALSE, noformat=TRUE)[c(3,4)]
\end{Scode}
%\includegraphics[height=6cm,width=11cm]{tolfindgal.eps}
Fig. 3: Disparity against {\tt tol} for the 8 mass point model with unequal variances. Green circles: EM converged; red circles: EM not converged.
\end{minipage}

\subsection{Compound Poisson models: The fabric faults data}

In this section we consider the fabric faults data, previously analyzed in Hinde (1982) and Aitkin, Francis \& Hinde (2005, p. 453ff). These data describe the number of faults in rolls of fabric, with a single covariate {\tt leng} for the length of the roll. The log-length is directly given by the variable {\tt x}. The number of faults {\tt y} can be assumed to follow a Poisson distribution. First, we fit generalized linear Poisson models with the natural log link
%15
\begin{Scode}
data(fabric)
(faults0 <- glm(y ~ 1, family=poisson(link=log),data=fabric))
(faults1 <- glm(y ~ x, family=poisson(link=log),data=fabric))
\end{Scode}
and observe a large reduction in deviance when including the log length. Fits of count data with Poisson models are often quite poor, as the basic assumption underlying a Poisson model, equality of mean and variance, is often violated. As a solution, Hinde (1982) proposed to model the unexplained variation by means of a Gaussian random effect $Z$. In the case of the fabric faults data, one assumes that the number of faults, conditional on the covariates and on the random effect, follows a Poisson distribution, i.e.
\[ Y|X_1, \ldots, X_n,Z \sim Po(\mu), \]
where $Z\sim N(0,1)$ and
\[ \log(\mu)=\beta_0+\beta_1\log({\tt leng})+\sigma Z. \]
Integrating out the random effect as in (\ref{marginal}), one obtains a Poisson/normal compound distribution, which can be approximated with Gaussian quadrature (GQ). For one, two, and three mass points one obtains, with the log length as covariate:
%16
\begin{Scode}
(faults.g1<- alldist(y ~ x, family=poisson(link=log), random=~1, data= fabric,k=1, random.distribution="gq"))
(faults.g2<- alldist(y ~ x, family=poisson(link=log), random=~1, data= fabric,k=2, random.distribution="gq"))
(faults.g3<- alldist(y ~ x, family=poisson(link=log), random=~1, data= fabric,k=3, random.distribution="gq",verbose=FALSE))
\end{Scode}
The one mass point model is equivalent to the model {\tt faults1} given above, which can also be verified by checking the deviance
%17
\begin{Scode}
faults.g1$dev
\end{Scode}
For a Poisson model, deviance and disparity are related by the equation $D=Disp+2\log L_{sat}$ and are consequently equal up to an additive constant, namely $-2\log L_{sat}$ (minus twice the saturated log-likelihood), which in our case takes the value $123.30$. Thus, the disparity 174.3 for three mass points corresponds exactly to the deviance value of 51.0 reported in Hinde (1982), p. 119.
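The additive constant can also be recovered directly from the fitted object, which stores both quantities:
\begin{Scode}
faults.g3$disparity - faults.g3$dev
\end{Scode}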
For comparison, one can also fit the two and three mass point models with NPML:
%18
\begin{Scode}
(faults.np2<- alldist(y ~ x, family=poisson(link=log), random=~1, data= fabric,k=2, random.distribution="np"))
(faults.np3<- alldist(y ~ x, family=poisson(link=log), random=~1, data= fabric,k=3, random.distribution="np",verbose=FALSE))
\end{Scode}
The disparities are not far from those obtained with Gaussian quadrature, indicating that the random effect distribution is not very far from a normal distribution. While three mass points seem to be adequate for GQ, only two mass points are needed with NPML. Note that the use of option {\tt random.distribution="np"} yields an object of class {\tt glmmNPML}, while option {\tt random.distribution="gq"} yields an object of class {\tt glmmGQ}. Predictions for objects of class {\tt glmmGQ} can be obtained by
%19
\begin{Scode}
predict(faults.g2, type="response",newdata=fabric[1:6,])
\end{Scode}
or
%20
\begin{Scode}
predict(faults.g2, type="response")[1:6]
\end{Scode}
which both call the function {\tt predict.glmmGQ}. The results of the two predictions differ, since in the first case prediction is done using the analytical mean of the marginal distribution, considering {\tt fabric[1:6,]} as `new' input data (Aitkin, Francis \& Hinde, 2005, p. 459), and in the second case via an empirical Bayes approach (Aitkin, 1996b), using the individual posterior probabilities obtained as a by-product of the EM algorithm.

\subsection{Logistic regression with random effects: The toxoplasmosis data}

The toxoplasmosis data, also called the rainfall data, are available via
%21
\begin{Scode}
data(rainfall)
rainfall$x<-rainfall$Rain/1000
rainfall$x2<- rainfall$x^2; rainfall$x3<- rainfall$x^3
\end{Scode}
The data set gives the number of subjects {\tt Cases} out of {\tt Total} testing positive for toxoplasmosis in each of 34 cities in El Salvador, with annual rainfall {\tt x} in 1000 mm. The data have been analyzed in Efron (1986) using generalized linear models and in Aitkin \& Francis (1995) using the GLIM 4 implementation of NPML. Fitting, as did the latter authors, a constant logistic overdispersion model with three mass points, one obtains
%22
\begin{Scode}
(toxo.np<- alldist(cbind(Cases,Total-Cases)~1, random=~1, data=rainfall, k=3, family=binomial(link=logit)))
\end{Scode}
The result is approximately the same as that obtained by Aitkin \& Francis (1995). However, note that the disparity
%23
\begin{Scode}
toxo.np$disparity
\end{Scode}
differs from the GLIM 4 result $947.89$. The disparity for binomial models provided by GLIM 4 has to be interpreted as $-2\log L+c$, where $c$ is some additive constant depending only on the values of {\tt y} and {\tt n}, while the disparity given by this R implementation is just $-2\log L$. Adding rainfall as a fixed effect, we fit a linear random effect model
%24
\begin{Scode}
(toxo.npx<- alldist(cbind(Cases,Total-Cases)~x, random=~1, data=rainfall, k=3, family=binomial(link=logit)))
\end{Scode}
The decrease in disparity compared to the constant model is only 0.3 on 1 df (and 5.1 on 3 df for a cubic model).
%for the linear model and 5.03 on 3df for the cubic model.
We also try a random slope
%25
\begin{Scode}
(toxo.npxx<- alldist(cbind(Cases,Total-Cases)~x, random=~x, data=rainfall, k=3, family=binomial(link=logit)))
\end{Scode}
giving only a negligible decrease in disparity compared to the linear fixed effect model.
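The disparity reductions quoted above can be computed directly from the fitted objects:
\begin{Scode}
toxo.np$disparity - toxo.npx$disparity     # effect of adding rainfall (1 df)
toxo.npx$disparity - toxo.npxx$disparity   # additional effect of the random slope
\end{Scode}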
All in all, when accounting for overdispersion, there is no overwhelming evidence that rainfall has any significant influence on the incidence of toxoplasmosis. For the simple constant model, the posterior probabilities $w_{ik}$
%26
\begin{Scode}
round(t(toxo.np$post.prob),digits=2)
\end{Scode}
show how the observations are allocated to the mass points, indicating that actually only one observation (the 30th) represents the 3rd mass point. From the posterior probabilities, one also obtains the empirical Bayes predictions $ \tilde{\eta}_i=\sum_k \hat{\eta}_{ik}\hat{w}_{ik} $ as in Aitkin (1996b), from which the predicted toxoplasmosis incidence probabilities $\tilde{p}_i=\exp(\tilde{\eta}_i)/(1+\exp(\tilde{\eta}_i))$ can be calculated:
%27
\begin{Scode}
round(toxo.ebp<-toxo.np$ebp,digits=3)
round(exp(toxo.ebp)/(1+exp(toxo.ebp)),digits=4)
\end{Scode}
This can be done more easily using the generic {\tt predict} function,
%28
\begin{Scode}
predict(toxo.np, type="response")
\end{Scode}
or, even quicker,
\begin{Scode}
fitted(toxo.np)
\end{Scode}
which call the function {\tt predict.glmmNPML} for an object of class {\tt glmmNPML}. The {\tt predict} function can also be used to obtain predictions for new input values, e.g. for the linear random effect model:
%29
\begin{Scode}
predict(toxo.npx,type="response",newdata=data.frame(x=2))
\end{Scode}

\subsection{Modelling mixtures of Gamma distributions: The hospital stay data}

The hospital stay data are a sample from a larger data set collected on persons discharged from a Pennsylvania hospital as part of a retrospective chart review of antibiotic use in hospitals (Rosner, 2000, p. 39). Relevant covariates of the data set are {\tt temp1} (the first measured temperature following admission, in degrees Fahrenheit) and {\tt age}, and the response is the duration of hospital stay. We read the data in and fit a three mass point model via
%30
\begin{Scode}
data(hosp)
(fitnp3<- alldist(duration~age+temp1, data=hosp,k=3, family=Gamma(link=log),tol=0.2))
\end{Scode}
giving the estimated shape parameter
%31
\begin{Scode}
fitnp3$shape
\end{Scode}
(Certainly, all three component shape parameters listed at {\tt \$shapek} are equal, as by default all components are assumed to have the same dispersion parameter.) For comparison, a three mass point mixture of exponentials (i.e. {\tt shape=1}) is significantly inferior, yielding
%32
\begin{Scode}
(fitnp3e<- alldist(duration~age+temp1, data=hosp,k=3, family=Gamma(link=log),tol=0.2,shape=1))
\end{Scode}
For a more detailed analysis of this data set see Einbeck \& Hinde (2006).

One important remark should still be made. When employing the damped EM algorithm, one will very occasionally observe rising disparities, typically from about the 10th to the 30th EM iteration. According to EM theory, this should not happen, as each EM iteration is guaranteed not to decrease the likelihood. The phenomenon occurs because the damping step modifies the likelihood, so that the theoretical basis for EM no longer applies. {\it Damped} EM is actually only {\it asymptotically} EM, as the effect of damping vanishes for a large number of iterations. What constitutes a `large' number of iterations depends on the application and on {\tt tol}. In general, rising disparities are more likely to be observed for small values of {\tt tol} than for large values, and more likely to be observed for Gamma than for Gaussian mixtures. Though these rising disparities are not really a problem, as at convergence the number of iterations is generally high enough, one might want to avoid them for aesthetic reasons.
This can be achieved by setting the constant {\tt damp.power} in the formula
\[ d_j= 1-(1-\mbox{\tt tol})^{\mbox{\tt damp.power}\cdot\mbox{\tt iter}+1} \]
to a value bigger than one, where {\tt iter} $\equiv j$ is the iteration number and $d_j$ is the multiplicative constant adjusting the dispersion parameter (Einbeck \& Hinde, 2006). Another alternative is certainly to switch off damping altogether (option {\tt damp=FALSE}), but then the fitting procedure will not work at all in some cases.

\subsection{Variance component models: The Oxford school boys data}

This data set, also analyzed in Goldstein (2003), contains the heights of 26 boys in Oxford, measured on nine occasions over two years. The data set is contained in the R package {\bf nlme} and can be loaded and plotted via
%33
\begin{minipage}{21cm}
%\includegraphics[height=12cm,width=23cm]{boys.ps}
\begin{Scode}{fig=TRUE, height=4}
data(Oxboys, package = "nlme")
Oxboys$boy <- gl(26,9)
plot(Oxboys$age[Oxboys$boy==1],Oxboys$height[Oxboys$boy==1],ylim=c(125,175),type='b',pch=1,xlab='age',ylab='height')
for (i in 2:nlevels(Oxboys$Subject)){lines(Oxboys$age[Oxboys$boy==i],Oxboys$height[Oxboys$boy==i], pch=1,type='b',col=i)}
\end{Scode}
Fig. 4: Oxford boys data.
\end{minipage}
The boys represent the upper-level units (primary sampling units, PSU), and the particular measurements at different time points correspond to the lower-level units (secondary sampling units, SSU). Fitting a variance component model with Gaussian quadrature (20 mass points), one gets
%34
\begin{Scode}
(Oxboys.g20 <- allvc(height~age,random=~1|boy,data=Oxboys,random.distribution='gq',k=20))
\end{Scode}
This is no satisfactory solution, since fitting the same data with function {\tt lmer} in R package {\bf lme4} gives a disparity of 940.6, as pointed out by Einbeck, Hinde \& Darnell (2007); a sketch reproducing this value is given at the end of this subsection. It turns out that a huge number of mass points, $K \approx 500$, is needed in this example to get down to a similar disparity. We have observed this phenomenon on other occasions as well, and it seems to occur only if the intra-class correlation (ICC), given by
\[ ICC= \frac{\sigma_z^2}{\sigma_z^2+\sigma^2}, \]
is quite large. For example, for the model fitted above it is
%35
\begin{Scode}
Oxboys.g20$rsdev^2/(Oxboys.g20$rsdev^2+ Oxboys.g20$sdev$sdev^2)
\end{Scode}
which is a very large value. We have not observed this problem for smaller ICCs, i.e. roughly $ICC \le 0.5$. Fortunately, the problem does not persist for NPML estimation. For illustration, we fit NPML with seven (as suggested by Aitkin, Francis \& Hinde (2005), p. 495) and eight mass points, yielding
%35
\begin{Scode}
(Oxboys.np7 <- allvc(height~age,random=~1|boy,data=Oxboys,random.distribution='np',k=7))
(Oxboys.np8 <- allvc(height~age,random=~1|boy,data=Oxboys,random.distribution='np',k=8))
\end{Scode}
Thus, NPML with 8 mass points already leads to a better result than GQ with 20 mass points. The EM trajectories, as shown in Fig. 5, can also be obtained explicitly by calling
%36
\begin{Scode}{fig=FALSE, echo=TRUE}
plot(Oxboys.np8, plot.opt=2)
\end{Scode}
We now extend the 8-point model by allowing the linear trend to vary across boys.
%31
\begin{Scode}
(Oxboys.np8s <- allvc(height~age,random=~age|boy,data=Oxboys,random.distribution='np',k=8))
\end{Scode}
The difference in disparities is
%37
\begin{Scode}
Oxboys.np8$disp-Oxboys.np8s$disp
\end{Scode}
on 7 df, showing clear heterogeneity in the slope.
\begin{minipage}{21cm}
%38
\begin{Scode}{fig=TRUE, echo=FALSE, height=4, width=5}
plot(Oxboys.np8, plot.opt=2)
\end{Scode}
Fig. 5: Convergence of EM for eight mass points, applied to the Oxford boys data.
\end{minipage}
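As an aside, the {\tt lmer} disparity of 940.6 quoted at the beginning of this subsection can be reproduced along the following lines (a hedged sketch: {\bf lme4} is not a dependency of this package, and the code assumes a current version of its syntax):
\begin{Scode}{eval=FALSE}
library(lme4)
Oxboys.lmer <- lmer(height ~ age + (1|boy), data=Oxboys, REML=FALSE)  # ML fit
-2*as.numeric(logLik(Oxboys.lmer))   # disparity -2 log L, about 940.6
\end{Scode}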
\subsection{Spatial random effect models: The Irish suicide data}

The data considered here, available in the package {\bf npmlreg} via
%39
\begin{Scode}
data(irlsuicide)
\end{Scode}
describe the mortality due to suicide and intentional self-harm in the Republic of Ireland during 1989--1998. Suicide rates are modelled using either the average crude rate or the relative risk as the model parameter. The analysis of these data involves a variance component model with regions as the cluster variable, categorical covariates for gender and age, interaction terms, and an offset representing the cluster sizes. The R code can be found in the {\tt Examples} section of the help file for {\tt allvc} (page 8 in the reference manual). While the random effect accounts for within-region correlation, it is worthwhile to consider between-region correlation in this application. Spatial correlation between regions is included in the model by employing an extra (fixed or random) covariate representing the average crude suicide rates of the neighbouring regions (or the average neighbouring standardized mortality ratios, respectively). For details, see Sofroniou, Einbeck and Hinde (2006).

\section{Citation}

The correct citation for the R package {\bf npmlreg} can be queried with
%40
\begin{Scode}
citation(package="npmlreg")
\end{Scode}
\medskip \noindent
The correct citation for this R vignette is
\medskip \noindent
\begin{description}
\item[EINBECK, J. and HINDE, J.] (2014). Nonparametric maximum likelihood estimation for random effect models in R. Vignette to R package {\bf npmlreg}.
\end{description}

\section*{Acknowledgments}

The work on this R package was supported by Science Foundation Ireland Basic Research Grant 04/BR/M0051.

\section{References}
\begin{description}
\item[AITCHISON, J. and AITKEN, C. G. G.] (1976). Multivariate binary discrimination by the kernel method. {\em Biometrika} {\bf 63}, 413--420.
\item[AITKIN, M.] (1996a). A general maximum likelihood analysis of overdispersion in generalized linear models. {\em Statistics and Computing} {\bf 6}, 251--262.
\item[AITKIN, M.] (1996b). Empirical Bayes shrinkage using posterior random effect means from nonparametric maximum likelihood estimation in general random effect models. {\em Statistical Modelling: Proceedings of the 11th IWSM 1996}, 87--94.
\item[AITKIN, M.] (1999). A general maximum likelihood analysis of variance components in generalized linear models. {\em Biometrics} {\bf 55}, 117--128.
\item[AITKIN, M.] (2001). Likelihood and Bayesian analysis of mixtures. {\em Statistical Modelling} {\bf 1}, 287--304.
\item[AITKIN, M. and FRANCIS, B.] (1995). Fitting overdispersed generalized linear models by nonparametric maximum likelihood. {\em GLIM Newsletter} {\bf 25}, 37--45.
\item[AITKIN, M., FRANCIS, B. and HINDE, J.] (2005). {\em Statistical Modelling in GLIM 4} (2nd edition). Oxford University Press, Oxford, UK.
\item[EFRON, B.] (1986). Double exponential families and their use in generalized linear regression. {\em JASA} {\bf 81}, 709--721.
\item[EINBECK, J. and HINDE, J.] (2006). A note on NPML estimation for exponential family regression models with unspecified dispersion parameter. {\em Austrian Journal of Statistics} {\bf 35}, 233--243.
\item[EINBECK, J., HINDE, J. and DARNELL, R.] (2007). A new package for fitting random effect models -- The npmlreg package. {\em R News} {\bf 7}, 26--30.
\item[GOLDSTEIN, H.] (2003). {\em Multilevel Statistical Models} (3rd edition). Arnold, London, UK.
\item[HINDE, J.] (1982). Compound Poisson regression models. {\em Lecture Notes in Statistics} {\bf 14}, 109--121.
\item[LAIRD, N. M.] (1978). Nonparametric maximum likelihood estimation of a mixing distribution. {\em JASA} {\bf 73}, 805--811.
\item[POSTMAN, M., HUCHRA, J. P. and GELLER, M. J.] (1986). Probes of large-scale structure in the Corona Borealis region. {\em Astronomical Journal} {\bf 92}, 1238--1247.
\item[ROSNER, B.] (2000). {\em Fundamentals of Biostatistics.} Thomson Learning, Duxbury, CA, USA.
\item[SOFRONIOU, N., EINBECK, J. and HINDE, J.] (2006). Analyzing Irish suicide rates with mixture models. In {\em Proceedings of the 21st International Workshop on Statistical Modelling, 3--7/07/06, Galway, Ireland}, 474--481.
\end{description}
\newpage

\section{Appendix: R Documentation}

A printed version of the help files is available in the reference manual, which can be downloaded from CRAN at
\begin{center}
{\tt http://cran.r-project.org/src/contrib/Descriptions/npmlreg.html}.
\end{center}
A list of all functions currently available in {\bf npmlreg} is given below:
%39
\begin{Scode}
ls("package:npmlreg")
\end{Scode}
In addition, the data sets \texttt{fabric}, \texttt{hosp}, \texttt{irlsuicide}, \texttt{missouri}, and \texttt{rainfall} are available.
\end{landscape}
\end{document}