% NOTE -- ONLY EDIT THE .Rnw FILE!!! The .tex file is
% likely to be overwritten.
\usepackage{SweaveSlides}
\title[lme4]{Mixed models in R using the lme4 package\\Part 7: Generalized linear mixed models}
\subject{GLMM}
\date[July 3, 2009]{University of Lausanne\\July 3, 2009}
\AtBeginSection[]
{
\begin{frame}
\frametitle{Outline}
\tableofcontents[currentsection]
\end{frame}
}
\begin{document}
\frame{\titlepage}
\begin{frame}
\frametitle{Outline}
\tableofcontents[pausesections,hideallsubsections]
\end{frame}
\SweaveOpts{engine=R,eps=FALSE,pdf=TRUE,width=10,height=6.5,strip.white=all}
\SweaveOpts{prefix=TRUE,prefix.string=figs/GLMM,include=TRUE}
\setkeys{Gin}{width=\textwidth}
<<echo=FALSE,results=hide>>=
options(width = 70, show.signif.stars = FALSE)
data(Contraception, package = "mlmRev")
library(lattice)
library(Matrix)
library(lme4)
lattice.options(default.theme = function() standard.theme())
@
\section[Definition]{Generalized Linear Mixed Models}
\begin{frame}\frametitle{Generalized Linear Mixed Models}
\begin{itemize}
\item When using linear mixed models (LMMs) we assume that the
response being modeled is on a continuous scale.
\item Sometimes we can bend this assumption a bit if the response is
an ordinal response with a moderate to large number of levels.
For example, the Scottish secondary school test results were
integer values on the scale of 1 to 10.
\item However, an LMM is not suitable for modeling a binary
response, an ordinal response with few levels or a response that
represents a count. For these we use generalized linear mixed
models (GLMMs).
\item To describe GLMMs we return to the representation of the
response as an $n$-dimensional, vector-valued, random variable,
$\bc Y$, and the random effects as a $q$-dimensional,
vector-valued, random variable, $\bm{\mathcal{B}}$.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Parts of LMMs carried over to GLMMs}
\begin{itemize}
\item Random variables
\begin{compactitem}
\item $\bc Y$ the response variable
\item $\bm{\mathcal{B}}$ the (possibly correlated) random effects
\item $\bc U$ the orthogonal random effects
\end{compactitem}
\item Parameters
\begin{compactitem}
\item $\bm{\beta}$ - fixed-effects coefficients
\item $\sigma$ - the common scale parameter (not always used)
\item $\bm{\theta}$ - parameters that determine
$\mathrm{Var}(\bc B)=\sigma^2\bm\Lambda\bm\Lambda\trans$
\end{compactitem}
\item Some matrices
\begin{compactitem}
\item $\bm{X}$ the $n\times p$ model matrix for $\bm{\beta}$
\item $\bm{Z}$ the $n\times q$ model matrix for the random effects, $\bc B$
\item $\bm{P}$ fill-reducing $q\times q$ permutation (from $\bm Z$)
\item $\bm\Lambda(\bm\theta)$ relative covariance factor, s.t. $\text{Var}(\bc B)=\sigma^2\bm\Lambda\bm\Lambda\trans$
\item $\bm U(\bm\theta)=\bm Z\bm\Lambda(\bm\theta)$
\end{compactitem}
\end{itemize}
\end{frame}
\begin{frame}\frametitle{The conditional distribution,
$\bc Y|\bc U$}
\begin{itemize}
\item For GLMMs, the marginal distribution,
  $\bm{\mathcal{B}}\sim\mathcal{N}\left(\bm
    0,\bm\Sigma(\bm\theta)\right)$, is the same as in LMMs except
  that the scale factor $\sigma^2$ is omitted, so that
  $\bm\Sigma(\bm\theta)=\bm\Lambda(\bm\theta)\bm\Lambda(\bm\theta)\trans$.  We define
  $\bc U\sim\mathcal{N}(\bm 0,\bm I_q)$ such that
  $\bm{\mathcal{B}}=\bm\Lambda(\bm\theta)\bc U$.
\item For GLMMs we retain some of the properties of the LMM
  conditional distribution,
\begin{displaymath}
(\bc Y|\bc U=\bm u)\sim
\mathcal{N}\left(\bm\mu_{\bc Y|\bc U},\sigma^2\bm
I\right)\text{ where }
\bm\mu_{\bc Y|\bc U}(\bm u)=
\bm X\bm\beta+\bm Z\bm\Lambda\bm u
\end{displaymath}
Specifically
\begin{itemize}
\item The distribution $\bc Y|\bc U=\bm u$ depends
  on $\bm u$ only through the conditional mean,
  $\bm\mu_{\bc Y|\bc U}(\bm u)$.
\item Elements of $\bc Y$ are \Emph{conditionally
independent}. That is, the distribution of
$\bc Y|\bc U=\bm u$ is completely
specified by the univariate, conditional distributions,
$\mathcal{Y}_i|\bc U,i=1,\dots,n$.
\item These univariate, conditional distributions all have the same
form. They differ only in their means.
\end{itemize}
\item GLMMs differ from LMMs in the form of the univariate,
conditional distributions and in how
$\bm\mu_{\bc Y|\bc U}(\bm u)$ depends on
$\bm u$.
\end{itemize}
\end{frame}
\section[Links]{Specific distributions and links}
\begin{frame}
\frametitle{Some choices of univariate conditional distributions}
\begin{itemize}
\item Typical choices of univariate conditional distributions are:
\begin{itemize}
\item The \Emph{Bernoulli} distribution for binary (0/1) data, which has
probability mass function
\begin{displaymath}
p(y|\mu)= \mu^{y}(1-\mu)^{1 - y},\quad 0<\mu< 1,\quad y = 0,1
\end{displaymath}
\item Several independent binary responses can be represented as a
\Emph{binomial} response, but only if all the Bernoulli
distributions have the same mean.
\item The \Emph{Poisson} distribution for count ($0,1,\dots$)
data, which has probability mass function
\begin{displaymath}
p(y|\mu)= e^{-\mu}\frac{\mu^{y}}{y!},\quad 0<\mu,\quad y = 0,1,2,\dots
\end{displaymath}
\end{itemize}
\item All of these distributions are completely specified by the
conditional mean. This is different from the conditional normal
(or Gaussian) distribution, which also requires the common scale
parameter, $\sigma$.
\end{itemize}
\end{frame}
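\begin{frame}[fragile]
  \frametitle{Checking the mass functions in R}
  As a quick numerical check (not part of the model fitting), the mass
  functions on the previous slide agree with the densities built into
  R: \code{dbinom} with \code{size = 1} for the Bernoulli and
  \code{dpois} for the Poisson.
<<>>=
mu <- 0.3; y <- 0:1                # a Bernoulli mean and its two outcomes
all.equal(dbinom(y, 1, mu), mu^y * (1 - mu)^(1 - y))
mu <- 2.5; y <- 0:5                # a Poisson mean and a few counts
all.equal(dpois(y, mu), exp(-mu) * mu^y / factorial(y))
@
\end{frame}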
\begin{frame}
\frametitle{The link function, g}
\begin{itemize}
\item When the univariate conditional distributions have constraints
on $\mu$, such as $0<\mu<1$ (Bernoulli) or $0<\mu$ (Poisson), we
cannot define the conditional mean,
$\bm\mu_{\bc Y|\bc U}$, to be equal to the
linear predictor, $\bm X\bm\beta+\bm U(\bm\theta)\bm u$,
which is unbounded.
\item We choose an invertible, univariate \Emph{link function}, $g$,
such that $\eta=g(\mu)$ is unconstrained. The vector-valued link
function, $\bm g$, is defined by applying $g$ component-wise.
\begin{displaymath}
\bm\eta=\bm g(\bm\mu)\quad\text{where}\quad
\eta_i=g(\mu_i),\quad i=1,\dots,n
\end{displaymath}
\item We require that $g$ be invertible so that $\mu=g^{-1}(\eta)$
is defined for $-\infty<\eta<\infty$ and is in the appropriate
range ($0<\mu<1$ for the Bernoulli or $0<\mu$ for the
Poisson). The vector-valued inverse link, $\bm g^{-1}$, is defined
component-wise.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{``Canonical'' link functions}
\begin{itemize}
\item There are many choices of invertible scalar link functions, $g$,
that we could use for a given set of constraints.
\item For the Bernoulli and Poisson distributions, however, one link
function arises naturally from the definition of the probability
mass function. (The same is true for a few other, related but
less frequently used, distributions, such as the gamma distribution.)
\item To derive the canonical link, we consider the logarithm of the
probability mass function (or, for continuous distributions, the
probability density function).
\item For distributions in this ``exponential'' family, the
logarithm of the probability mass or density can be written as a
sum of terms, some of which depend on the response, $y$, only and
some of which depend on the mean, $\mu$, only. However, only one
term depends on \textbf{both} $y$ and $\mu$, and this term has the
form $y\cdot g(\mu)$, where $g$ is the canonical link.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{The canonical link for the Bernoulli distribution}
\begin{itemize}
\item The logarithm of the probability mass function is
\begin{displaymath}
\log(p(y|\mu))=\log(1-\mu)+y\log\left(\frac{\mu}{1-\mu}\right),
\;0<\mu<1,\;y=0,1 .
\end{displaymath}
\item Thus, the canonical link function is the
\Emph{logit} link
\begin{displaymath}
\eta=g(\mu)=\log\left(\frac{\mu}{1-\mu}\right).
\end{displaymath}
\item Because $\mu=P[\mathcal{Y} = 1]$, the quantity
  $\mu/(1-\mu)$ is the odds of a success (in the range $(0, \infty)$)
  and $g$ is the logarithm of the odds, sometimes called the ``log
  odds''.
\item The inverse link is
\begin{displaymath}
\mu=g^{-1}(\eta)=\frac{e^\eta}{1+e^\eta}=\frac{1}{1+e^{-\eta}}
\end{displaymath}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Plot of canonical link for the Bernoulli distribution}
<<fig=TRUE,echo=FALSE>>=
logit <- function(mu) {
mu <- pmax(.Machine$double.eps, pmin(1-.Machine$double.eps, mu))
log(mu/(1-mu))
}
mu <- seq(0.001, 0.999, len = 999)
print(xyplot(logit(mu) ~ mu, type = c("g", "l"),
xlab = expression(mu),
ylab = expression(eta == log(frac(mu, 1-mu)))))
@
\end{frame}
\begin{frame}
\frametitle{Plot of inverse canonical link for the Bernoulli distribution}
<<fig=TRUE,echo=FALSE>>=
linkinv <- function(eta) 1/(1+exp(-eta))
eta <- seq(-7,7,len = 701)
print(xyplot(linkinv(eta) ~ eta, type = c("g","l"),
xlab = expression(eta),
ylab = expression(mu == frac(1,1+exp(-eta)))))
@
\end{frame}
\begin{frame}
\frametitle{The canonical link for the Poisson distribution}
\begin{itemize}
\item The logarithm of the probability mass is
\begin{displaymath}
\log(p(y|\mu))=-\log(y!)-\mu+y\log(\mu),\quad 0<\mu,\quad y=0,1,2,\dots
\end{displaymath}
\item Thus, the canonical link function for the Poisson is the
\Emph{log} link
\begin{displaymath}
\eta=g(\mu)=\log(\mu)
\end{displaymath}
\item The inverse link is
\begin{displaymath}
\mu=g^{-1}(\eta)=e^{\eta}
\end{displaymath}
\end{itemize}
\end{frame}
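\begin{frame}[fragile]
  \frametitle{The canonical links as R family objects}
  The link and inverse link are available through the family objects
  in base R; a small check of the Poisson's canonical log link
  (nothing here is specific to mixed models):
<<>>=
pois <- poisson()                  # family object with the canonical log link
mu <- c(0.5, 1, 5, 20)
(eta <- pois$linkfun(mu))          # eta = log(mu)
all.equal(pois$linkinv(eta), mu)   # the inverse link recovers mu
@
\end{frame}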
\begin{frame}
\frametitle{The canonical link related to the variance}
\begin{itemize}
\item For the canonical link function, the derivative of the inverse
  link, $d\mu/d\eta$, equals the conditional variance of the response.
\item For the Bernoulli, the canonical link is the logit and the
inverse link is $\mu=g^{-1}(\eta)=1/(1+e^{-\eta})$. Then
\begin{displaymath}
\frac{d\mu}{d\eta}=\frac{e^{-\eta}}{(1+e^{-\eta})^2}=
\frac{1}{1+e^{-\eta}}\frac{e^{-\eta}}{1+e^{-\eta}}=\mu(1-\mu)=
\mathrm{Var}(\mathcal{Y})
\end{displaymath}
\item For the Poisson, the canonical link is the log and the inverse
link is $\mu=g^{-1}(\eta)=e^\eta$. Then
\begin{displaymath}
\frac{d\mu}{d\eta}=e^\eta=\mu=\mathrm{Var}(\mathcal{Y})
\end{displaymath}
\end{itemize}
\end{frame}
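\begin{frame}[fragile]
  \frametitle{Checking $d\mu/d\eta$ against the variance function}
  The family objects also provide $d\mu/d\eta$ (as \code{mu.eta}) and
  the variance function, so the relationship on the previous slide can
  be checked numerically:
<<>>=
bin <- binomial(); pois <- poisson()    # canonical logit and log links
eta <- c(-2, 0, 1.5)
all.equal(bin$mu.eta(eta),  bin$variance(bin$linkinv(eta)))
all.equal(pois$mu.eta(eta), pois$variance(pois$linkinv(eta)))
@
\end{frame}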
\begin{frame}
\frametitle{The unscaled conditional density of
$\bc U|\bc Y=\bm y$}
\begin{itemize}
\item As in LMMs we evaluate the likelihood of the parameters, given
the data, as
\begin{displaymath}
L(\bm\theta,\bm\beta|\bm y)=
\int_{\mathbb{R}^q}[\bc Y|\bc U](\bm y|\bm u)
\,[\bc U](\bm u)\, d\bm u .
\end{displaymath}
\item The product $[\bc Y|\bc U](\bm y|\bm u)
[\bc U](\bm u)$ is the unscaled (or
\Emph{unnormalized}) density of the conditional distribution
$\bc U|\bc Y$.
\item The density $[\bc U](\bm u)$ is a spherical
Gaussian density $\frac{1}{(2\pi)^{q/2}} e^{-\|\bm u\|^2/2}$.
\item The expression $[\bc Y|\bc U](\bm y|\bm
u)$ is the value of a probability mass function or a probability
density function, depending on whether
$\mathcal{Y}_i|\bc U$ is discrete or continuous.
\item The linear predictor is
$\bm g(\bm\mu_{\bc Y|\bc U})=\bm\eta=\bm
X\bm\beta+\bm U(\bm\theta)\bm u$.
Alternatively, we can write the conditional mean of
$\bc Y$, given $\bc U$, as
\begin{displaymath}
\bm\mu_{\bc Y|\bc U}(\bm u)= \bm g^{-1}\left(\bm X\bm\beta+ \bm U(\bm\theta)\bm u\right)
\end{displaymath}
\end{itemize}
\end{frame}
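\begin{frame}[fragile]
  \frametitle{Evaluating the likelihood integral: a scalar sketch}
  With a single scalar random effect the integral defining
  $L(\bm\theta,\bm\beta|\bm y)$ can be evaluated by numerical
  quadrature.  A minimal sketch with simulated Bernoulli data (the
  objects below are invented purely for illustration):
<<>>=
set.seed(2)
x <- runif(30); beta <- c(-0.5, 1.5); theta <- 1
y <- rbinom(30, 1, plogis(beta[1] + beta[2] * x + theta * rnorm(1)))
h <- function(u)      # unscaled conditional density [Y|U](y|u) [U](u)
    sapply(u, function(ui)
           prod(dbinom(y, 1, plogis(beta[1] + beta[2] * x + theta * ui))) *
           dnorm(ui))
integrate(h, -Inf, Inf)$value      # L(theta, beta | y) for these data
@
\end{frame}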
\begin{frame}
\frametitle{The conditional mode of $\bc U|\bc Y=\bm y$}
\begin{itemize}
\item In general the likelihood, $L(\bm\theta,\bm\beta|\bm y)$, does
not have a closed form. To approximate this value, we first determine
the \Emph{conditional mode}
\begin{displaymath}
\tilde{\bm u}(\bm y|\bm\theta,\bm\beta)=\arg\max_{\bm u}
[\bc Y|\bc U](\bm y|\bm u)\,
[\bc U](\bm u)
\end{displaymath}
using a quadratic approximation to the logarithm of the
unscaled conditional density.
\item This optimization problem is (relatively) easy because the
quadratic approximation to the logarithm of the unscaled
conditional density can be written as a penalized, weighted
residual sum of squares,
\begin{displaymath}
\tilde{\bm u}(\bm y|\bm\theta,\bm\beta)=\arg\min_{\bm u}\left\|
\begin{bmatrix}
\bm W^{1/2}(\bm\mu)\left(\bm y -
\bm\mu_{\bc Y|\bc U}(\bm u)\right)\\
-\bm u
\end{bmatrix}\right\|^2
\end{displaymath}
where $\bm W(\bm\mu)$ is the diagonal weights matrix. The weights
are the inverses of the variances of the $\mathcal{Y}_i$.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{The PIRLS algorithm}
\begin{itemize}
\item Parameter estimates for generalized linear models (without
  random effects) are usually determined by iteratively reweighted
  least squares (IRLS), an incredibly efficient algorithm.  PIRLS is
  the penalized version.  It is iteratively reweighted in the
  sense that the parameters are estimated for a fixed weights
  matrix, $\bm W$, then the weights are updated from the current
  estimates and the process is repeated.
\item For fixed weights we solve
\begin{displaymath}
\min_{\bm u}\left\|
\begin{bmatrix}
\bm W^{1/2}\left(\bm y -
\bm\mu_{\bc Y|\bc U}(\bm u)\right)\\
-\bm u
\end{bmatrix}\right\|^2
\end{displaymath}
as a nonlinear least squares problem with update, $\bm\delta_{\bm
u}$, given by
\begin{displaymath}
\bm P\left(\bm U\trans\bm M\bm W\bm M\bm U+\bm I\right)\bm P\trans
\bm\delta_{\bm u}=\bm U\trans\bm M\bm W(\bm y-\bm\mu) - \bm u
\end{displaymath}
where $\bm M=d\bm\mu/d\bm\eta$ is the (diagonal) Jacobian matrix.
Recall that for the canonical link, $\bm M=
\text{Var}(\bc Y|\bc U)=\bm W^{-1}$.
\end{itemize}
\end{frame}
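\begin{frame}[fragile]
  \frametitle{A toy PIRLS iteration}
  A minimal, self-contained sketch of PIRLS for a simulated
  random-intercept logistic model with one grouping factor, holding
  $\bm\beta$ and $\bm\theta$ fixed at their simulation values (all
  object names below are invented for illustration; no permutation
  $\bm P$ is needed in this small, dense case):
<<>>=
set.seed(1)
ngrp <- 10; nper <- 20
grp <- gl(ngrp, nper)                  # grouping factor
x2 <- runif(ngrp * nper)
beta2 <- c(-1, 2); theta2 <- 0.8
b2 <- rnorm(ngrp, sd = theta2)         # "true" random effects
y2 <- rbinom(ngrp * nper, 1, plogis(beta2[1] + beta2[2] * x2 + b2[grp]))
X2 <- cbind(1, x2)
U2 <- model.matrix(~ 0 + grp) * theta2 # U(theta) = Z Lambda(theta)
u <- numeric(ngrp)                     # start at the prior mean
for (i in 1:20) {                      # PIRLS iterations
    eta <- drop(X2 %*% beta2 + U2 %*% u)
    mu <- plogis(eta)
    M <- mu * (1 - mu)   # dmu/deta, which is Var(Y|U) for the logit link
    W <- 1 / M           # weights are the inverse conditional variances
    ## solve (U'MWMU + I) delta = U'MW(y - mu) - u
    A <- crossprod(sqrt(W) * M * U2) + diag(ngrp)
    u <- drop(u + solve(A, crossprod(U2, M * W * (y2 - mu)) - u))
}
head(cbind(mode = theta2 * u, truth = b2), 4) # conditional modes of B vs truth
@
\end{frame}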
\begin{frame}
\frametitle{The Laplace approximation to the deviance}
\begin{itemize}
\item At convergence, the sparse Cholesky factor, $\bm L$, used to
evaluate the update is
\begin{displaymath}
\bm L\bm L\trans =
\bm P\left(\bm U\trans\bm M\bm W\bm M\bm U+\bm I\right)\bm P\trans
\end{displaymath}
or
\begin{displaymath}
\bm L\bm L\trans =
\bm P\left(\bm U\trans\bm M\bm U+\bm I\right)\bm P\trans
\end{displaymath}
if we are using the canonical link.
\item The integrand of the likelihood is approximately a constant
times the density of the $\mathcal{N}(\tilde{\bm u},\bm L\bm
L\trans)$ distribution.
\item On the deviance scale (negative twice the log-likelihood) this
corresponds to
\begin{displaymath}
d(\bm\beta,\bm\theta|\bm y)=d_g(\bm y,\bm\mu(\tilde{\bm u}))+
\|\tilde{\bm u}\|^2+\log(|\bm L|^2)
\end{displaymath}
where $d_g(\bm y,\bm\mu(\tilde{\bm u}))$ is the GLM deviance for
$\bm y$ and $\bm\mu$.
\end{itemize}
\end{frame}
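\begin{frame}[fragile]
  \frametitle{Laplace approximation: the scalar sketch revisited}
  Continuing the scalar sketch (the toy data are re-simulated so that
  this chunk stands alone), we locate the conditional mode on the
  deviance scale and compare the Laplace approximation of the integral
  with numerical quadrature.  This is illustrative only, not the
  \code{lme4} code path.
<<>>=
set.seed(2)
x <- runif(30); beta <- c(-0.5, 1.5); theta <- 1
y <- rbinom(30, 1, plogis(beta[1] + beta[2] * x + theta * rnorm(1)))
h <- function(u)      # unscaled conditional density, as before
    sapply(u, function(ui)
           prod(dbinom(y, 1, plogis(beta[1] + beta[2] * x + theta * ui))) *
           dnorm(ui))
dev <- function(u) -2 * log(h(u))            # on the deviance scale
utilde <- optimize(dev, c(-10, 10))$minimum  # conditional mode
hess <- drop(optimHess(utilde, dev))         # curvature of dev at the mode
lap <- h(utilde) * sqrt(2 * pi / (hess / 2)) # Laplace approximation
c(Laplace = lap, quadrature = integrate(h, -Inf, Inf)$value)
@
\end{frame}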
\begin{frame}
\frametitle{Modifications to the algorithm}
\begin{itemize}
\item Notice that this deviance depends on the fixed-effects
parameters, $\bm\beta$, as well as the variance-component
parameters, $\bm\theta$. This is because $\log(|\bm L|^2)$
depends on $\bm\mu_{\bc Y|\bc U}$ and,
hence, on $\bm\beta$. For LMMs $\log(|\bm L|^2)$ depends only on
$\bm\theta$.
\item It is likely that modifying the PIRLS algorithm to optimize
simultaneously on $\bm u$ and $\bm\beta$ would result in a value
that is very close to the deviance profiled over $\bm\beta$.
\item Another approach, which is being implemented as a Google
Summer of Code project, is adaptive Gauss-Hermite quadrature
(AGQ). This has a similar structure to the Laplace approximation
but is based on more evaluations of the unscaled conditional
density near the conditional modes.  It is only appropriate for
models in which the random effects are associated with only one
grouping factor.
\end{itemize}
\end{frame}
\section[Example]{Data description and initial exploration}
\begin{frame}[fragile]\frametitle{Contraception data}
\begin{itemize}
\item One of the data sets in the \code{"mlmRev"} package, derived
from data files available on the multilevel modelling web site, is
from a fertility survey of women in Bangladesh.
\item One of the responses is whether or not the woman currently
uses artificial contraception (i.e. a binary response)
\item Covariates included the woman's age (on a centered scale),
the number of live children she had, whether she lived in an urban
or rural setting, and the district in which she lived.
\item Instead of plotting such data as points, we use the 0/1
response to generate scatterplot smoother curves versus age for
the different groups.
\end{itemize}
\end{frame}
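\begin{frame}[fragile]
  \frametitle{Structure of the Contraception data}
  A quick look at the structure of the data frame, as provided by the
  \code{mlmRev} package, before plotting:
<<>>=
str(Contraception)
@
\end{frame}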
\begin{frame}[fragile]
\frametitle{Contraception use versus age by urban and livch}
\begin{center}
<<fig=TRUE,echo=FALSE>>=
print(xyplot(ifelse(use == "Y", 1, 0) ~ age|urban, Contraception,
groups = livch, type = c("g", "smooth"),
auto.key = list(space = "top", points = FALSE,
lines = TRUE, columns = 4),
ylab = "Proportion", xlab = "Centered age"))
@
\end{center}
\end{frame}
\begin{frame}\frametitle{Comments on the data plot}
\begin{itemize}
\item These observational data are unbalanced (some districts have
only $2$ observations, some have nearly $120$). They are not
longitudinal (no ``time'' variable).
\item Binary responses have low per-observation information content
  (at most one bit per observation).  Districts with few
  observations will not contribute strongly to estimates of random effects.
\item Within-district plots will be too imprecise, so we only examine
  the global effects in plots.
\item The comparisons on the multilevel modelling site are for
fits of a model that is linear in \code{age}, which is clearly
inappropriate.
\item The form of the curves suggests at least a quadratic in
\code{age}.
\item The urban versus rural differences may be additive.
\item It appears that the \code{livch} factor could be dichotomized
into ``0'' versus ``1 or more''.
\end{itemize}
\end{frame}
\subsection{Fitting a preliminary model}
\begin{frame}[fragile]
\frametitle{Preliminary model using Laplacian approximation}
\begin{center}
<<>>=
print(cm1 <- glmer(use ~ age + I(age^2) + urban + livch + (1|district),
                   Contraception, binomial), corr = FALSE)
@
\end{center}
\end{frame}
\begin{frame}\frametitle{Comments on the model fit}
\begin{itemize}
\item This model was fit using the Laplacian approximation to the
deviance.
\item There is a highly significant quadratic term in \code{age}.
\item The linear term in \code{age} is not significant but we retain
it because the \code{age} scale has been centered at an arbitrary
(and unknown) value.
\item The \code{urban} factor is highly significant (as indicated by
the plot).
\item Levels of \code{livch} greater than 0 are significantly
different from 0 but may not be different from each other.
\end{itemize}
\end{frame}
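\begin{frame}[fragile]
  \frametitle{Checking the Laplace fit with adaptive quadrature}
  In versions of \code{lme4} where adaptive Gauss-Hermite quadrature
  is available, it is requested through the \code{nAGQ} argument to
  \code{glmer}.  A sketch, not evaluated here (the choice of 9
  quadrature points is arbitrary):
<<eval=FALSE>>=
cm1agq <- update(cm1, nAGQ = 9)  # adaptive Gauss-Hermite with 9 points
c(Laplace = deviance(cm1), AGQ = deviance(cm1agq))
@
\end{frame}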
\section{Model building}
<<echo=FALSE>>=
Contraception$ch <- factor(Contraception$livch != 0, labels = c("N","Y"))
@
\begin{frame}[fragile]
\frametitle{Reduced model with dichotomized livch}
<<>>=
print(cm2 <- glmer(use ~ age + I(age^2) + urban + ch + (1|district),
                   Contraception, binomial), corr = FALSE)
@
\end{frame}
\begin{frame}[fragile]\frametitle{Comparing the model fits}
\begin{itemize}
\item A likelihood ratio test can be used to compare these nested models.
\end{itemize}
<<>>=
anova(cm2, cm1)
@
\begin{itemize}
\item The large p-value indicates that we would not reject \code{cm2}
  in favor of \code{cm1}, hence we prefer the more parsimonious \code{cm2}.
\item The plot of the scatterplot smoothers grouped by whether the
  woman has live children (next slide) indicates that there may be a
  difference in the age pattern between these two groups.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Contraception use versus age by urban and ch}
\begin{center}
<<fig=TRUE,echo=FALSE>>=
print(xyplot(ifelse(use == "Y", 1, 0) ~ age|urban, Contraception,
groups = ch, type = c("g", "smooth"),
auto.key = list(space = "top", points = FALSE,
lines = TRUE, columns = 2),
ylab = "Proportion", xlab = "Centered age"))
@
\end{center}
\end{frame}
\begin{frame}[fragile]
\frametitle{Allowing age pattern to vary with ch}
<<>>=
print(cm3 <- glmer(use ~ age*ch + I(age^2) + urban + (1|district),
Contraception, binomial), corr = FALSE)
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Prediction intervals on the random effects}
\begin{center}
<<fig=TRUE,echo=FALSE>>=
print(qqmath(ranef(cm3, post = TRUE))[[1]])
@
\end{center}
\end{frame}
\begin{frame}[fragile]\frametitle{Extending the random effects}
\begin{itemize}
\item We may want to consider allowing a random effect for
urban/rural by district.  This is complicated by the fact that many
districts have only rural women in the study.
\end{itemize}
<<echo=FALSE>>=
cat(head(capture.output(xtabs(~urban+district, Contraception)),7),sep='\n')
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Including a random effect for urban by district}
<<>>=
(cm4 <- glmer(use ~ age*ch + I(age^2) + urban + (urban|district),
Contraception, binomial))
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Significance of the additional random effect}
<<>>=
anova(cm4,cm3)
@
\begin{itemize}
\item The additional random effect is highly significant in this test.
\item Most of the prediction intervals still overlap zero.
\item A scatterplot of the random effects shows several random effects
vectors falling along a straight line. These are the districts with
all rural women or all urban women.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Prediction intervals for the bivariate random effects}
\begin{center}
<<fig=TRUE,echo=FALSE>>=
print(qqmath(ranef(cm4, post = TRUE))$district)
@
\end{center}
\end{frame}
\begin{frame}[fragile]
\frametitle{Scatter plot of the BLUPs}
<<fig=TRUE,echo=FALSE>>=
print(plot(ranef(cm4), type = c("g","p"), aspect = 1)$district)
@
\end{frame}
\section[Conclusions]{Conclusions from the example}
\begin{frame}\frametitle{Conclusions from the example}
\begin{itemize}
\item Again, carefully plotting the data is enormously helpful in
formulating the model.
\item Observational data tend to be unbalanced and have many more
covariates than data from a designed experiment. Formulating a
model is typically more difficult than in a designed experiment.
\item A generalized linear mixed model is fit by specifying a value,
  typically \code{binomial} or \code{poisson}, for the
  \code{family} argument in a call to \code{glmer}.
\item MCMC sampling is not provided for GLMMs at present but will be
added.
\item We use likelihood-ratio tests and z-tests in the model
building.
\end{itemize}
\end{frame}